- Project structure

```text
doc_compare/
    __init__.py
    config.py
    models.py
    extract.py
    normalize.py
    store.py
    compare.py
    cli.py
```
You can of course collapse this into fewer files if you prefer.
- Dependencies
```bash
pip install pymupdf pdfplumber python-docx sentence-transformers rapidfuzz
```
- Config and simple models
```python
# doc_compare/config.py

EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

COSINE_THRESHOLD_UNCHANGED = 0.93
JACCARD_THRESHOLD_MODIFIED = 0.85
LEVENSHTEIN_THRESHOLD_MODIFIED = 0.90
```
```python
# doc_compare/models.py
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class PageData:
    page_number: int
    raw_text: str
    normalized_text: str
    embedding: Optional[list] = None


@dataclass
class DocumentVersion:
    document_id: str
    version_id: str
    pages: List[PageData]
    metadata: Dict
```
- Extraction (PDF only, DOCX→PDF assumed upstream)
```python
# doc_compare/extract.py
import fitz  # PyMuPDF
from typing import Dict, List, Tuple

from .models import PageData


def extract_pdf_pages(path: str) -> Tuple[List[PageData], Dict]:
    doc = fitz.open(path)
    pages = []
    metadata = doc.metadata or {}
    for i, page in enumerate(doc):
        text = page.get_text("text")
        pages.append(
            PageData(
                page_number=i + 1,
                raw_text=text,
                normalized_text="",  # filled later by normalize_pages
            )
        )
    doc.close()
    return pages, metadata
```
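The DOCX→PDF conversion itself is out of scope for this skeleton. As one possible upstream step (a sketch only, assuming LibreOffice is installed and `soffice` is on the PATH; the helper name is just illustrative), you could shell out to its headless converter before calling `extract_pdf_pages`:

```python
# Hypothetical upstream step: convert DOCX to PDF with LibreOffice
# (assumes `soffice` is installed and on PATH; not part of the package above).
import subprocess
from pathlib import Path


def docx_to_pdf(docx_path: str, out_dir: str) -> str:
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        ["soffice", "--headless", "--convert-to", "pdf", "--outdir", str(out), docx_path],
        check=True,
    )
    # LibreOffice writes <stem>.pdf into out_dir
    return str(out / (Path(docx_path).stem + ".pdf"))
```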
- Normalization
```python
# doc_compare/normalize.py
import re

from .models import PageData

HEADER_FOOTER_REGEXES = [
    r"Page \d+ of \d+",
    r"^\s*\d+\s*$",  # bare page numbers
]


def normalize_text(text: str) -> str:
    # basic cleanup
    t = text.replace("\r", "\n")
    t = re.sub(r"\n{2,}", "\n", t)
    t = re.sub(r"[ \t]+", " ", t)
    # remove headers/footers
    lines = []
    for line in t.split("\n"):
        if any(re.search(pat, line) for pat in HEADER_FOOTER_REGEXES):
            continue
        lines.append(line.strip())
    t = "\n".join(l for l in lines if l)
    return t


def normalize_pages(pages: list[PageData]) -> list[PageData]:
    for p in pages:
        p.normalized_text = normalize_text(p.raw_text)
    return pages
```
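As a quick sanity check of the header/footer patterns, here is an illustrative call with made-up input:

```python
# Illustrative check of normalize_text (the sample text is made up)
from doc_compare.normalize import normalize_text

sample = "Introduction\r\n\r\nPage 3 of 12\r\nSome   paragraph text.\r\n 3 \r\n"
print(normalize_text(sample))
# The "Page 3 of 12" line and the bare "3" line are dropped and runs of
# spaces are collapsed, leaving:
# Introduction
# Some paragraph text.
```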
- Storage layout (filesystem-based)
```python
# doc_compare/store.py
import json
from pathlib import Path
from typing import List

from .models import DocumentVersion, PageData


def save_document_version(base_dir: str, doc: DocumentVersion) -> None:
    root = Path(base_dir) / doc.document_id / doc.version_id
    root.mkdir(parents=True, exist_ok=True)
    meta = {
        "document_id": doc.document_id,
        "version_id": doc.version_id,
        "metadata": doc.metadata,
    }
    (root / "meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
    for p in doc.pages:
        page_path = root / f"page_{p.page_number:04d}.json"
        page_data = {
            "page_number": p.page_number,
            "raw_text": p.raw_text,
            "normalized_text": p.normalized_text,
            "embedding": p.embedding,
        }
        page_path.write_text(json.dumps(page_data, ensure_ascii=False), encoding="utf-8")


def load_document_version(base_dir: str, document_id: str, version_id: str) -> DocumentVersion:
    root = Path(base_dir) / document_id / version_id
    meta = json.loads((root / "meta.json").read_text(encoding="utf-8"))
    pages: List[PageData] = []
    for page_file in sorted(root.glob("page_*.json")):
        d = json.loads(page_file.read_text(encoding="utf-8"))
        pages.append(
            PageData(
                page_number=d["page_number"],
                raw_text=d["raw_text"],
                normalized_text=d["normalized_text"],
                embedding=d.get("embedding"),
            )
        )
    return DocumentVersion(
        document_id=document_id,
        version_id=version_id,
        pages=pages,
        metadata=meta.get("metadata", {}),
    )
```
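With this layout, a document with two versions ends up on disk roughly like this (the document and version ids are illustrative):

```text
<base_dir>/
  contract-123/          # document_id
    v_old/               # version_id
      meta.json
      page_0001.json
      page_0002.json
    v_new/
      meta.json
      page_0001.json
      ...
```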
- Embeddings + similarity helpers
```python
# doc_compare/compare.py
from typing import Dict, List, Tuple

import numpy as np
from sentence_transformers import SentenceTransformer
from rapidfuzz.distance import Jaccard, Levenshtein

from .models import DocumentVersion, PageData
from .config import (
    EMBEDDING_MODEL_NAME,
    COSINE_THRESHOLD_UNCHANGED,
    JACCARD_THRESHOLD_MODIFIED,
    LEVENSHTEIN_THRESHOLD_MODIFIED,
)

_model = None


def get_model():
    # lazy-load the embedding model once per process
    global _model
    if _model is None:
        _model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    return _model


def embed_pages(pages: List[PageData]) -> List[PageData]:
    model = get_model()
    texts = [p.normalized_text or p.raw_text for p in pages]
    embs = model.encode(texts, convert_to_numpy=True)
    for p, e in zip(pages, embs):
        p.embedding = e.tolist()
    return pages


def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    denom = (np.linalg.norm(a) * np.linalg.norm(b)) or 1e-9
    return float(np.dot(a, b) / denom)


def jaccard_sim(a: str, b: str) -> float:
    return 1.0 - Jaccard.normalized_distance(a, b)


def levenshtein_ratio(a: str, b: str) -> float:
    return 1.0 - Levenshtein.normalized_distance(a, b)
```
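Both lexical helpers return a similarity in [0, 1], where 1.0 means identical strings. A quick toy example:

```python
# Toy illustration of the lexical similarity helpers
from doc_compare.compare import jaccard_sim, levenshtein_ratio

a = "The quick brown fox jumps over the lazy dog."
b = "The quick brown fox jumped over the lazy dog."
print(jaccard_sim(a, b))        # character-based similarity, near 1.0 here
print(levenshtein_ratio(a, b))  # edit-distance ratio, near 1.0 (one small edit)
```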
- Page matching and diff decision
```python
# doc_compare/compare.py (continued)

def match_pages_by_embedding(
    old_pages: List[PageData], new_pages: List[PageData]
) -> List[Tuple[PageData, PageData, float]]:
    old_embs = np.array([p.embedding for p in old_pages])
    new_embs = np.array([p.embedding for p in new_pages])
    matches = []
    used_old = set()
    for new_idx, new_p in enumerate(new_pages):
        # cosine similarity of this new page against every old page
        sims = old_embs @ new_embs[new_idx] / (
            np.linalg.norm(old_embs, axis=1) * np.linalg.norm(new_embs[new_idx]) + 1e-9
        )
        best_old_idx = int(np.argmax(sims))
        if best_old_idx in used_old:
            continue  # greedy 1:1 matching; later new pages lose ties
        used_old.add(best_old_idx)
        matches.append((old_pages[best_old_idx], new_p, float(sims[best_old_idx])))
    return matches


def is_modified(old: PageData, new: PageData, cos_sim: float) -> Dict:
    j = jaccard_sim(old.normalized_text, new.normalized_text)
    l = levenshtein_ratio(old.normalized_text, new.normalized_text)
    signals = {
        "cosine_similarity": cos_sim,
        "jaccard_similarity": j,
        "levenshtein_ratio": l,
    }
    below_cos = cos_sim < COSINE_THRESHOLD_UNCHANGED
    below_j = j < JACCARD_THRESHOLD_MODIFIED
    below_l = l < LEVENSHTEIN_THRESHOLD_MODIFIED
    # flag as modified when at least 2 of the 3 signals fall below their thresholds
    modified = sum([below_cos, below_j, below_l]) >= 2
    return {"modified": modified, **signals}
```
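The decision rule is a simple 2-of-3 vote against the thresholds in config.py. An illustrative call with made-up page texts and an assumed cosine score:

```python
# Illustration of the 2-of-3 decision rule (page texts and cosine value are made up)
from doc_compare.models import PageData
from doc_compare.compare import is_modified

old_p = PageData(page_number=1, raw_text="", normalized_text="Payment is due within 30 days.")
new_p = PageData(page_number=1, raw_text="", normalized_text="Payment is due within 45 days of invoice.")

# the cosine score would normally come from match_pages_by_embedding;
# 0.90 here is just an assumed value for the example
result = is_modified(old_p, new_p, 0.90)
print(result["modified"], result["jaccard_similarity"], result["levenshtein_ratio"])
```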
- High-level document comparison
```python
# doc_compare/compare.py (continued)

def compare_documents(old: DocumentVersion, new: DocumentVersion) -> Dict:
    # ensure embeddings
    if old.pages and old.pages[0].embedding is None:
        old.pages = embed_pages(old.pages)
    if new.pages and new.pages[0].embedding is None:
        new.pages = embed_pages(new.pages)
    matches = match_pages_by_embedding(old.pages, new.pages)
    pages_modified = []
    page_summaries = {}
    page_scores = []
    for old_p, new_p, cos in matches:
        res = is_modified(old_p, new_p, cos)
        page_scores.append(res["cosine_similarity"])
        if res["modified"]:
            pages_modified.append(new_p.page_number)
            # very naive summary; you'd replace this with an LLM or rule-based summary
            page_summaries[str(new_p.page_number)] = "Content updated on this page."
    overall_similarity = float(np.mean(page_scores)) if page_scores else 0.0
    return {
        "document_id": new.document_id,
        "version_new": new.version_id,
        "version_old": old.version_id,
        "overall_similarity": overall_similarity,
        "pages_modified": sorted(pages_modified),
        "page_summaries": page_summaries,
    }
```
- Simple CLI entry point
```python
# doc_compare/cli.py
import argparse
import json
import uuid

from .extract import extract_pdf_pages
from .normalize import normalize_pages
from .store import save_document_version, load_document_version
from .models import DocumentVersion
from .compare import compare_documents


def build_version(base_dir: str, document_id: str, version_id: str, pdf_path: str):
    pages, meta = extract_pdf_pages(pdf_path)
    pages = normalize_pages(pages)
    doc = DocumentVersion(
        document_id=document_id,
        version_id=version_id,
        pages=pages,
        metadata=meta,
    )
    save_document_version(base_dir, doc)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-dir", required=True)
    parser.add_argument("--old-version", help="path to an old PDF to build a temporary version from")
    parser.add_argument("--new-pdf", required=True)
    parser.add_argument("--document-id", default=str(uuid.uuid4()))
    parser.add_argument("--old-version-id", help="existing version id")
    parser.add_argument("--new-version-id", default="v_new")
    args = parser.parse_args()

    # Build new version
    build_version(args.base_dir, args.document_id, args.new_version_id, args.new_pdf)
    new_doc = load_document_version(args.base_dir, args.document_id, args.new_version_id)

    if args.old_version_id:
        old_doc = load_document_version(args.base_dir, args.document_id, args.old_version_id)
    elif args.old_version:
        # treat old_version as a PDF path and build a temp version
        temp_version_id = "v_old"
        build_version(args.base_dir, args.document_id, temp_version_id, args.old_version)
        old_doc = load_document_version(args.base_dir, args.document_id, temp_version_id)
    else:
        print("No old version provided; nothing to compare.")
        return

    result = compare_documents(old_doc, new_doc)
    print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()
```
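A typical invocation (paths and the document id are illustrative) builds both versions from PDFs and prints the JSON report:

```bash
# Compare two PDF versions of the same document
python -m doc_compare.cli \
  --base-dir ./versions \
  --document-id contract-123 \
  --old-version ./contract_v1.pdf \
  --new-pdf ./contract_v2.pdf
```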
This gives you a working skeleton:
- Drop in PDFs (or DOCX→PDF upstream).
- Build versions.
- Compare any two versions.
- Get JSON with similarity, modified pages, and basic summaries.
If you tell me your preferred stack (FastAPI, Celery, orchestration layer, storage backend), I can adapt this into a service-style architecture next.
Regards, Antony.