# ✅ app.py (Final Hugging Face Version for SmartManuals-AI)
# ✅ No metadata filtering; pure semantic search over all chunks
# ✅ Auto-index from Manuals/ on startup, with rerun prevention
# ✅ Gradio UI only, no file upload, progress logs

import os
import io
import json
import hashlib

import fitz  # PyMuPDF
import chromadb
import nltk
import torch
import gradio as gr
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# sent_tokenize needs the punkt model; fetch it once if missing.
nltk.download("punkt", quiet=True)

# ---------------------------
# ⚙️ Config
# ---------------------------
MANUALS_FOLDER = "./Manuals"
CHROMA_PATH = "./chroma_store"
CHUNKS_FILE = "manual_chunks_with_ocr.jsonl"
HASH_FILE = "manuals.hash"
CHUNK_SIZE = 750      # max tokens (whitespace words) per chunk
CHUNK_OVERLAP = 100   # tokens carried over between consecutive chunks
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
HF_TOKEN = os.environ.get("HF_TOKEN")

collection = None
embedder = None
pipe = None


# ---------------------------
# 🔐 Load model and pipeline
# ---------------------------
def load_model():
    global pipe
    if HF_TOKEN is None:
        print("❌ HF_TOKEN is not set")
        return None
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, token=HF_TOKEN, torch_dtype=torch.float32
        )
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.2,
            top_p=0.9,
            do_sample=True,
            device=-1,  # CPU
        )
        print(f"✅ Model loaded: {MODEL_ID}")
        return tokenizer
    except Exception as e:
        print(f"❌ Model load failed: {e}")
        return None


# ---------------------------
# 📚 Utilities
# ---------------------------
def clean_text(text):
    lines = text.splitlines()
    return "\n".join(l.strip() for l in lines if l.strip())


def split_into_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Pack sentences greedily; carry ~`overlap` tokens into the next chunk."""
    chunks, current, cur_len = [], [], 0
    for sent in sentences:
        tok = len(sent.split())
        if current and cur_len + tok > max_tokens:
            chunks.append(" ".join(current))
            # Keep trailing sentences totalling at most `overlap` tokens
            # so consecutive chunks share context.
            kept, kept_len = [], 0
            for s in reversed(current):
                if kept_len + len(s.split()) > overlap:
                    break
                kept.insert(0, s)
                kept_len += len(s.split())
            current, cur_len = kept, kept_len
        current.append(sent)
        cur_len += tok
    if current:
        chunks.append(" ".join(current))
    return chunks


def hash_folder(folder):
    """SHA-256 over the bytes of every PDF in the folder, in sorted order."""
    hasher = hashlib.sha256()
    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(".pdf"):
            continue
        with open(os.path.join(folder, fname), "rb") as f:
            while chunk := f.read(8192):
                hasher.update(chunk)
    return hasher.hexdigest()


# ---------------------------
# 🔁 Indexing
# ---------------------------
def extract_and_chunk():
    from PIL import Image
    import pytesseract

    chunks = []
    for fname in tqdm(sorted(os.listdir(MANUALS_FOLDER))):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(MANUALS_FOLDER, fname)
        try:
            doc = fitz.open(path)
            for i, page in enumerate(doc):
                text = page.get_text()
                if not text.strip():
                    # Scanned page: rasterize at 300 dpi and OCR it.
                    img = Image.open(io.BytesIO(page.get_pixmap(dpi=300).tobytes("png")))
                    text = pytesseract.image_to_string(img)
                sents = sent_tokenize(clean_text(text))
                for j, chunk in enumerate(split_into_chunks(sents)):
                    chunks.append({
                        "source_file": fname,
                        "chunk_id": f"{fname}::p{i+1}::c{j+1}",
                        "page": i + 1,
                        "text": chunk.strip(),
                    })
        except Exception as e:
            print(f"Error reading {fname}: {e}")

    with open(CHUNKS_FILE, "w", encoding="utf-8") as f:
        for chunk in chunks:
            json.dump(chunk, f)
            f.write("\n")
    return chunks
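
# Optional chunker smoke test — a hedged sketch, not part of the pipeline.
# SMARTMANUALS_DEBUG is an assumption introduced here for illustration; set it
# locally to eyeball how the overlap carries sentences across chunk boundaries.
if os.environ.get("SMARTMANUALS_DEBUG"):
    _sents = sent_tokenize("One two three. Four five. Six seven eight nine.")
    for _c in split_into_chunks(_sents, max_tokens=5, overlap=2):
        print(repr(_c))
    # Expected: the second chunk begins with "Four five.", the ~2-token overlap.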

# ---------------------------
# 💾 ChromaDB Embedding
# ---------------------------
def embed_chunks():
    global collection, embedder
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    try:
        client.delete_collection("manual_chunks")
    except Exception:
        pass
    collection = client.create_collection("manual_chunks")

    with open(CHUNKS_FILE, "r", encoding="utf-8") as f:
        texts, ids, metas = [], [], []
        for line in f:
            item = json.loads(line)
            texts.append(item["text"])
            ids.append(item["chunk_id"])
            metas.append({"source_file": item["source_file"], "page": item["page"]})
            if len(texts) == 16:  # embed and add in small batches
                embs = embedder.encode(texts).tolist()
                collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
                texts, ids, metas = [], [], []
        if texts:
            embs = embedder.encode(texts).tolist()
            collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)


# ---------------------------
# 🔍 Semantic QA
# ---------------------------
def ask(question):
    if collection is None or embedder is None or pipe is None:
        return "App not ready."
    emb = embedder.encode(question).tolist()
    results = collection.query(query_embeddings=[emb], n_results=3)
    context = "\n\n".join(results["documents"][0])
    prompt = f"""
Use the context to answer. Say 'I don't know' if unsure.

Context:
{context}

Question: {question}
"""
    # return_full_text=False keeps the prompt out of the generated answer.
    return pipe(prompt, return_full_text=False)[0]["generated_text"]


# ---------------------------
# 🚀 App Startup
# ---------------------------
def initialize():
    global collection, embedder
    if not os.path.exists(MANUALS_FOLDER):
        os.makedirs(MANUALS_FOLDER)
    new_hash = hash_folder(MANUALS_FOLDER)
    if os.path.exists(HASH_FILE):
        with open(HASH_FILE, "r") as f:
            if f.read().strip() == new_hash and os.path.exists(CHUNKS_FILE):
                # Manuals unchanged: reuse the persisted index, but still load
                # the collection and embedder so ask() can answer.
                print("✅ Manuals unchanged. Skipping re-embedding.")
                try:
                    client = chromadb.PersistentClient(path=CHROMA_PATH)
                    collection = client.get_collection("manual_chunks")
                    embedder = SentenceTransformer("all-MiniLM-L6-v2")
                    return
                except Exception:
                    pass  # store missing or corrupt: fall through and re-index
    print("🔄 Indexing manuals...")
    extract_and_chunk()
    embed_chunks()
    with open(HASH_FILE, "w") as f:
        f.write(new_hash)
    print("✅ Embedding complete.")


# ---------------------------
# 🖥️ Gradio Interface
# ---------------------------
def build_ui():
    with gr.Blocks() as demo:
        gr.Markdown("## 🔍 Ask SmartManuals-AI")
        inp = gr.Textbox(label="Your question")
        out = gr.Textbox(label="Answer", lines=6)
        btn = gr.Button("Ask")
        btn.click(fn=ask, inputs=inp, outputs=out)
    return demo


# ---------------------------
# 🔧 Run App
# ---------------------------
load_model()
initialize()
demo = build_ui()
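
# Serve the UI. `demo.launch()` is the standard Gradio entry point; the Blocks
# app above is only built, not served, until it is called.
if __name__ == "__main__":
    demo.launch()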