# ✅ app.py (Final Hugging Face Version for SmartManuals-AI)
# ✅ No metadata filtering; all semantic search with keyword reranking
# ✅ Auto-index from Manuals/ on startup, with rerun prevention
# ✅ Gradio UI only, no file upload, progress logs

import io  # needed for the OCR fallback below
import os
import json
import hashlib

import fitz  # PyMuPDF
import chromadb
import torch
import gradio as gr
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
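# NOTE (assumption): sent_tokenize needs NLTK's "punkt" tokenizer data, which
# a fresh Space will not have; downloading it quietly at startup is a minimal
# safeguard (newer NLTK versions may additionally want "punkt_tab").
import nltk
nltk.download("punkt", quiet=True)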
# ---------------------------
# ⚙️ Config
# ---------------------------
MANUALS_FOLDER = "./Manuals"
CHROMA_PATH = "./chroma_store"
CHUNKS_FILE = "manual_chunks_with_ocr.jsonl"
HASH_FILE = "manuals.hash"
CHUNK_SIZE = 750
CHUNK_OVERLAP = 100
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
HF_TOKEN = os.environ.get("HF_TOKEN")  # required: the Llama 3.1 weights are gated

collection = None
embedder = None
pipe = None
# ---------------------------
# 🚀 Load model and pipeline
# ---------------------------
def load_model():
    global pipe
    if HF_TOKEN is None:
        print("❌ HF_TOKEN is not set")
        return None
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, token=HF_TOKEN, torch_dtype=torch.float32
        )
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.2,
            top_p=0.9,
            do_sample=True,
            device=-1,  # CPU inference
        )
        print(f"✅ Model loaded: {MODEL_ID}")
        return tokenizer
    except Exception as e:
        print(f"❌ Model load failed: {e}")
        return None
# ---------------------------
# 🔧 Utilities
# ---------------------------
def clean_text(text):
    lines = text.splitlines()
    return "\n".join(l.strip() for l in lines if l.strip())

def split_into_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    chunks, current, cur_len = [], [], 0
    for sent in sentences:
        tok = len(sent.split())
        if cur_len + tok > max_tokens and current:
            chunks.append(" ".join(current))
            # Overlap is measured in words: keep only the trailing sentences
            # totalling at most `overlap` words as context for the next chunk.
            # (Slicing `current[-overlap:]` would keep up to 100 *sentences*,
            # i.e. usually the whole chunk, so chunks would barely shrink.)
            kept, kept_len = [], 0
            for s in reversed(current):
                if kept_len + len(s.split()) > overlap:
                    break
                kept.insert(0, s)
                kept_len += len(s.split())
            current, cur_len = kept, kept_len
        current.append(sent)
        cur_len += tok
    if current:
        chunks.append(" ".join(current))
    return chunks

def hash_folder(folder):
    hasher = hashlib.sha256()
    for fname in sorted(os.listdir(folder)):
        if fname.endswith(".pdf"):
            with open(os.path.join(folder, fname), "rb") as f:
                while chunk := f.read(8192):
                    hasher.update(chunk)
    return hasher.hexdigest()
# ---------------------------
# 📄 Indexing
# ---------------------------
def extract_and_chunk():
    from PIL import Image
    import pytesseract

    chunks = []
    for fname in tqdm(sorted(os.listdir(MANUALS_FOLDER))):
        if not fname.endswith(".pdf"):
            continue
        path = os.path.join(MANUALS_FOLDER, fname)
        try:
            doc = fitz.open(path)
            for i, page in enumerate(doc):
                text = page.get_text()
                if not text.strip():  # scanned page: fall back to OCR
                    img = Image.open(io.BytesIO(page.get_pixmap(dpi=300).tobytes("png")))
                    text = pytesseract.image_to_string(img)
                sents = sent_tokenize(clean_text(text))
                for j, chunk in enumerate(split_into_chunks(sents)):
                    chunks.append({
                        "source_file": fname,
                        "chunk_id": f"{fname}::p{i+1}::c{j+1}",
                        "page": i + 1,
                        "text": chunk.strip(),
                    })
        except Exception as e:
            print(f"Error reading {fname}: {e}")

    with open(CHUNKS_FILE, "w", encoding="utf-8") as f:
        for chunk in chunks:
            json.dump(chunk, f)
            f.write("\n")
    return chunks
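# NOTE (assumption): pytesseract only wraps the Tesseract binary, which is not
# installed on a Space by default. Listing "tesseract-ocr" in a packages.txt
# at the repo root is the usual way to provide it.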
# ---------------------------
# 💾 ChromaDB Embedding
# ---------------------------
def embed_chunks():
    global collection, embedder
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    try:
        client.delete_collection("manual_chunks")
    except Exception:
        pass  # collection may not exist yet
    collection = client.create_collection("manual_chunks")
    with open(CHUNKS_FILE, "r", encoding="utf-8") as f:
        metas, ids, texts = [], [], []
        for line in f:
            item = json.loads(line)
            texts.append(item["text"])
            ids.append(item["chunk_id"])
            metas.append({"source_file": item["source_file"], "page": item["page"]})
            if len(texts) == 16:  # embed and add in small batches
                embs = embedder.encode(texts).tolist()
                collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
                texts, ids, metas = [], [], []
        if texts:  # flush the final partial batch
            embs = embedder.encode(texts).tolist()
            collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
# ---------------------------
# 🔍 Semantic QA
# ---------------------------
def ask(question):
    if collection is None or embedder is None or pipe is None:
        return "App not ready."
    emb = embedder.encode(question).tolist()
    results = collection.query(query_embeddings=[emb], n_results=3)
    context = "\n\n".join(results["documents"][0])
    prompt = f"""
Use the context to answer. Say 'I don't know' if unsure.

Context:
{context}

Question: {question}
"""
    # return_full_text=False strips the prompt from the generated answer
    return pipe(prompt, return_full_text=False)[0]["generated_text"]
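# The header above promises keyword reranking, which ask() does not yet do.
# A minimal sketch (the helper name is hypothetical): over-fetch from Chroma,
# boost chunks sharing literal words with the question, then truncate.
def rerank_by_keywords(question, docs, top_k=3):
    q_words = set(question.lower().split())
    scored = sorted(docs, key=lambda d: -len(q_words & set(d.lower().split())))
    return scored[:top_k]
# Usage inside ask():
#   results = collection.query(query_embeddings=[emb], n_results=10)
#   context = "\n\n".join(rerank_by_keywords(question, results["documents"][0]))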
# ---------------------------
# 🔁 App Startup
# ---------------------------
def initialize():
    global collection, embedder
    if not os.path.exists(MANUALS_FOLDER):
        os.makedirs(MANUALS_FOLDER)
    new_hash = hash_folder(MANUALS_FOLDER)
    if os.path.exists(HASH_FILE):
        with open(HASH_FILE, "r") as f:
            if f.read().strip() == new_hash and os.path.exists(CHUNKS_FILE):
                print("✅ Manuals unchanged. Skipping re-embedding.")
                # Still reconnect to the persisted store; otherwise collection
                # and embedder stay None and ask() reports "App not ready."
                embedder = SentenceTransformer("all-MiniLM-L6-v2")
                client = chromadb.PersistentClient(path=CHROMA_PATH)
                collection = client.get_or_create_collection("manual_chunks")
                return
    print("📄 Indexing manuals...")
    extract_and_chunk()
    embed_chunks()
    with open(HASH_FILE, "w") as f:
        f.write(new_hash)
    print("✅ Embedding complete.")
# ---------------------------
# 🖥️ Gradio Interface
# ---------------------------
def build_ui():
    with gr.Blocks() as demo:
        gr.Markdown("## 🔍 Ask SmartManuals-AI")
        inp = gr.Textbox(label="Your question")
        out = gr.Textbox(label="Answer", lines=6)
        btn = gr.Button("Ask")
        btn.click(fn=ask, inputs=inp, outputs=out)
    return demo
# ---------------------------
# 🧠 Run App
# ---------------------------
load_model()
initialize()
demo = build_ui()
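# NOTE (assumption): recent Gradio SDK Spaces can pick up a module-level
# `demo`, but launching explicitly is the conventional, portable ending.
if __name__ == "__main__":
    demo.launch()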