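# SmartManuals-AI (Hugging Face Space app)
# Retrieval-augmented QA over PDF/DOCX manuals: documents are chunked, embedded with
# sentence-transformers into a persistent ChromaDB store, and questions are answered
# by an instruction-tuned LLM selected in the Gradio UI.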
import os
import json
import fitz  # PyMuPDF
import docx
import chromadb
import torch
import nltk
import gradio as gr
from tqdm import tqdm
from typing import List
from PIL import Image
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
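# Likely pip dependencies for the imports above: pymupdf (fitz), python-docx, chromadb,
# torch, nltk, gradio, tqdm, pillow, sentence-transformers, transformers.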
# --- Ensure punkt tokenizer is available ---
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
# --- Configuration ---
MANUALS_FOLDER = "./Manuals"
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "manual_chunks"
MODEL_OPTIONS = {
    "LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
    "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
    "Gemma 7B": "google/gemma-1.1-7b-it",
}
HF_TOKEN = os.environ.get("HF_TOKEN")
MAX_CONTEXT_CHUNKS = 3
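# MAX_CONTEXT_CHUNKS caps how many retrieved chunks are placed into the prompt.
# HF_TOKEN is needed if any of the models above are gated on the Hub (Llama and Gemma typically are).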
# --- Utility Functions ---
def extract_text_from_pdf(path):
    """Extract plain text from every page of a PDF via PyMuPDF."""
    try:
        doc = fitz.open(path)
        return "\n".join(page.get_text().strip() for page in doc)
    except Exception as e:
        print(f"[PDF Error] {path}: {e}")
        return ""


def extract_text_from_docx(path):
    """Extract plain text from all paragraphs of a DOCX file."""
    try:
        doc = docx.Document(path)
        return "\n".join(para.text.strip() for para in doc.paragraphs)
    except Exception as e:
        print(f"[DOCX Error] {path}: {e}")
        return ""


def clean(text):
    """Drop blank lines and trim surrounding whitespace."""
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())
def split_sentences(text):
    try:
        return sent_tokenize(text)
    except Exception as e:
        print(f"[Tokenizer Error] {e}. Falling back to simple split.")
        return text.split(". ")
def chunk_sentences(sentences, max_tokens=500, overlap=50):
    """Group sentences into ~max_tokens-word chunks, carrying ~overlap words between chunks."""
    chunks = []
    current = []
    total = 0
    for sentence in sentences:
        count = len(sentence.split())
        if current and total + count > max_tokens:
            chunks.append(" ".join(current))
            # Keep trailing sentences (up to ~overlap words) as context for the next chunk
            carried = []
            carried_words = 0
            for prev in reversed(current):
                carried_words += len(prev.split())
                carried.insert(0, prev)
                if carried_words >= overlap:
                    break
            current = carried
            total = carried_words
        current.append(sentence)
        total += count
    if current:
        chunks.append(" ".join(current))
    return chunks
def embed_all():
    """(Re)build the Chroma collection from every PDF/DOCX in MANUALS_FOLDER."""
    db = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        db.delete_collection(COLLECTION_NAME)
    except Exception:
        pass  # Collection did not exist yet
    collection = db.create_collection(COLLECTION_NAME)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    all_chunks = []
    for fname in os.listdir(MANUALS_FOLDER):
        path = os.path.join(MANUALS_FOLDER, fname)
        if fname.lower().endswith(".pdf"):
            text = extract_text_from_pdf(path)
        elif fname.lower().endswith(".docx"):
            text = extract_text_from_docx(path)
        else:
            continue
        sents = split_sentences(clean(text))
        chunks = chunk_sentences(sents)
        for idx, chunk in enumerate(chunks):
            chunk_id = f"{fname}::chunk_{idx}"
            all_chunks.append({"id": chunk_id, "text": chunk, "metadata": {"source": fname}})
    # Embed and add to Chroma in small batches
    for i in tqdm(range(0, len(all_chunks), 16), desc="Embedding"):
        batch = all_chunks[i:i + 16]
        docs = [x["text"] for x in batch]
        ids = [x["id"] for x in batch]
        metas = [x["metadata"] for x in batch]
        embs = embedder.encode(docs).tolist()
        collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
    return collection, embedder
# Build the retrieval index once at startup so each query does not re-embed every manual.
collection, embedder = embed_all()


def answer_query(query, model_choice):
    # Retrieve the most relevant chunks for the question
    query_embedding = embedder.encode(query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=MAX_CONTEXT_CHUNKS)
    context = "\n\n".join(results["documents"][0])
    # Load the selected model (reloaded on every call; consider caching for faster repeat queries)
    model_id = MODEL_OPTIONS.get(model_choice)
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    prompt = f"""
Context:
{context}

Question: {query}
Answer:"""
    out = pipe(prompt, max_new_tokens=300, do_sample=False)
    return out[0]["generated_text"].split("Answer:")[-1].strip()
# --- UI ---
with gr.Blocks() as demo:
    gr.Markdown(
        """# 📘 SmartManuals-AI
Ask technical questions from manuals (PDF & DOCX) with LLM + OCR + RAG.
"""
    )
    with gr.Row():
        question = gr.Textbox(label="Your Question", placeholder="e.g., How do I reset the console?")
        model_choice = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="LLaMA 3.1 8B", label="Model")
    answer = gr.Textbox(label="Answer")
    submit = gr.Button("Ask")
    submit.click(fn=answer_query, inputs=[question, model_choice], outputs=answer)

demo.launch()