# app.py
import os
import re
import tempfile

import gradio as gr
from docx import Document as DocxDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from pptx import Presentation
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
# Load reasoning model (lightweight and CPU-friendly)
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
reasoning_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)
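# Note: temperature and top_p only take effect because do_sample=True is set;
# with greedy decoding the transformers pipeline ignores (and warns about) them.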
# Embedding model for retrieval
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = None

# Summarizer for long answers
summary_pipeline = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
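# The summarizer returns [{"summary_text": "..."}]. distilbart-cnn handles at
# most 1024 input tokens, so very long inputs may need truncation=True.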
def clean_text(text):
    """Drop boilerplate lines (page/slide headers) and non-ASCII noise."""
    lines = text.split("\n")
    cleaned = []
    for line in lines:
        line = line.strip()
        # Skip headers/footers specific to the source slide deck
        if re.search(r'(Page \d+|Slide \d+|CS583|UIC|Bing Liu)', line, re.IGNORECASE):
            continue
        if len(line) < 3:
            continue
        # Replace runs of non-ASCII characters (bullets, math symbols) with a space
        line = re.sub(r'[^\x00-\x7F]+', ' ', line)
        cleaned.append(line)
    return "\n".join(cleaned)
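# Illustrative example of the cleaning behavior:
#   clean_text("Page 3\nNaive Bayes assumes feature independence.\n•")
#   -> "Naive Bayes assumes feature independence."
# ("Page 3" matches the boilerplate pattern; the lone bullet is too short to keep.)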
def extract_text(file_path, ext):
    if ext == ".pdf":
        reader = PdfReader(file_path)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    elif ext == ".docx":
        doc = DocxDocument(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    elif ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    elif ext == ".pptx":
        prs = Presentation(file_path)
        return "\n".join(
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
    else:
        raise ValueError("Unsupported file format")
def process_file(file):
    global vectorstore
    try:
        name = file.name if hasattr(file, "name") else str(file)
        ext = os.path.splitext(name)[1].lower()
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            # Robust handling of upload types: file-like, string path, bytes, NamedString
            if hasattr(file, "read"):
                file_bytes = file.read()
            elif isinstance(file, str) and os.path.exists(file):
                with open(file, "rb") as f:
                    file_bytes = f.read()
            elif isinstance(file, bytes):
                file_bytes = file
            else:
                # Fallback for Gradio's NamedString and other str-like types
                file_bytes = bytes(str(file), encoding="utf-8")
            tmp.write(file_bytes)
            tmp.flush()
        full_text = extract_text(tmp.name, ext)
        cleaned = clean_text(full_text)
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        chunks = splitter.split_text(cleaned)
        docs = [Document(page_content=c) for c in chunks]
        vectorstore = FAISS.from_documents(docs, embedding_model)
        return "✅ File processed. You can now ask questions."
    except Exception as e:
        return f"❌ Error: {e}"
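# Note: delete=False matters here. extract_text reopens tmp.name by path, and
# on Windows a NamedTemporaryFile cannot be reopened while its handle is open;
# leaving the temp file behind is an accepted trade-off for this demo.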
def generate_prompt(context, question):
    return f"""
You are a helpful academic tutor assisting a student strictly based on course slides or textbook material.

Context:
{context}

Question:
{question}

Instructions:
- Answer ONLY using the above context. Do NOT add outside knowledge.
- Think clearly and deeply before answering.
- Use structured academic language based strictly on the context.
- Use clean formatting with helpful headings and minimal bullet points.
- Do NOT repeat the question or include prompt labels.
- If the context lacks an answer, say: "The provided material does not contain sufficient information to answer this question accurately."
- Output must be academically concise, well-organized, and visually clear.
""".strip()
def detect_question_type(q):
    q = q.lower().strip()
    if q.startswith(("what is", "define", "give definition")):
        return "definition"
    elif q.startswith(("how", "explain", "why")):
        return "explanation"
    elif "difference between" in q or "compare" in q:
        return "comparison"
    elif q.startswith("list") or "types of" in q:
        return "list"
    return "general"
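# Illustrative examples:
#   detect_question_type("What is TF-IDF?")              -> "definition"
#   detect_question_type("Compare SVM and Naive Bayes")  -> "comparison"
#   detect_question_type("List the types of clustering") -> "list"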
def post_process_output(answer_text, question):
    qtype = detect_question_type(question)
    label_map = {
        "definition": "📘 **Definition**",
        "explanation": "📘 **Explanation**",
        "comparison": "📘 **Comparison**",
        "list": "📘 **Key Points**",
        "general": "📘 **Insight**",
    }
    answer_text = f"{label_map.get(qtype)}\n\n{answer_text}"
    # Append a short summary for long answers
    if len(answer_text.split()) > 80:
        summary = summary_pipeline(answer_text, max_length=60, min_length=25, do_sample=False)[0]["summary_text"]
        answer_text += f"\n\n📝 **Summary:** {summary.strip()}"
    return answer_text
def ask_question(question):
    global vectorstore
    if vectorstore is None:
        return "⚠️ Please upload and process a file first."
    docs = vectorstore.similarity_search(question, k=3)
    if not docs:
        return "⚠️ No relevant information found."
    context = "\n".join(doc.page_content for doc in docs)
    prompt = generate_prompt(context, question)
    result = reasoning_pipeline(prompt)[0]["generated_text"]
    # Strip any prompt labels the model may have echoed back
    for marker in ["Context:", "Question:", "Instructions:"]:
        if marker in result:
            result = result.split(marker)[-1].strip()
    # Trim a trailing partial sentence
    if "." in result:
        result = result.rsplit(".", 1)[0] + "."
    return post_process_output(result.strip(), question)
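# End-to-end sketch (illustrative; "notes.pdf" is a hypothetical local file):
#   print(process_file("notes.pdf"))
#   print(ask_question("What is the difference between precision and recall?"))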
# Gradio UI
with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("""
    # 📚 AI Study Assistant
    Upload your lecture slides or text files, ask questions, and get answers grounded in your material, powered by FLAN-T5.
    """)
    with gr.Tab("Upload & Ask"):
        with gr.Row():
            file_input = gr.File(label="📎 Upload File", file_types=[".pdf", ".docx", ".pptx", ".txt"])
            upload_btn = gr.Button("Upload")
        upload_output = gr.Textbox(label="Upload Status", interactive=False)
        upload_btn.click(fn=process_file, inputs=file_input, outputs=upload_output)
        gr.Markdown("---")
        with gr.Row():
            question = gr.Textbox(label="❓ Ask a question")
            ask_btn = gr.Button("Ask")
        answer = gr.Textbox(label="💡 Answer", interactive=False)
        ask_btn.click(fn=ask_question, inputs=question, outputs=answer)
    with gr.Tab("History"):
        gr.Markdown("""
        **⏳ Coming Soon**: Question-answer history, summarization view, and more!
        """)
if __name__ == "__main__":
    demo.launch()