# Earlier single-model version of this backend (Qwen2.5-0.5B, no RAG), kept commented out for reference:
#
# from fastapi import FastAPI
# from pydantic import BaseModel
# from fastapi.openapi.utils import get_openapi
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch
#
# app = FastAPI(
#     title="Harshal AI Backend",
#     version="1.0.0",
# )
#
# MODEL_NAME = "Qwen/Qwen2.5-0.5B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     torch_dtype=torch.float32,
#     device_map="cpu",
# )
#
# class ChatMessage(BaseModel):
#     messages: list
#
# @app.get("/")
# def home():
#     return {"message": "Harshal AI backend running with Qwen 0.5B!"}
#
# @app.post("/chat")
# def chat(body: ChatMessage):
#     user_msg = body.messages[-1]["content"]
#     prompt = f"User: {user_msg}\nAssistant:"
#     inputs = tokenizer(prompt, return_tensors="pt")
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=120,
#         pad_token_id=tokenizer.eos_token_id,
#         temperature=0.4,
#     )
#     text = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     reply = text.split("Assistant:")[-1].strip()
#     return {"reply": reply}
#
# @app.get("/openapi.json")
# def openapi_json():
#     return get_openapi(
#         title="Harshal AI Backend",
#         version="1.0.0",
#         routes=app.routes
#     )

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader
import os
import torch

app = FastAPI(title="Harshal AI Backend", version="1.0.0")

# CORS (Next.js frontend)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# ============================================================
# 1) LOAD MAIN MODEL (Phi-3 Mini — good balance of quality/speed)
# ============================================================
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float32,  # instead of torch_dtype
)
llm.eval()

# ============================================================
# 2) LOAD EMBEDDINGS + BUILD RAG FROM resume.pdf
# ============================================================
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBED_MODEL)

RESUME_FILE = "resume.pdf"
resume_rag = None


def chunk_text(text, max_chars=450, overlap=80):
    """Simple overlapping chunks."""
    text = " ".join(text.split())
    chunks, start = [], 0
    while start < len(text):
        end = start + max_chars
        chunks.append(text[start:end])
        start = end - overlap
    return chunks


def build_rag():
    """Reads resume.pdf → chunks → embeddings."""
    global resume_rag
    if not os.path.exists(RESUME_FILE):
        print("⚠ resume.pdf NOT FOUND — RAG disabled.")
        return

    reader = PdfReader(RESUME_FILE)
    full_text = ""
    for page in reader.pages:
        full_text += page.extract_text() or ""

    chunks = chunk_text(full_text)
    embeddings = embedder.encode(
        chunks, convert_to_tensor=True, normalize_embeddings=True
    )
    resume_rag = {"chunks": chunks, "embs": embeddings}
    print("✅ Resume RAG built with", len(chunks), "chunks")


build_rag()


def retrieve_rag(query, top_k=3):
    """Find most relevant resume chunks."""
    if resume_rag is None:
        return ""
    q = embedder.encode([query], convert_to_tensor=True, normalize_embeddings=True)[0]
    sims = torch.nn.functional.cosine_similarity(q.unsqueeze(0), resume_rag["embs"])
    top = torch.topk(sims, k=min(top_k, len(sims)))
    return "\n\n".join(resume_rag["chunks"][i] for i in top.indices)
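
# Quick sanity check for retrieval (a minimal sketch, not part of the API; the
# query string below is only an illustrative example):
#
#   >>> retrieve_rag("What is Harshal's current company?")
#   # returns up to `top_k` resume chunks joined by blank lines,
#   # or "" when resume.pdf is missing and RAG is disabled
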
# ============================================================
# 3) REQUEST MODELS
# ============================================================
class Message(BaseModel):
    role: str
    content: str


class ChatReq(BaseModel):
    messages: List[Message]


# ============================================================
# 4) SMART MODE CHAT ROUTE (Resume + Fallback)
# ============================================================
@app.post("/chat")
def chat(req: ChatReq):
    user_msg = req.messages[-1].content.strip()

    # RAG: take from resume
    rag_context = retrieve_rag(user_msg)

    # Detect if we found resume info
    has_relevant_rag = rag_context and len(rag_context) > 40

    # SMART RULE:
    # If resume has relevant info → answer ONLY from resume
    # If resume has nothing → fallback using general persona
    if has_relevant_rag:
        fallback_note = ""
        resume_instruction = """
Use ONLY the details provided in the resume context.
Do NOT invent any new companies, dates, or roles.
Stay strictly consistent with:
- Current company: Börm Bruckmeier Infotech India Pvt. Ltd.
- Past experience: SarvaTech Consultants (Internship)
- Personal projects listed in resume.
"""
    else:
        fallback_note = """
The resume does NOT mention this.
Provide a short helpful answer based on Harshal’s background, tone, and skills —
BUT do NOT invent companies, dates, or job titles not present in the resume.
"""
        resume_instruction = ""

    # System persona
    system_prompt = f"""
You are Harshal Sonawane — a real human software engineer from Pune, India.

Tone:
- Natural, confident, friendly.
- 2–8 sentences max.
- Never say you are an AI.

Your resume (RAG context):
{rag_context}

Instructions:
{resume_instruction}

Fallback rule:
{fallback_note}
""".strip()

    # Chat template
    msgs = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_msg},
    ]

    input_ids = tokenizer.apply_chat_template(
        msgs,
        return_tensors="pt",
        add_generation_prompt=True,
    )

    with torch.no_grad():
        output_ids = llm.generate(
            input_ids,
            max_new_tokens=160,
            temperature=0.55,
            top_p=0.9,
            repetition_penalty=1.06,
            do_sample=True,
        )

    gen = output_ids[0][input_ids.shape[-1]:]
    reply = tokenizer.decode(gen, skip_special_tokens=True).strip()

    return {"reply": reply}


# ============================================================
# 5) HEALTH CHECK
# ============================================================
@app.get("/")
def health():
    return {
        "status": "Harshal AI backend running (SMART MODE)",
        "model": MODEL_NAME,
    }
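

# ============================================================
# 6) LOCAL RUN (sketch)
# ============================================================
# A minimal way to start the API directly; the module name "main" in the
# commands below and port 8000 are assumptions, so adjust to the real
# filename and deployment setup.
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
# Example request (hypothetical payload matching the ChatReq schema):
#
#   curl -X POST http://localhost:8000/chat \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Where do you work?"}]}'
if __name__ == "__main__":
    import uvicorn  # assumes uvicorn is installed

    uvicorn.run(app, host="0.0.0.0", port=8000)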