from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader
import torch
import os

app = FastAPI(
    title="Harshal AI Backend",
    version="1.0.0",
)

# CORS so the Next.js frontend can call the backend
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# ======================================
# 1) LOAD MAIN MODEL (Qwen2.5 1.5B)
# ======================================
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    device_map="cpu",
)
llm.eval()

# ======================================
# 2) LOAD EMBEDDING MODEL + RAG
# ======================================
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBED_MODEL)

RESUME = "resume.pdf"
resume_rag = None


def chunk_text(text, max_chars=450, overlap=80):
    """Split text into overlapping character chunks for embedding."""
    text = " ".join(text.split())
    chunks, start = [], 0
    while start < len(text):
        end = start + max_chars
        chunks.append(text[start:end])
        start = end - overlap
    return chunks


def build_rag():
    """Read resume.pdf, chunk it, and embed the chunks for retrieval."""
    global resume_rag
    if not os.path.exists(RESUME):
        print("⚠ resume.pdf NOT FOUND — RAG disabled")
        return
    reader = PdfReader(RESUME)
    text = ""
    for p in reader.pages:
        text += p.extract_text() or ""
    chunks = chunk_text(text)
    embs = embedder.encode(chunks, convert_to_tensor=True, normalize_embeddings=True)
    resume_rag = {"chunks": chunks, "embs": embs}
    print("✅ RAG Ready with", len(chunks), "chunks")


def get_rag_context(query):
    """Return the resume chunks most similar to the query, or '' if RAG is disabled."""
    if resume_rag is None:
        return ""
    q = embedder.encode([query], convert_to_tensor=True, normalize_embeddings=True)[0]
    sims = torch.nn.functional.cosine_similarity(q.unsqueeze(0), resume_rag["embs"])
    k = min(3, len(resume_rag["chunks"]))
    top = torch.topk(sims, k=k)
    return "\n\n".join(resume_rag["chunks"][i] for i in top.indices)


build_rag()

# ======================================
# 3) SWAGGER DATA MODELS
# ======================================
class Msg(BaseModel):
    role: str
    content: str


class ChatReq(BaseModel):
    messages: List[Msg]
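
# The /chat route defined below expects a JSON body matching ChatReq. A sample
# call (assuming the server is running locally on port 8000; the message text
# is made up for illustration):
#
#   curl -X POST http://localhost:8000/chat \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Which projects are on your resume?"}]}'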

# ======================================
# 4) CHAT ROUTE (Human-like Harshal)
# ======================================
@app.post("/chat")
def chat(req: ChatReq):
    user_msg = req.messages[-1].content.strip()
    rag = get_rag_context(user_msg)

    persona = f"""
You are Harshal Sonawane, a real human software engineer from Pune.

Rules:
- Do NOT say you are an AI or model.
- Answer in a human, natural tone.
- Keep replies short (2–4 sentences).
- Use resume facts when relevant.
- If unknown, answer honestly.

Resume context:
{rag}
""".strip()

    messages = [
        {"role": "system", "content": persona},
        {"role": "user", "content": user_msg},
    ]

    ids = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True,
    ).to(llm.device)

    out = llm.generate(
        ids,
        max_new_tokens=150,
        temperature=0.5,
        top_p=0.9,
        repetition_penalty=1.05,
        do_sample=True,
    )

    gen = out[0][ids.shape[-1]:]
    reply = tokenizer.decode(gen, skip_special_tokens=True).strip()
    return {"reply": reply}


@app.get("/")
def health():
    return {"status": "Harshal AI backend is running with Qwen 1.5B + RAG 🎯"}
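
# A minimal way to run this file directly (a sketch, not part of the original
# app; assumes uvicorn is installed and that port 8000 is free):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)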