MohamedFahim committed
Commit e3ad748 · verified · 1 Parent(s): 2db5397

Update main_api.py

Files changed (1)
  1. main_api.py +531 -140
main_api.py CHANGED
@@ -1,21 +1,234 @@
1
- import pymupdf4llm
2
  import fitz # PyMuPDF
 
3
  import faiss
4
- from sentence_transformers import SentenceTransformer
5
  from typing import List, Optional
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter
7
- import markdown
8
- import uuid
9
- from pathlib import Path
10
 
11
- # Initialize embedding model (add at top with other initializations)
 
12
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
13
 
14
- # FAISS index storage (in-memory, you can persist to disk)
15
- vector_stores = {} # Store multiple FAISS indexes by collection name
16
 
17
  class DocumentUpload(BaseModel):
18
- """Model for document upload response"""
19
  file_id: str
20
  filename: str
21
  file_type: str
@@ -23,22 +236,80 @@ class DocumentUpload(BaseModel):
23
  storage_path: str
24
 
25
  class RAGQueryRequest(BaseModel):
26
- """Model for RAG query with collection specification"""
27
  query: str
28
  collection_name: str
29
  top_k: Optional[int] = 3
30
 
31
  class VectorStoreInfo(BaseModel):
32
- """Information about vector store collection"""
33
  collection_name: str
34
  total_chunks: int
35
  dimension: int
36
 
37
- # Utility Functions
38
 
39
  def chunk_document(text: str, file_type: str, chunk_size: int = 1000, chunk_overlap: int = 200):
40
  """Chunk document based on file type"""
41
- if file_type == "markdown" or file_type == "md":
42
  splitter = MarkdownTextSplitter(
43
  chunk_size=chunk_size,
44
  chunk_overlap=chunk_overlap
@@ -54,68 +325,31 @@ def chunk_document(text: str, file_type: str, chunk_size: int = 1000, chunk_over
54
  logger.info(f"Created {len(chunks)} chunks from document")
55
  return chunks
56
 
57
- def create_or_update_vector_store(collection_name: str, chunks: List[str], metadata: List[dict]):
58
- """Create or update FAISS vector store with new chunks"""
59
- # Generate embeddings
60
- embeddings = embedding_model.encode(chunks, show_progress_bar=True)
61
-
62
- if collection_name in vector_stores:
63
- # Add to existing index
64
- index_data = vector_stores[collection_name]
65
- index_data['index'].add(embeddings)
66
- index_data['chunks'].extend(chunks)
67
- index_data['metadata'].extend(metadata)
68
- else:
69
- # Create new index
70
- dimension = embeddings.shape[1]
71
- index = faiss.IndexFlatL2(dimension)
72
- index.add(embeddings)
73
-
74
- vector_stores[collection_name] = {
75
- 'index': index,
76
- 'chunks': chunks,
77
- 'metadata': metadata,
78
- 'dimension': dimension
79
- }
80
-
81
- logger.info(f"Vector store '{collection_name}' now has {len(vector_stores[collection_name]['chunks'])} chunks")
82
- return len(chunks)
83
-
84
- def extract_text_from_pdf(file_bytes: bytes) -> str:
85
- """Extract text from PDF using PyMuPDF with markdown formatting"""
86
  try:
87
- # Use pymupdf4llm for better markdown extraction
88
- pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
89
  md_text = pymupdf4llm.to_markdown(pdf_doc)
90
  return md_text
91
  except Exception as e:
92
  logger.error(f"Error extracting PDF: {e}")
93
- # Fallback to basic extraction
94
- pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
95
  text = ""
96
  for page in pdf_doc:
97
  text += page.get_text()
98
  return text
99
 
100
- def extract_text_from_markdown(file_bytes: bytes) -> str:
101
  """Extract text from markdown file"""
102
- return file_bytes.decode('utf-8')
103
-
104
- # New Endpoints
105
 
106
  @app.post("/upload_document", response_model=DocumentUpload)
107
  async def upload_document(
108
  file: UploadFile = File(...),
109
  collection_name: Optional[str] = "default"
110
  ):
111
- """
112
- Upload and process PDF or Markdown documents for RAG.
113
- Creates chunks and stores in FAISS vector database.
114
- """
115
- if not supabase:
116
- raise HTTPException(status_code=500, detail="Supabase not configured")
117
-
118
- # Validate file type
119
  allowed_types = {
120
  "application/pdf": "pdf",
121
  "text/markdown": "markdown",
@@ -124,49 +358,47 @@ async def upload_document(
124
 
125
  if file.content_type not in allowed_types:
126
  raise HTTPException(
127
- status_code=415,
128
  detail=f"Unsupported file type. Allowed: PDF, Markdown, TXT"
129
  )
130
 
131
  try:
132
- # Read file content
133
- file_bytes = await file.read()
134
  file_type = allowed_types[file.content_type]
 
135
 
136
- # Extract text based on file type
 
 
 
 
 
137
  if file_type == "pdf":
138
- text_content = extract_text_from_pdf(file_bytes)
139
- elif file_type in ["markdown", "txt"]:
140
- text_content = extract_text_from_markdown(file_bytes)
141
  else:
142
- raise HTTPException(status_code=400, detail="Unsupported file type")
143
 
144
  if not text_content.strip():
145
- raise HTTPException(status_code=400, detail="No text content extracted from file")
146
 
147
- # Generate unique file ID
148
- file_id = str(uuid.uuid4())
149
- storage_filename = f"{file_id}_{file.filename}"
150
 
151
- # Upload original file to Supabase
152
- try:
153
- supabase.storage.from_("url-2-ans-bucket").upload(
154
- path=storage_filename,
155
- file=file_bytes,
156
- file_options={"content-type": file.content_type}
157
- )
158
- except Exception:
159
- # Try update if file exists
160
- supabase.storage.from_("url-2-ans-bucket").update(
161
- path=storage_filename,
162
- file=file_bytes,
163
- file_options={"content-type": file.content_type}
164
- )
165
-
166
- # Chunk the document
167
  chunks = chunk_document(text_content, file_type)
168
 
169
- # Create metadata for each chunk
 
170
  metadata = [
171
  {
172
  "file_id": file_id,
@@ -179,31 +411,36 @@ async def upload_document(
179
  ]
180
 
181
  # Add to vector store
182
- chunks_created = create_or_update_vector_store(collection_name, chunks, metadata)
183
 
184
  return DocumentUpload(
185
  file_id=file_id,
186
  filename=file.filename,
187
  file_type=file_type,
188
  chunks_created=chunks_created,
189
- storage_path=f"supabase://url-2-ans-bucket/{storage_filename}"
190
  )
191
 
192
  except HTTPException:
193
  raise
194
  except Exception as e:
195
  logger.exception("Error in upload_document")
196
- raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
197
- finally:
198
- await file.close()
199
-
200
 
201
  @app.post("/upload_multiple_documents")
202
  async def upload_multiple_documents(
203
  files: List[UploadFile] = File(...),
204
  collection_name: Optional[str] = "default"
205
  ):
206
- """Upload multiple documents at once"""
207
  results = []
208
  errors = []
209
 
@@ -221,38 +458,30 @@ async def upload_multiple_documents(
221
  "errors": errors
222
  }
223
 
224
-
225
  @app.post("/query_documents")
226
  async def query_documents(request: RAGQueryRequest):
227
- """
228
- Query documents using RAG with FAISS vector search.
229
- Returns answer with source citations.
230
- """
231
- if request.collection_name not in vector_stores:
232
  raise HTTPException(
233
- status_code=404,
234
- detail=f"Collection '{request.collection_name}' not found. Upload documents first."
235
  )
236
 
237
  try:
238
- # Get vector store data
239
- store_data = vector_stores[request.collection_name]
240
- index = store_data['index']
241
- chunks = store_data['chunks']
242
- metadata = store_data['metadata']
243
-
244
  # Generate query embedding
245
  query_embedding = embedding_model.encode([request.query])
 
246
 
247
  # Search in FAISS
248
- distances, indices = index.search(query_embedding, min(request.top_k, len(chunks)))
249
-
250
- # Get top-k chunks
251
- retrieved_chunks = [chunks[i] for i in indices[0]]
252
- retrieved_metadata = [metadata[i] for i in indices[0]]
253
 
254
- # Check if results are relevant (threshold-based)
255
- if distances[0][0] > 1.5: # Adjust threshold as needed
256
  return {
257
  "answer": "I couldn't find this information in the provided documents.",
258
  "sources": [],
@@ -260,16 +489,20 @@ async def query_documents(request: RAGQueryRequest):
260
  "collection": request.collection_name
261
  }
262
 
263
- # Create context from retrieved chunks
264
  context_text = "\n\n".join([
265
- f"[Source {i+1} - {meta['filename']}]:\n{chunk}"
266
  for i, (chunk, meta) in enumerate(zip(retrieved_chunks, retrieved_metadata))
267
  ])
268
 
269
- # Generate answer using Groq
270
  answer = process_with_groq(request.query, context_text)
271
 
272
- # Prepare source citations
273
  sources = [
274
  {
275
  "filename": meta['filename'],
@@ -292,38 +525,196 @@ async def query_documents(request: RAGQueryRequest):
292
  logger.exception("Error in query_documents")
293
  raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
294
 
295
-
296
  @app.get("/list_collections")
297
  async def list_collections():
298
- """List all available vector store collections"""
299
- collections = []
300
- for name, data in vector_stores.items():
301
- collections.append(VectorStoreInfo(
302
- collection_name=name,
303
- total_chunks=len(data['chunks']),
304
- dimension=data['dimension']
305
- ))
306
  return {"collections": collections}
307
 
308
-
309
  @app.delete("/delete_collection/{collection_name}")
310
  async def delete_collection(collection_name: str):
311
- """Delete a vector store collection"""
312
- if collection_name not in vector_stores:
313
- raise HTTPException(status_code=404, detail="Collection not found")
314
-
315
- del vector_stores[collection_name]
316
- return {"message": f"Collection '{collection_name}' deleted successfully"}
317
-
318
 
319
  @app.get("/health_check")
320
  async def health_check():
321
- """Check system health and configuration"""
322
  return {
323
  "status": "healthy",
324
  "supabase_configured": supabase is not None,
325
- "groq_configured": groq_api_key is not None,
326
  "embedding_model": "all-MiniLM-L6-v2",
327
- "vector_stores": len(vector_stores),
328
- "total_chunks": sum(len(store['chunks']) for store in vector_stores.values())
329
  }
 
1
+ import os
2
+ import logging
3
+ import time
4
+ import random
5
+ import json
6
+ import numpy as np
7
+ import uvicorn
8
  import fitz # PyMuPDF
9
+ import pymupdf4llm
10
  import faiss
11
+ from pathlib import Path
12
  from typing import List, Optional
13
+ from urllib.parse import urlparse, urljoin
14
+ from fastapi import FastAPI, HTTPException, File, UploadFile
15
+ from fastapi.middleware.cors import CORSMiddleware
16
+ from pydantic import BaseModel
17
+ from bs4 import BeautifulSoup
18
+ import requests
19
+ from sklearn.metrics.pairwise import cosine_similarity
20
+ from supabase import create_client, Client
21
+ from groq import Groq
22
+ from sentence_transformers import SentenceTransformer
23
  from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter
24
+ import pickle
25
+
26
+ # ==================== CONFIGURATION FOR HUGGING FACE SPACES ====================
27
+
28
+ # Persistent storage directory (Hugging Face Spaces uses /data/)
29
+ PERSISTENT_STORAGE = os.getenv("PERSISTENT_STORAGE", "/data")
30
+ VECTOR_STORE_DIR = os.path.join(PERSISTENT_STORAGE, "vector_stores")
31
+ TEMP_UPLOAD_DIR = os.path.join(PERSISTENT_STORAGE, "temp_uploads")
32
+
33
+ # Create directories if they don't exist
34
+ os.makedirs(VECTOR_STORE_DIR, exist_ok=True)
35
+ os.makedirs(TEMP_UPLOAD_DIR, exist_ok=True)
36
+
37
+ # Set HuggingFace cache to persistent storage
38
+ os.environ["HF_HOME"] = os.path.join(PERSISTENT_STORAGE, ".huggingface")
39
+
40
+ # ==================== LOGGING SETUP ====================
41
+
42
+ logging.basicConfig(
43
+ level=logging.INFO,
44
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
45
+ )
46
+ logger = logging.getLogger(__name__)
47
+
48
+ # ==================== FASTAPI APP ====================
49
+
50
+ app = FastAPI(title="RAG Assistant API", version="2.0")
51
+
52
+ # CORS middleware
53
+ app.add_middleware(
54
+ CORSMiddleware,
55
+ allow_origins=["*"],
56
+ allow_credentials=True,
57
+ allow_methods=["*"],
58
+ allow_headers=["*"],
59
+ )
60
+
61
+ # ==================== ENVIRONMENT VARIABLES ====================
62
+
63
+ groq_api_key = os.getenv("GROQ_API_KEY")
64
+ supabase_url = os.getenv("SUPABASE_URL")
65
+ supabase_key = os.getenv("SUPABASE_KEY")
66
+
67
+ # Initialize clients
68
+ supabase: Optional[Client] = None
69
+ groq_client = None
70
+
71
+ if supabase_url and supabase_key:
72
+ try:
73
+ supabase = create_client(supabase_url, supabase_key)
74
+ logger.info("Supabase client initialized successfully")
75
+ except Exception as e:
76
+ logger.error(f"Failed to initialize Supabase: {e}")
77
+
78
+ if groq_api_key:
79
+ try:
80
+ groq_client = Groq(api_key=groq_api_key)
81
+ logger.info("Groq client initialized successfully")
82
+ except Exception as e:
83
+ logger.error(f"Failed to initialize Groq: {e}")
84
 
85
+ # Initialize embedding model (cached in persistent storage)
86
+ logger.info("Loading embedding model...")
87
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
88
+ logger.info("Embedding model loaded successfully")
89
 
90
+ # ==================== PERSISTENT VECTOR STORE MANAGEMENT ====================
91
+
92
+ class VectorStoreManager:
93
+ """Manage FAISS vector stores with disk persistence"""
94
+
95
+ def __init__(self, base_dir: str):
96
+ self.base_dir = base_dir
97
+ self.stores = {}
98
+ self.load_all_stores()
99
+
100
+ def load_all_stores(self):
101
+ """Load all existing vector stores from disk on startup"""
102
+ try:
103
+ for collection_dir in Path(self.base_dir).iterdir():
104
+ if collection_dir.is_dir():
105
+ collection_name = collection_dir.name
106
+ try:
107
+ self.load_store(collection_name)
108
+ logger.info(f"Loaded collection '{collection_name}' from disk")
109
+ except Exception as e:
110
+ logger.error(f"Failed to load collection '{collection_name}': {e}")
111
+ except Exception as e:
112
+ logger.error(f"Error loading vector stores: {e}")
113
+
114
+ def load_store(self, collection_name: str):
115
+ """Load a specific vector store from disk"""
116
+ collection_dir = os.path.join(self.base_dir, collection_name)
117
+
118
+ if not os.path.exists(collection_dir):
119
+ raise FileNotFoundError(f"Collection '{collection_name}' not found")
120
+
121
+ # Load FAISS index
122
+ index_path = os.path.join(collection_dir, "index.faiss")
123
+ index = faiss.read_index(index_path)
124
+
125
+ # Load metadata
126
+ metadata_path = os.path.join(collection_dir, "metadata.pkl")
127
+ with open(metadata_path, 'rb') as f:
128
+ data = pickle.load(f)
129
+
130
+ self.stores[collection_name] = {
131
+ 'index': index,
132
+ 'chunks': data['chunks'],
133
+ 'metadata': data['metadata'],
134
+ 'dimension': index.d
135
+ }
136
+
137
+ def save_store(self, collection_name: str):
138
+ """Save a vector store to disk"""
139
+ collection_dir = os.path.join(self.base_dir, collection_name)
140
+ os.makedirs(collection_dir, exist_ok=True)
141
+
142
+ store_data = self.stores[collection_name]
143
+
144
+ # Save FAISS index
145
+ index_path = os.path.join(collection_dir, "index.faiss")
146
+ faiss.write_index(store_data['index'], index_path)
147
+
148
+ # Save metadata
149
+ metadata_path = os.path.join(collection_dir, "metadata.pkl")
150
+ with open(metadata_path, 'wb') as f:
151
+ pickle.dump({
152
+ 'chunks': store_data['chunks'],
153
+ 'metadata': store_data['metadata']
154
+ }, f)
155
+
156
+ logger.info(f"Saved collection '{collection_name}' to disk")
157
+
158
+ def create_or_update_store(self, collection_name: str, chunks: List[str], metadata: List[dict]):
159
+ """Create or update a vector store"""
160
+ # Generate embeddings
161
+ embeddings = embedding_model.encode(chunks, show_progress_bar=True)
162
+ embeddings = np.array(embeddings).astype('float32')
163
+
164
+ if collection_name in self.stores:
165
+ # Add to existing index
166
+ store_data = self.stores[collection_name]
167
+ store_data['index'].add(embeddings)
168
+ store_data['chunks'].extend(chunks)
169
+ store_data['metadata'].extend(metadata)
170
+ else:
171
+ # Create new index
172
+ dimension = embeddings.shape[1]
173
+ index = faiss.IndexFlatL2(dimension)
174
+ index.add(embeddings)
175
+
176
+ self.stores[collection_name] = {
177
+ 'index': index,
178
+ 'chunks': chunks.copy(),
179
+ 'metadata': metadata.copy(),
180
+ 'dimension': dimension
181
+ }
182
+
183
+ # Save to disk
184
+ self.save_store(collection_name)
185
+ return len(chunks)
186
+
187
+ def get_store(self, collection_name: str):
188
+ """Get a vector store"""
189
+ if collection_name not in self.stores:
190
+ # Try to load from disk
191
+ try:
192
+ self.load_store(collection_name)
193
+ except Exception:
194
+ return None
195
+ return self.stores.get(collection_name)
196
+
197
+ def delete_store(self, collection_name: str):
198
+ """Delete a vector store"""
199
+ if collection_name in self.stores:
200
+ del self.stores[collection_name]
201
+
202
+ # Delete from disk
203
+ collection_dir = os.path.join(self.base_dir, collection_name)
204
+ if os.path.exists(collection_dir):
205
+ import shutil
206
+ shutil.rmtree(collection_dir)
207
+
208
+ def list_stores(self):
209
+ """List all available stores"""
210
+ return [
211
+ {
212
+ 'collection_name': name,
213
+ 'total_chunks': len(data['chunks']),
214
+ 'dimension': data['dimension']
215
+ }
216
+ for name, data in self.stores.items()
217
+ ]
218
+
219
+ # Initialize vector store manager
220
+ vector_store_manager = VectorStoreManager(VECTOR_STORE_DIR)
221
+
222
+ # ==================== PYDANTIC MODELS ====================
223
+
224
+ class URL(BaseModel):
225
+ url: str
226
+
227
+ class RAGRequest(BaseModel):
228
+ file_path: str
229
+ prompt: str
230
 
231
  class DocumentUpload(BaseModel):
 
232
  file_id: str
233
  filename: str
234
  file_type: str
 
236
  storage_path: str
237
 
238
  class RAGQueryRequest(BaseModel):
 
239
  query: str
240
  collection_name: str
241
  top_k: Optional[int] = 3
242
 
243
  class VectorStoreInfo(BaseModel):
 
244
  collection_name: str
245
  total_chunks: int
246
  dimension: int
247
+
248
+ # ==================== EXISTING FUNCTIONALITY ====================
249
+
250
+ user_agents = [
251
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
252
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
253
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
254
+ ]
255
+
256
+ bucket_name = "url-2-ans-bucket"
257
+
258
+ def query(payload):
259
+ """Query Hugging Face embedding API"""
260
+ API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
261
+ headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN', '')}"}
262
 
263
+ response = requests.post(API_URL, headers=headers, json=payload)
264
+ if response.status_code == 200:
265
+ return response.json()
266
+ else:
267
+ logger.warning(f"HF API error: {response.status_code}, using local model")
268
+ return embedding_model.encode(payload["inputs"]).tolist()
269
+
270
+ def process_with_groq(query: str, context: str) -> str:
271
+ """Process query with Groq LLM"""
272
+ if not groq_client:
273
+ return "Groq API not configured. Please set GROQ_API_KEY environment variable."
274
+
275
+ try:
276
+ messages = [
277
+ {
278
+ "role": "system",
279
+ "content": "You are a helpful assistant. Answer questions based on the provided context. If you cannot find the answer in the context, say so."
280
+ },
281
+ {
282
+ "role": "user",
283
+ "content": f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"
284
+ }
285
+ ]
286
+
287
+ chat_completion = groq_client.chat.completions.create(
288
+ messages=messages,
289
+ model="llama-3.3-70b-versatile",
290
+ temperature=0.7,
291
+ max_tokens=1024,
292
+ )
293
+
294
+ return chat_completion.choices[0].message.content
295
+ except Exception as e:
296
+ logger.error(f"Groq API error: {e}")
297
+ return f"Error generating response: {str(e)}"
298
+
299
+ @app.get("/")
300
+ async def root():
301
+ return {
302
+ "message": "RAG Assistant API",
303
+ "version": "2.0",
304
+ "status": "running",
305
+ "storage": PERSISTENT_STORAGE
306
+ }
307
+
308
+ # ==================== NEW RAG ENDPOINTS ====================
309
 
310
  def chunk_document(text: str, file_type: str, chunk_size: int = 1000, chunk_overlap: int = 200):
311
  """Chunk document based on file type"""
312
+ if file_type in ["markdown", "md"]:
313
  splitter = MarkdownTextSplitter(
314
  chunk_size=chunk_size,
315
  chunk_overlap=chunk_overlap
 
325
  logger.info(f"Created {len(chunks)} chunks from document")
326
  return chunks
327
 
328
+ def extract_text_from_pdf(file_path: str) -> str:
329
+ """Extract text from PDF"""
330
  try:
331
+ pdf_doc = fitz.open(file_path)
 
332
  md_text = pymupdf4llm.to_markdown(pdf_doc)
333
  return md_text
334
  except Exception as e:
335
  logger.error(f"Error extracting PDF: {e}")
336
+ pdf_doc = fitz.open(file_path)
 
337
  text = ""
338
  for page in pdf_doc:
339
  text += page.get_text()
340
  return text
341
 
342
+ def extract_text_from_markdown(file_path: str) -> str:
343
  """Extract text from markdown file"""
344
+ with open(file_path, 'r', encoding='utf-8') as f:
345
+ return f.read()
 
346
 
347
  @app.post("/upload_document", response_model=DocumentUpload)
348
  async def upload_document(
349
  file: UploadFile = File(...),
350
  collection_name: Optional[str] = "default"
351
  ):
352
+ """Upload and process PDF or Markdown documents"""
 
353
  allowed_types = {
354
  "application/pdf": "pdf",
355
  "text/markdown": "markdown",
 
358
 
359
  if file.content_type not in allowed_types:
360
  raise HTTPException(
361
+ status_code=415,
362
  detail=f"Unsupported file type. Allowed: PDF, Markdown, TXT"
363
  )
364
 
365
  try:
366
+ # Save file temporarily to persistent storage
 
367
  file_type = allowed_types[file.content_type]
368
+ temp_file_path = os.path.join(TEMP_UPLOAD_DIR, f"{int(time.time())}_{file.filename}")
369
 
370
+ # Write uploaded file
371
+ with open(temp_file_path, "wb") as buffer:
372
+ content = await file.read()
373
+ buffer.write(content)
374
+
375
+ # Extract text
376
  if file_type == "pdf":
377
+ text_content = extract_text_from_pdf(temp_file_path)
 
 
378
  else:
379
+ text_content = extract_text_from_markdown(temp_file_path)
380
 
381
  if not text_content.strip():
382
+ raise HTTPException(status_code=400, detail="No text content extracted")
383
 
384
+ # Optional: Upload to Supabase
385
+ storage_filename = f"{int(time.time())}_{file.filename}"
386
+ if supabase:
387
+ try:
388
+ with open(temp_file_path, 'rb') as f:
389
+ supabase.storage.from_(bucket_name).upload(
390
+ path=storage_filename,
391
+ file=f.read(),
392
+ file_options={"content-type": file.content_type}
393
+ )
394
+ except Exception:
395
+ pass # Continue even if Supabase upload fails
396
 
397
+ # Chunk document
 
398
  chunks = chunk_document(text_content, file_type)
399
 
400
+ # Create metadata
401
+ file_id = str(int(time.time()))
402
  metadata = [
403
  {
404
  "file_id": file_id,
 
411
  ]
412
 
413
  # Add to vector store
414
+ chunks_created = vector_store_manager.create_or_update_store(
415
+ collection_name, chunks, metadata
416
+ )
417
+
418
+ # Clean up temp file
419
+ try:
420
+ os.remove(temp_file_path)
421
+ except Exception:
422
+ pass
423
 
424
  return DocumentUpload(
425
  file_id=file_id,
426
  filename=file.filename,
427
  file_type=file_type,
428
  chunks_created=chunks_created,
429
+ storage_path=f"supabase://{bucket_name}/{storage_filename}" if supabase else temp_file_path
430
  )
431
 
432
  except HTTPException:
433
  raise
434
  except Exception as e:
435
  logger.exception("Error in upload_document")
436
+ raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
437
 
438
  @app.post("/upload_multiple_documents")
439
  async def upload_multiple_documents(
440
  files: List[UploadFile] = File(...),
441
  collection_name: Optional[str] = "default"
442
  ):
443
+ """Upload multiple documents"""
444
  results = []
445
  errors = []
446
 
 
458
  "errors": errors
459
  }
460
 
 
461
  @app.post("/query_documents")
462
  async def query_documents(request: RAGQueryRequest):
463
+ """Query documents using RAG"""
464
+ store_data = vector_store_manager.get_store(request.collection_name)
465
+
466
+ if not store_data:
 
467
  raise HTTPException(
468
+ status_code=404,
469
+ detail=f"Collection '{request.collection_name}' not found"
470
  )
471
 
472
  try:
 
473
  # Generate query embedding
474
  query_embedding = embedding_model.encode([request.query])
475
+ query_embedding = np.array(query_embedding).astype('float32')
476
 
477
  # Search in FAISS
478
+ distances, indices = store_data['index'].search(
479
+ query_embedding,
480
+ min(request.top_k, len(store_data['chunks']))
481
+ )
 
482
 
483
+ # Check relevance threshold
484
+ if distances[0][0] > 1.5:
485
  return {
486
  "answer": "I couldn't find this information in the provided documents.",
487
  "sources": [],
 
489
  "collection": request.collection_name
490
  }
491
 
492
+ # Get relevant chunks
493
+ retrieved_chunks = [store_data['chunks'][i] for i in indices[0]]
494
+ retrieved_metadata = [store_data['metadata'][i] for i in indices[0]]
495
+
496
+ # Create context
497
  context_text = "\n\n".join([
498
+ f"[Source {i+1} - {meta['filename']}]:\n{chunk}"
499
  for i, (chunk, meta) in enumerate(zip(retrieved_chunks, retrieved_metadata))
500
  ])
501
 
502
+ # Generate answer
503
  answer = process_with_groq(request.query, context_text)
504
 
505
+ # Prepare sources
506
  sources = [
507
  {
508
  "filename": meta['filename'],
 
525
  logger.exception("Error in query_documents")
526
  raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
527
 
 
528
  @app.get("/list_collections")
529
  async def list_collections():
530
+ """List all collections"""
531
+ collections = vector_store_manager.list_stores()
 
532
  return {"collections": collections}
533
 
 
534
  @app.delete("/delete_collection/{collection_name}")
535
  async def delete_collection(collection_name: str):
536
+ """Delete a collection"""
537
+ try:
538
+ vector_store_manager.delete_store(collection_name)
539
+ return {"message": f"Collection '{collection_name}' deleted successfully"}
540
+ except Exception as e:
541
+ raise HTTPException(status_code=500, detail=str(e))
 
542
 
543
  @app.get("/health_check")
544
  async def health_check():
545
+ """System health check"""
546
  return {
547
  "status": "healthy",
548
  "supabase_configured": supabase is not None,
549
+ "groq_configured": groq_client is not None,
550
  "embedding_model": "all-MiniLM-L6-v2",
551
+ "vector_stores": len(vector_store_manager.stores),
552
+ "total_chunks": sum(len(store['chunks']) for store in vector_store_manager.stores.values()),
553
+ "persistent_storage": PERSISTENT_STORAGE,
554
+ "collections": list(vector_store_manager.stores.keys())
555
  }
556
+
557
+ # ==================== EXISTING WEB SCRAPING ENDPOINTS ====================
558
+
559
+ @app.post("/rag")
560
+ async def rag(request: RAGRequest):
561
+ """Existing RAG endpoint for URL-based content"""
562
+ if not supabase:
563
+ raise HTTPException(status_code=500, detail="Supabase not configured")
564
+
565
+ try:
566
+ file_path = request.file_path
567
+
568
+ # Download from Supabase
569
+ file_content = supabase.storage.from_(bucket_name).download(file_path)
570
+ text = file_content.decode('utf-8')
571
+ data = json.loads(text)
572
+
573
+ # Extract text
574
+ full_text = ""
575
+ for item in data:
576
+ full_text += item.get("text", "") + " "
577
+
578
+ # Chunk text
579
+ chunk_size = 1000
580
+ chunks = [full_text[i:i+chunk_size] for i in range(0, len(full_text), chunk_size)]
581
+
582
+ # Get embeddings
583
+ chunk_embeddings = []
584
+ for chunk in chunks:
585
+ embedding = query({"inputs": chunk})
586
+ chunk_embeddings.append(embedding)
587
+
588
+ query_embedding = query({"inputs": request.prompt})
589
+
590
+ # Calculate similarity
591
+ similarities = []
592
+ for chunk_embedding in chunk_embeddings:
593
+ query_np = np.array(query_embedding)
594
+ chunk_np = np.array(chunk_embedding)
595
+
596
+ if len(query_np.shape) == 1:
597
+ query_np = query_np.reshape(1, -1)
598
+ if len(chunk_np.shape) == 1:
599
+ chunk_np = chunk_np.reshape(1, -1)
600
+
601
+ similarity = cosine_similarity(query_np, chunk_np)[0][0]
602
+ similarities.append(similarity)
603
+
604
+ # Get top 3 chunks
605
+ top_k = 3
606
+ top_indices = np.argsort(similarities)[-top_k:][::-1]
607
+ relevant_chunks = [chunks[i] for i in top_indices]
608
+ context_text = "\n\n".join(relevant_chunks)
609
+
610
+ # Process with Groq
611
+ answer = process_with_groq(request.prompt, context_text)
612
+
613
+ sources = [{"text": chunks[i][:200] + "...", "position": i} for i in top_indices]
614
+
615
+ return {
616
+ "sources": sources,
617
+ "user_query": request.prompt,
618
+ "assistant_response": answer,
619
+ "file_source": f"supabase://{bucket_name}/{file_path}"
620
+ }
621
+
622
+ except Exception as e:
623
+ logger.exception("Error in RAG")
624
+ raise HTTPException(status_code=500, detail=str(e))
625
+
626
+ @app.post("/extract_links")
627
+ async def extract_links(url: URL):
628
+ """Extract links from URL"""
629
+ def extract_unique_links(url_string, max_retries=3, timeout=30):
630
+ for attempt in range(max_retries):
631
+ try:
632
+ headers = {'User-Agent': random.choice(user_agents)}
633
+ response = requests.get(url_string, headers=headers, timeout=timeout)
634
+ response.raise_for_status()
635
+ soup = BeautifulSoup(response.text, 'html.parser')
636
+
637
+ base_url = urlparse(url_string)
638
+ base_url = f"{base_url.scheme}://{base_url.netloc}"
639
+
640
+ links = [urljoin(base_url, a.get('href')) for a in soup.find_all('a', href=True)]
641
+ unique_links = list(dict.fromkeys(links))
642
+ unique_links.insert(0, url_string)
643
+ return unique_links
644
+ except Exception as e:
645
+ if attempt < max_retries - 1:
646
+ time.sleep(5 * (attempt + 1))
647
+ else:
648
+ raise HTTPException(status_code=500, detail=str(e))
649
+ return []
650
+
651
+ try:
652
+ unique_links = extract_unique_links(url.url)
653
+ return {"unique_links": unique_links}
654
+ except Exception as e:
655
+ raise HTTPException(status_code=500, detail=str(e))
656
+
657
+ @app.post("/extract_text")
658
+ async def extract_text(urls: List[str]):
659
+ """Extract text from URLs"""
660
+ if not supabase:
661
+ raise HTTPException(status_code=500, detail="Supabase not configured")
662
+
663
+ output_file = "extracted_text.txt"
664
+
665
+ def text_data_extractor(links):
666
+ extracted_texts = []
667
+ for link in links:
668
+ retries = 3
669
+ while retries > 0:
670
+ try:
671
+ headers = {'User-Agent': random.choice(user_agents)}
672
+ response = requests.get(link, headers=headers, timeout=30)
673
+ response.raise_for_status()
674
+ soup = BeautifulSoup(response.text, 'html.parser')
675
+ text = ' '.join(soup.get_text().split())
676
+ extracted_texts.append({"url": link, "text": text})
677
+ break
678
+ except Exception:
679
+ retries -= 1
680
+ if retries > 0:
681
+ time.sleep(5)
682
+
683
+ if retries == 0:
684
+ extracted_texts.append({"url": link, "text": "Failed to retrieve"})
685
+
686
+ return extracted_texts
687
+
688
+ try:
689
+ extracted_data = text_data_extractor(urls)
690
+ string_output = json.dumps(extracted_data, ensure_ascii=False, indent=2)
691
+
692
+ # Upload to Supabase
693
+ file_content = string_output.encode('utf-8')
694
+ try:
695
+ supabase.storage.from_(bucket_name).upload(
696
+ path=output_file,
697
+ file=file_content,
698
+ file_options={"content-type": "text/plain"}
699
+ )
700
+ except Exception:
701
+ supabase.storage.from_(bucket_name).update(
702
+ path=output_file,
703
+ file=file_content,
704
+ file_options={"content-type": "text/plain"}
705
+ )
706
+
707
+ return {"extracted_data": extracted_data, "file_saved": output_file}
708
+ except Exception as e:
709
+ raise HTTPException(status_code=500, detail=str(e))
710
+
711
+ # ==================== MAIN ====================
712
+
713
+ if __name__ == "__main__":
714
+ uvicorn.run(
715
+ "main_api:app",
716
+ host="0.0.0.0",
717
+ port=8000,
718
+ reload=False,
719
+ access_log=True
720
+ )
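
Below is a minimal client sketch for exercising the endpoints added in this commit. It is illustrative only: the base URL, the `sample.pdf` file, and the `demo` collection name are assumptions, not part of the commit.

```python
# Illustrative client sketch (assumed base URL, file name, and collection name).
import requests

BASE_URL = "http://localhost:8000"  # replace with the deployed Space URL

# 1) Upload a PDF into the "demo" collection.
#    collection_name is sent as a query parameter; the file goes in the multipart body.
with open("sample.pdf", "rb") as f:
    upload = requests.post(
        f"{BASE_URL}/upload_document",
        params={"collection_name": "demo"},
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
upload.raise_for_status()
print(upload.json())  # file_id, filename, file_type, chunks_created, storage_path

# 2) Query the same collection with a RAGQueryRequest body.
query = requests.post(
    f"{BASE_URL}/query_documents",
    json={"query": "What is this document about?", "collection_name": "demo", "top_k": 3},
)
query.raise_for_status()
print(query.json()["answer"])
```

If the upload succeeds, the collection should also appear in `GET /list_collections`, and it is expected to survive restarts since the FAISS index and metadata are persisted under `/data/vector_stores`.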