MohamedFahim committed
Commit e3ad748 · verified · 1 Parent(s): 2db5397

Update main_api.py

Files changed (1)
  1. main_api.py +531 -140
main_api.py CHANGED
@@ -1,21 +1,234 @@
1
- import pymupdf4llm
2
  import fitz # PyMuPDF
 
3
  import faiss
4
- from sentence_transformers import SentenceTransformer
5
  from typing import List, Optional
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter
7
- import markdown
8
- import uuid
9
- from pathlib import Path
10
 
11
- # Initialize embedding model (add at top with other initializations)
 
12
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
13
 
14
- # FAISS index storage (in-memory, you can persist to disk)
15
- vector_stores = {} # Store multiple FAISS indexes by collection name
16
 
17
  class DocumentUpload(BaseModel):
18
- """Model for document upload response"""
19
  file_id: str
20
  filename: str
21
  file_type: str
@@ -23,22 +236,80 @@ class DocumentUpload(BaseModel):
23
  storage_path: str
24
 
25
  class RAGQueryRequest(BaseModel):
26
- """Model for RAG query with collection specification"""
27
  query: str
28
  collection_name: str
29
  top_k: Optional[int] = 3
30
 
31
  class VectorStoreInfo(BaseModel):
32
- """Information about vector store collection"""
33
  collection_name: str
34
  total_chunks: int
35
  dimension: int
36
 
37
- # Utility Functions
38
 
39
  def chunk_document(text: str, file_type: str, chunk_size: int = 1000, chunk_overlap: int = 200):
40
  """Chunk document based on file type"""
41
- if file_type == "markdown" or file_type == "md":
42
  splitter = MarkdownTextSplitter(
43
  chunk_size=chunk_size,
44
  chunk_overlap=chunk_overlap
@@ -54,68 +325,31 @@ def chunk_document(text: str, file_type: str, chunk_size: int = 1000, chunk_over
54
  logger.info(f"Created {len(chunks)} chunks from document")
55
  return chunks
56
 
57
- def create_or_update_vector_store(collection_name: str, chunks: List[str], metadata: List[dict]):
58
- """Create or update FAISS vector store with new chunks"""
59
- # Generate embeddings
60
- embeddings = embedding_model.encode(chunks, show_progress_bar=True)
61
-
62
- if collection_name in vector_stores:
63
- # Add to existing index
64
- index_data = vector_stores[collection_name]
65
- index_data['index'].add(embeddings)
66
- index_data['chunks'].extend(chunks)
67
- index_data['metadata'].extend(metadata)
68
- else:
69
- # Create new index
70
- dimension = embeddings.shape[1]
71
- index = faiss.IndexFlatL2(dimension)
72
- index.add(embeddings)
73
-
74
- vector_stores[collection_name] = {
75
- 'index': index,
76
- 'chunks': chunks,
77
- 'metadata': metadata,
78
- 'dimension': dimension
79
- }
80
-
81
- logger.info(f"Vector store '{collection_name}' now has {len(vector_stores[collection_name]['chunks'])} chunks")
82
- return len(chunks)
83
-
84
- def extract_text_from_pdf(file_bytes: bytes) -> str:
85
- """Extract text from PDF using PyMuPDF with markdown formatting"""
86
  try:
87
- # Use pymupdf4llm for better markdown extraction
88
- pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
89
  md_text = pymupdf4llm.to_markdown(pdf_doc)
90
  return md_text
91
  except Exception as e:
92
  logger.error(f"Error extracting PDF: {e}")
93
- # Fallback to basic extraction
94
- pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
95
  text = ""
96
  for page in pdf_doc:
97
  text += page.get_text()
98
  return text
99
 
100
- def extract_text_from_markdown(file_bytes: bytes) -> str:
101
  """Extract text from markdown file"""
102
- return file_bytes.decode('utf-8')
103
-
104
- # New Endpoints
105
 
106
  @app.post("/upload_document", response_model=DocumentUpload)
107
  async def upload_document(
108
  file: UploadFile = File(...),
109
  collection_name: Optional[str] = "default"
110
  ):
111
- """
112
- Upload and process PDF or Markdown documents for RAG.
113
- Creates chunks and stores in FAISS vector database.
114
- """
115
- if not supabase:
116
- raise HTTPException(status_code=500, detail="Supabase not configured")
117
-
118
- # Validate file type
119
  allowed_types = {
120
  "application/pdf": "pdf",
121
  "text/markdown": "markdown",
@@ -124,49 +358,47 @@ async def upload_document(
124
 
125
  if file.content_type not in allowed_types:
126
  raise HTTPException(
127
- status_code=415,
128
  detail=f"Unsupported file type. Allowed: PDF, Markdown, TXT"
129
  )
130
 
131
  try:
132
- # Read file content
133
- file_bytes = await file.read()
134
  file_type = allowed_types[file.content_type]
 
135
 
136
- # Extract text based on file type
 
 
 
 
 
137
  if file_type == "pdf":
138
- text_content = extract_text_from_pdf(file_bytes)
139
- elif file_type in ["markdown", "txt"]:
140
- text_content = extract_text_from_markdown(file_bytes)
141
  else:
142
- raise HTTPException(status_code=400, detail="Unsupported file type")
143
 
144
  if not text_content.strip():
145
- raise HTTPException(status_code=400, detail="No text content extracted from file")
146
 
147
- # Generate unique file ID
148
- file_id = str(uuid.uuid4())
149
- storage_filename = f"{file_id}_{file.filename}"
150
 
151
- # Upload original file to Supabase
152
- try:
153
- supabase.storage.from_("url-2-ans-bucket").upload(
154
- path=storage_filename,
155
- file=file_bytes,
156
- file_options={"content-type": file.content_type}
157
- )
158
- except Exception:
159
- # Try update if file exists
160
- supabase.storage.from_("url-2-ans-bucket").update(
161
- path=storage_filename,
162
- file=file_bytes,
163
- file_options={"content-type": file.content_type}
164
- )
165
-
166
- # Chunk the document
167
  chunks = chunk_document(text_content, file_type)
168
 
169
- # Create metadata for each chunk
 
170
  metadata = [
171
  {
172
  "file_id": file_id,
@@ -179,31 +411,36 @@ async def upload_document(
179
  ]
180
 
181
  # Add to vector store
182
- chunks_created = create_or_update_vector_store(collection_name, chunks, metadata)
183
 
184
  return DocumentUpload(
185
  file_id=file_id,
186
  filename=file.filename,
187
  file_type=file_type,
188
  chunks_created=chunks_created,
189
- storage_path=f"supabase://url-2-ans-bucket/{storage_filename}"
190
  )
191
 
192
  except HTTPException:
193
  raise
194
  except Exception as e:
195
  logger.exception("Error in upload_document")
196
- raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
197
- finally:
198
- await file.close()
199
-
200
 
201
  @app.post("/upload_multiple_documents")
202
  async def upload_multiple_documents(
203
  files: List[UploadFile] = File(...),
204
  collection_name: Optional[str] = "default"
205
  ):
206
- """Upload multiple documents at once"""
207
  results = []
208
  errors = []
209
 
@@ -221,38 +458,30 @@ async def upload_multiple_documents(
221
  "errors": errors
222
  }
223
 
224
-
225
  @app.post("/query_documents")
226
  async def query_documents(request: RAGQueryRequest):
227
- """
228
- Query documents using RAG with FAISS vector search.
229
- Returns answer with source citations.
230
- """
231
- if request.collection_name not in vector_stores:
232
  raise HTTPException(
233
- status_code=404,
234
- detail=f"Collection '{request.collection_name}' not found. Upload documents first."
235
  )
236
 
237
  try:
238
- # Get vector store data
239
- store_data = vector_stores[request.collection_name]
240
- index = store_data['index']
241
- chunks = store_data['chunks']
242
- metadata = store_data['metadata']
243
-
244
  # Generate query embedding
245
  query_embedding = embedding_model.encode([request.query])
 
246
 
247
  # Search in FAISS
248
- distances, indices = index.search(query_embedding, min(request.top_k, len(chunks)))
249
-
250
- # Get top-k chunks
251
- retrieved_chunks = [chunks[i] for i in indices[0]]
252
- retrieved_metadata = [metadata[i] for i in indices[0]]
253
 
254
- # Check if results are relevant (threshold-based)
255
- if distances[0][0] > 1.5: # Adjust threshold as needed
256
  return {
257
  "answer": "I couldn't find this information in the provided documents.",
258
  "sources": [],
@@ -260,16 +489,20 @@ async def query_documents(request: RAGQueryRequest):
260
  "collection": request.collection_name
261
  }
262
 
263
- # Create context from retrieved chunks
264
  context_text = "\n\n".join([
265
- f"[Source {i+1} - {meta['filename']}]:\n{chunk}"
266
  for i, (chunk, meta) in enumerate(zip(retrieved_chunks, retrieved_metadata))
267
  ])
268
 
269
- # Generate answer using Groq
270
  answer = process_with_groq(request.query, context_text)
271
 
272
- # Prepare source citations
273
  sources = [
274
  {
275
  "filename": meta['filename'],
@@ -292,38 +525,196 @@ async def query_documents(request: RAGQueryRequest):
292
  logger.exception("Error in query_documents")
293
  raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
294
 
295
-
296
  @app.get("/list_collections")
297
  async def list_collections():
298
- """List all available vector store collections"""
299
- collections = []
300
- for name, data in vector_stores.items():
301
- collections.append(VectorStoreInfo(
302
- collection_name=name,
303
- total_chunks=len(data['chunks']),
304
- dimension=data['dimension']
305
- ))
306
  return {"collections": collections}
307
 
308
-
309
  @app.delete("/delete_collection/{collection_name}")
310
  async def delete_collection(collection_name: str):
311
- """Delete a vector store collection"""
312
- if collection_name not in vector_stores:
313
- raise HTTPException(status_code=404, detail="Collection not found")
314
-
315
- del vector_stores[collection_name]
316
- return {"message": f"Collection '{collection_name}' deleted successfully"}
317
-
318
 
319
  @app.get("/health_check")
320
  async def health_check():
321
- """Check system health and configuration"""
322
  return {
323
  "status": "healthy",
324
  "supabase_configured": supabase is not None,
325
- "groq_configured": groq_api_key is not None,
326
  "embedding_model": "all-MiniLM-L6-v2",
327
- "vector_stores": len(vector_stores),
328
- "total_chunks": sum(len(store['chunks']) for store in vector_stores.values())
329
  }
 
1
+ import os
2
+ import logging
3
+ import time
4
+ import random
5
+ import json
6
+ import numpy as np
7
+ import uvicorn
8
  import fitz # PyMuPDF
9
+ import pymupdf4llm
10
  import faiss
11
+ from pathlib import Path
12
  from typing import List, Optional
13
+ from urllib.parse import urlparse, urljoin
14
+ from fastapi import FastAPI, HTTPException, File, UploadFile
15
+ from fastapi.middleware.cors import CORSMiddleware
16
+ from pydantic import BaseModel
17
+ from bs4 import BeautifulSoup
18
+ import requests
19
+ from sklearn.metrics.pairwise import cosine_similarity
20
+ from supabase import create_client, Client
21
+ from groq import Groq
22
+ from sentence_transformers import SentenceTransformer
23
  from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter
24
+ import pickle
25
+
26
+ # ==================== CONFIGURATION FOR HUGGING FACE SPACES ====================
27
+
28
+ # Persistent storage directory (Hugging Face Spaces uses /data/)
29
+ PERSISTENT_STORAGE = os.getenv("PERSISTENT_STORAGE", "/data")
30
+ VECTOR_STORE_DIR = os.path.join(PERSISTENT_STORAGE, "vector_stores")
31
+ TEMP_UPLOAD_DIR = os.path.join(PERSISTENT_STORAGE, "temp_uploads")
32
+
33
+ # Create directories if they don't exist
34
+ os.makedirs(VECTOR_STORE_DIR, exist_ok=True)
35
+ os.makedirs(TEMP_UPLOAD_DIR, exist_ok=True)
36
+
37
+ # Set HuggingFace cache to persistent storage
38
+ os.environ["HF_HOME"] = os.path.join(PERSISTENT_STORAGE, ".huggingface")
39
+
40
+ # ==================== LOGGING SETUP ====================
41
+
42
+ logging.basicConfig(
43
+ level=logging.INFO,
44
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
45
+ )
46
+ logger = logging.getLogger(__name__)
47
+
48
+ # ==================== FASTAPI APP ====================
49
+
50
+ app = FastAPI(title="RAG Assistant API", version="2.0")
51
+
52
+ # CORS middleware
53
+ app.add_middleware(
54
+ CORSMiddleware,
55
+ allow_origins=["*"],
56
+ allow_credentials=True,
57
+ allow_methods=["*"],
58
+ allow_headers=["*"],
59
+ )
60
+
61
+ # ==================== ENVIRONMENT VARIABLES ====================
62
+
63
+ groq_api_key = os.getenv("GROQ_API_KEY")
64
+ supabase_url = os.getenv("SUPABASE_URL")
65
+ supabase_key = os.getenv("SUPABASE_KEY")
66
+
67
+ # Initialize clients
68
+ supabase: Optional[Client] = None
69
+ groq_client = None
70
+
71
+ if supabase_url and supabase_key:
72
+ try:
73
+ supabase = create_client(supabase_url, supabase_key)
74
+ logger.info("Supabase client initialized successfully")
75
+ except Exception as e:
76
+ logger.error(f"Failed to initialize Supabase: {e}")
77
+
78
+ if groq_api_key:
79
+ try:
80
+ groq_client = Groq(api_key=groq_api_key)
81
+ logger.info("Groq client initialized successfully")
82
+ except Exception as e:
83
+ logger.error(f"Failed to initialize Groq: {e}")
84
 
85
+ # Initialize embedding model (cached in persistent storage)
86
+ logger.info("Loading embedding model...")
87
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
88
+ logger.info("Embedding model loaded successfully")
89
 
90
+ # ==================== PERSISTENT VECTOR STORE MANAGEMENT ====================
91
+
92
+ class VectorStoreManager:
93
+ """Manage FAISS vector stores with disk persistence"""
94
+
95
+ def __init__(self, base_dir: str):
96
+ self.base_dir = base_dir
97
+ self.stores = {}
98
+ self.load_all_stores()
99
+
100
+ def load_all_stores(self):
101
+ """Load all existing vector stores from disk on startup"""
102
+ try:
103
+ for collection_dir in Path(self.base_dir).iterdir():
104
+ if collection_dir.is_dir():
105
+ collection_name = collection_dir.name
106
+ try:
107
+ self.load_store(collection_name)
108
+ logger.info(f"Loaded collection '{collection_name}' from disk")
109
+ except Exception as e:
110
+ logger.error(f"Failed to load collection '{collection_name}': {e}")
111
+ except Exception as e:
112
+ logger.error(f"Error loading vector stores: {e}")
113
+
114
+ def load_store(self, collection_name: str):
115
+ """Load a specific vector store from disk"""
116
+ collection_dir = os.path.join(self.base_dir, collection_name)
117
+
118
+ if not os.path.exists(collection_dir):
119
+ raise FileNotFoundError(f"Collection '{collection_name}' not found")
120
+
121
+ # Load FAISS index
122
+ index_path = os.path.join(collection_dir, "index.faiss")
123
+ index = faiss.read_index(index_path)
124
+
125
+ # Load metadata
126
+ metadata_path = os.path.join(collection_dir, "metadata.pkl")
127
+ with open(metadata_path, 'rb') as f:
128
+ data = pickle.load(f)
129
+
130
+ self.stores[collection_name] = {
131
+ 'index': index,
132
+ 'chunks': data['chunks'],
133
+ 'metadata': data['metadata'],
134
+ 'dimension': index.d
135
+ }
136
+
137
+ def save_store(self, collection_name: str):
138
+ """Save a vector store to disk"""
139
+ collection_dir = os.path.join(self.base_dir, collection_name)
140
+ os.makedirs(collection_dir, exist_ok=True)
141
+
142
+ store_data = self.stores[collection_name]
143
+
144
+ # Save FAISS index
145
+ index_path = os.path.join(collection_dir, "index.faiss")
146
+ faiss.write_index(store_data['index'], index_path)
147
+
148
+ # Save metadata
149
+ metadata_path = os.path.join(collection_dir, "metadata.pkl")
150
+ with open(metadata_path, 'wb') as f:
151
+ pickle.dump({
152
+ 'chunks': store_data['chunks'],
153
+ 'metadata': store_data['metadata']
154
+ }, f)
155
+
156
+ logger.info(f"Saved collection '{collection_name}' to disk")
157
+
158
+ def create_or_update_store(self, collection_name: str, chunks: List[str], metadata: List[dict]):
159
+ """Create or update a vector store"""
160
+ # Generate embeddings
161
+ embeddings = embedding_model.encode(chunks, show_progress_bar=True)
162
+ embeddings = np.array(embeddings).astype('float32')
163
+
164
+ if collection_name in self.stores:
165
+ # Add to existing index
166
+ store_data = self.stores[collection_name]
167
+ store_data['index'].add(embeddings)
168
+ store_data['chunks'].extend(chunks)
169
+ store_data['metadata'].extend(metadata)
170
+ else:
171
+ # Create new index
172
+ dimension = embeddings.shape[1]
173
+ index = faiss.IndexFlatL2(dimension)
174
+ index.add(embeddings)
175
+
176
+ self.stores[collection_name] = {
177
+ 'index': index,
178
+ 'chunks': chunks.copy(),
179
+ 'metadata': metadata.copy(),
180
+ 'dimension': dimension
181
+ }
182
+
183
+ # Save to disk
184
+ self.save_store(collection_name)
185
+ return len(chunks)
186
+
187
+ def get_store(self, collection_name: str):
188
+ """Get a vector store"""
189
+ if collection_name not in self.stores:
190
+ # Try to load from disk
191
+ try:
192
+ self.load_store(collection_name)
193
+ except Exception:
194
+ return None
195
+ return self.stores.get(collection_name)
196
+
197
+ def delete_store(self, collection_name: str):
198
+ """Delete a vector store"""
199
+ if collection_name in self.stores:
200
+ del self.stores[collection_name]
201
+
202
+ # Delete from disk
203
+ collection_dir = os.path.join(self.base_dir, collection_name)
204
+ if os.path.exists(collection_dir):
205
+ import shutil
206
+ shutil.rmtree(collection_dir)
207
+
208
+ def list_stores(self):
209
+ """List all available stores"""
210
+ return [
211
+ {
212
+ 'collection_name': name,
213
+ 'total_chunks': len(data['chunks']),
214
+ 'dimension': data['dimension']
215
+ }
216
+ for name, data in self.stores.items()
217
+ ]
218
+
219
+ # Initialize vector store manager
220
+ vector_store_manager = VectorStoreManager(VECTOR_STORE_DIR)
221
+
222
+ # ==================== PYDANTIC MODELS ====================
223
+
224
+ class URL(BaseModel):
225
+ url: str
226
+
227
+ class RAGRequest(BaseModel):
228
+ file_path: str
229
+ prompt: str
230
 
231
  class DocumentUpload(BaseModel):
 
232
  file_id: str
233
  filename: str
234
  file_type: str
 
236
  storage_path: str
237
 
238
  class RAGQueryRequest(BaseModel):
 
239
  query: str
240
  collection_name: str
241
  top_k: Optional[int] = 3
242
 
243
  class VectorStoreInfo(BaseModel):
 
244
  collection_name: str
245
  total_chunks: int
246
  dimension: int
247
+
248
+ # ==================== EXISTING FUNCTIONALITY ====================
249
+
250
+ user_agents = [
251
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
252
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
253
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
254
+ ]
255
+
256
+ bucket_name = "url-2-ans-bucket"
257
+
258
+ def query(payload):
259
+ """Query Hugging Face embedding API"""
260
+ API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
261
+ headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN', '')}"}
262
 
263
+ response = requests.post(API_URL, headers=headers, json=payload)
264
+ if response.status_code == 200:
265
+ return response.json()
266
+ else:
267
+ logger.warning(f"HF API error: {response.status_code}, using local model")
268
+ return embedding_model.encode(payload["inputs"]).tolist()
269
+
270
+ def process_with_groq(query: str, context: str) -> str:
271
+ """Process query with Groq LLM"""
272
+ if not groq_client:
273
+ return "Groq API not configured. Please set GROQ_API_KEY environment variable."
274
+
275
+ try:
276
+ messages = [
277
+ {
278
+ "role": "system",
279
+ "content": "You are a helpful assistant. Answer questions based on the provided context. If you cannot find the answer in the context, say so."
280
+ },
281
+ {
282
+ "role": "user",
283
+ "content": f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"
284
+ }
285
+ ]
286
+
287
+ chat_completion = groq_client.chat.completions.create(
288
+ messages=messages,
289
+ model="llama-3.3-70b-versatile",
290
+ temperature=0.7,
291
+ max_tokens=1024,
292
+ )
293
+
294
+ return chat_completion.choices[0].message.content
295
+ except Exception as e:
296
+ logger.error(f"Groq API error: {e}")
297
+ return f"Error generating response: {str(e)}"
298
+
299
+ @app.get("/")
300
+ async def root():
301
+ return {
302
+ "message": "RAG Assistant API",
303
+ "version": "2.0",
304
+ "status": "running",
305
+ "storage": PERSISTENT_STORAGE
306
+ }
307
+
308
+ # ==================== NEW RAG ENDPOINTS ====================
309
 
310
  def chunk_document(text: str, file_type: str, chunk_size: int = 1000, chunk_overlap: int = 200):
311
  """Chunk document based on file type"""
312
+ if file_type in ["markdown", "md"]:
313
  splitter = MarkdownTextSplitter(
314
  chunk_size=chunk_size,
315
  chunk_overlap=chunk_overlap
 
325
  logger.info(f"Created {len(chunks)} chunks from document")
326
  return chunks
327
 
328
+ def extract_text_from_pdf(file_path: str) -> str:
329
+ """Extract text from PDF"""
330
  try:
331
+ pdf_doc = fitz.open(file_path)
 
332
  md_text = pymupdf4llm.to_markdown(pdf_doc)
333
  return md_text
334
  except Exception as e:
335
  logger.error(f"Error extracting PDF: {e}")
336
+ pdf_doc = fitz.open(file_path)
 
337
  text = ""
338
  for page in pdf_doc:
339
  text += page.get_text()
340
  return text
341
 
342
+ def extract_text_from_markdown(file_path: str) -> str:
343
  """Extract text from markdown file"""
344
+ with open(file_path, 'r', encoding='utf-8') as f:
345
+ return f.read()
 
346
 
347
  @app.post("/upload_document", response_model=DocumentUpload)
348
  async def upload_document(
349
  file: UploadFile = File(...),
350
  collection_name: Optional[str] = "default"
351
  ):
352
+ """Upload and process PDF or Markdown documents"""
 
353
  allowed_types = {
354
  "application/pdf": "pdf",
355
  "text/markdown": "markdown",
 
358
 
359
  if file.content_type not in allowed_types:
360
  raise HTTPException(
361
+ status_code=415,
362
  detail=f"Unsupported file type. Allowed: PDF, Markdown, TXT"
363
  )
364
 
365
  try:
366
+ # Save file temporarily to persistent storage
 
367
  file_type = allowed_types[file.content_type]
368
+ temp_file_path = os.path.join(TEMP_UPLOAD_DIR, f"{int(time.time())}_{file.filename}")
369
 
370
+ # Write uploaded file
371
+ with open(temp_file_path, "wb") as buffer:
372
+ content = await file.read()
373
+ buffer.write(content)
374
+
375
+ # Extract text
376
  if file_type == "pdf":
377
+ text_content = extract_text_from_pdf(temp_file_path)
 
 
378
  else:
379
+ text_content = extract_text_from_markdown(temp_file_path)
380
 
381
  if not text_content.strip():
382
+ raise HTTPException(status_code=400, detail="No text content extracted")
383
 
384
+ # Optional: Upload to Supabase
385
+ storage_filename = f"{int(time.time())}_{file.filename}"
386
+ if supabase:
387
+ try:
388
+ with open(temp_file_path, 'rb') as f:
389
+ supabase.storage.from_(bucket_name).upload(
390
+ path=storage_filename,
391
+ file=f.read(),
392
+ file_options={"content-type": file.content_type}
393
+ )
394
+ except Exception:
395
+ pass # Continue even if Supabase upload fails
396
 
397
+ # Chunk document
 
398
  chunks = chunk_document(text_content, file_type)
399
 
400
+ # Create metadata
401
+ file_id = str(int(time.time()))
402
  metadata = [
403
  {
404
  "file_id": file_id,
 
411
  ]
412
 
413
  # Add to vector store
414
+ chunks_created = vector_store_manager.create_or_update_store(
415
+ collection_name, chunks, metadata
416
+ )
417
+
418
+ # Clean up temp file
419
+ try:
420
+ os.remove(temp_file_path)
421
+ except Exception:
422
+ pass
423
 
424
  return DocumentUpload(
425
  file_id=file_id,
426
  filename=file.filename,
427
  file_type=file_type,
428
  chunks_created=chunks_created,
429
+ storage_path=f"supabase://{bucket_name}/{storage_filename}" if supabase else temp_file_path
430
  )
431
 
432
  except HTTPException:
433
  raise
434
  except Exception as e:
435
  logger.exception("Error in upload_document")
436
+ raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
437
 
438
  @app.post("/upload_multiple_documents")
439
  async def upload_multiple_documents(
440
  files: List[UploadFile] = File(...),
441
  collection_name: Optional[str] = "default"
442
  ):
443
+ """Upload multiple documents"""
444
  results = []
445
  errors = []
446
 
 
458
  "errors": errors
459
  }
460
 
 
461
  @app.post("/query_documents")
462
  async def query_documents(request: RAGQueryRequest):
463
+ """Query documents using RAG"""
464
+ store_data = vector_store_manager.get_store(request.collection_name)
465
+
466
+ if not store_data:
 
467
  raise HTTPException(
468
+ status_code=404,
469
+ detail=f"Collection '{request.collection_name}' not found"
470
  )
471
 
472
  try:
 
473
  # Generate query embedding
474
  query_embedding = embedding_model.encode([request.query])
475
+ query_embedding = np.array(query_embedding).astype('float32')
476
 
477
  # Search in FAISS
478
+ distances, indices = store_data['index'].search(
479
+ query_embedding,
480
+ min(request.top_k, len(store_data['chunks']))
481
+ )
 
482
 
483
+ # Check relevance threshold
484
+ if distances[0][0] > 1.5:
485
  return {
486
  "answer": "I couldn't find this information in the provided documents.",
487
  "sources": [],
 
489
  "collection": request.collection_name
490
  }
491
 
492
+ # Get relevant chunks
493
+ retrieved_chunks = [store_data['chunks'][i] for i in indices[0]]
494
+ retrieved_metadata = [store_data['metadata'][i] for i in indices[0]]
495
+
496
+ # Create context
497
  context_text = "\n\n".join([
498
+ f"[Source {i+1} - {meta['filename']}]:\n{chunk}"
499
  for i, (chunk, meta) in enumerate(zip(retrieved_chunks, retrieved_metadata))
500
  ])
501
 
502
+ # Generate answer
503
  answer = process_with_groq(request.query, context_text)
504
 
505
+ # Prepare sources
506
  sources = [
507
  {
508
  "filename": meta['filename'],
 
525
  logger.exception("Error in query_documents")
526
  raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
527
 
 
528
  @app.get("/list_collections")
529
  async def list_collections():
530
+ """List all collections"""
531
+ collections = vector_store_manager.list_stores()
 
532
  return {"collections": collections}
533
 
 
534
  @app.delete("/delete_collection/{collection_name}")
535
  async def delete_collection(collection_name: str):
536
+ """Delete a collection"""
537
+ try:
538
+ vector_store_manager.delete_store(collection_name)
539
+ return {"message": f"Collection '{collection_name}' deleted successfully"}
540
+ except Exception as e:
541
+ raise HTTPException(status_code=500, detail=str(e))
 
542
 
543
  @app.get("/health_check")
544
  async def health_check():
545
+ """System health check"""
546
  return {
547
  "status": "healthy",
548
  "supabase_configured": supabase is not None,
549
+ "groq_configured": groq_client is not None,
550
  "embedding_model": "all-MiniLM-L6-v2",
551
+ "vector_stores": len(vector_store_manager.stores),
552
+ "total_chunks": sum(len(store['chunks']) for store in vector_store_manager.stores.values()),
553
+ "persistent_storage": PERSISTENT_STORAGE,
554
+ "collections": list(vector_store_manager.stores.keys())
555
  }
556
+
557
+ # ==================== EXISTING WEB SCRAPING ENDPOINTS ====================
558
+
559
+ @app.post("/rag")
560
+ async def rag(request: RAGRequest):
561
+ """Existing RAG endpoint for URL-based content"""
562
+ if not supabase:
563
+ raise HTTPException(status_code=500, detail="Supabase not configured")
564
+
565
+ try:
566
+ file_path = request.file_path
567
+
568
+ # Download from Supabase
569
+ file_content = supabase.storage.from_(bucket_name).download(file_path)
570
+ text = file_content.decode('utf-8')
571
+ data = json.loads(text)
572
+
573
+ # Extract text
574
+ full_text = ""
575
+ for item in data:
576
+ full_text += item.get("text", "") + " "
577
+
578
+ # Chunk text
579
+ chunk_size = 1000
580
+ chunks = [full_text[i:i+chunk_size] for i in range(0, len(full_text), chunk_size)]
581
+
582
+ # Get embeddings
583
+ chunk_embeddings = []
584
+ for chunk in chunks:
585
+ embedding = query({"inputs": chunk})
586
+ chunk_embeddings.append(embedding)
587
+
588
+ query_embedding = query({"inputs": request.prompt})
589
+
590
+ # Calculate similarity
591
+ similarities = []
592
+ for chunk_embedding in chunk_embeddings:
593
+ query_np = np.array(query_embedding)
594
+ chunk_np = np.array(chunk_embedding)
595
+
596
+ if len(query_np.shape) == 1:
597
+ query_np = query_np.reshape(1, -1)
598
+ if len(chunk_np.shape) == 1:
599
+ chunk_np = chunk_np.reshape(1, -1)
600
+
601
+ similarity = cosine_similarity(query_np, chunk_np)[0][0]
602
+ similarities.append(similarity)
603
+
604
+ # Get top 3 chunks
605
+ top_k = 3
606
+ top_indices = np.argsort(similarities)[-top_k:][::-1]
607
+ relevant_chunks = [chunks[i] for i in top_indices]
608
+ context_text = "\n\n".join(relevant_chunks)
609
+
610
+ # Process with Groq
611
+ answer = process_with_groq(request.prompt, context_text)
612
+
613
+ sources = [{"text": chunks[i][:200] + "...", "position": i} for i in top_indices]
614
+
615
+ return {
616
+ "sources": sources,
617
+ "user_query": request.prompt,
618
+ "assistant_response": answer,
619
+ "file_source": f"supabase://{bucket_name}/{file_path}"
620
+ }
621
+
622
+ except Exception as e:
623
+ logger.exception("Error in RAG")
624
+ raise HTTPException(status_code=500, detail=str(e))
625
+
626
+ @app.post("/extract_links")
627
+ async def extract_links(url: URL):
628
+ """Extract links from URL"""
629
+ def extract_unique_links(url_string, max_retries=3, timeout=30):
630
+ for attempt in range(max_retries):
631
+ try:
632
+ headers = {'User-Agent': random.choice(user_agents)}
633
+ response = requests.get(url_string, headers=headers, timeout=timeout)
634
+ response.raise_for_status()
635
+ soup = BeautifulSoup(response.text, 'html.parser')
636
+
637
+ base_url = urlparse(url_string)
638
+ base_url = f"{base_url.scheme}://{base_url.netloc}"
639
+
640
+ links = [urljoin(base_url, a.get('href')) for a in soup.find_all('a', href=True)]
641
+ unique_links = list(dict.fromkeys(links))
642
+ unique_links.insert(0, url_string)
643
+ return unique_links
644
+ except Exception as e:
645
+ if attempt < max_retries - 1:
646
+ time.sleep(5 * (attempt + 1))
647
+ else:
648
+ raise HTTPException(status_code=500, detail=str(e))
649
+ return []
650
+
651
+ try:
652
+ unique_links = extract_unique_links(url.url)
653
+ return {"unique_links": unique_links}
654
+ except Exception as e:
655
+ raise HTTPException(status_code=500, detail=str(e))
656
+
657
+ @app.post("/extract_text")
658
+ async def extract_text(urls: List[str]):
659
+ """Extract text from URLs"""
660
+ if not supabase:
661
+ raise HTTPException(status_code=500, detail="Supabase not configured")
662
+
663
+ output_file = "extracted_text.txt"
664
+
665
+ def text_data_extractor(links):
666
+ extracted_texts = []
667
+ for link in links:
668
+ retries = 3
669
+ while retries > 0:
670
+ try:
671
+ headers = {'User-Agent': random.choice(user_agents)}
672
+ response = requests.get(link, headers=headers, timeout=30)
673
+ response.raise_for_status()
674
+ soup = BeautifulSoup(response.text, 'html.parser')
675
+ text = ' '.join(soup.get_text().split())
676
+ extracted_texts.append({"url": link, "text": text})
677
+ break
678
+ except Exception:
679
+ retries -= 1
680
+ if retries > 0:
681
+ time.sleep(5)
682
+
683
+ if retries == 0:
684
+ extracted_texts.append({"url": link, "text": "Failed to retrieve"})
685
+
686
+ return extracted_texts
687
+
688
+ try:
689
+ extracted_data = text_data_extractor(urls)
690
+ string_output = json.dumps(extracted_data, ensure_ascii=False, indent=2)
691
+
692
+ # Upload to Supabase
693
+ file_content = string_output.encode('utf-8')
694
+ try:
695
+ supabase.storage.from_(bucket_name).upload(
696
+ path=output_file,
697
+ file=file_content,
698
+ file_options={"content-type": "text/plain"}
699
+ )
700
+ except Exception:
701
+ supabase.storage.from_(bucket_name).update(
702
+ path=output_file,
703
+ file=file_content,
704
+ file_options={"content-type": "text/plain"}
705
+ )
706
+
707
+ return {"extracted_data": extracted_data, "file_saved": output_file}
708
+ except Exception as e:
709
+ raise HTTPException(status_code=500, detail=str(e))
710
+
711
+ # ==================== MAIN ====================
712
+
713
+ if __name__ == "__main__":
714
+ uvicorn.run(
715
+ "main_api:app",
716
+ host="0.0.0.0",
717
+ port=8000,
718
+ reload=False,
719
+ access_log=True
720
+ )
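
Below is a minimal client sketch for exercising the endpoints added in this commit. It is illustrative only: the base URL, the `sample.pdf` file, and the `demo` collection name are assumptions, not part of the commit.

```python
# Illustrative client sketch (assumed base URL, file name, and collection name).
import requests

BASE_URL = "http://localhost:8000"  # replace with the deployed Space URL

# 1) Upload a PDF into the "demo" collection.
#    collection_name is sent as a query parameter; the file goes in the multipart body.
with open("sample.pdf", "rb") as f:
    upload = requests.post(
        f"{BASE_URL}/upload_document",
        params={"collection_name": "demo"},
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
upload.raise_for_status()
print(upload.json())  # file_id, filename, file_type, chunks_created, storage_path

# 2) Query the same collection with a RAGQueryRequest body.
query = requests.post(
    f"{BASE_URL}/query_documents",
    json={"query": "What is this document about?", "collection_name": "demo", "top_k": 3},
)
query.raise_for_status()
print(query.json()["answer"])
```

If the upload succeeds, the collection should also appear in `GET /list_collections`, and it is expected to survive restarts since the FAISS index and metadata are persisted under `/data/vector_stores`.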