# SIH-ML-Backend-Resume-scanner / populate_chroma.py
# Pipalskill's picture
# Upload 9 files
# 7a10db2 verified
import json
import chromadb
import firebase_admin
from firebase_admin import credentials, firestore
from encoder import SentenceEncoder
def initialize_firebase_with_file():
    """Connect to Firestore using the local ``serviceAccountKey.json`` file.

    Returns:
        The object returned by ``firestore.client()`` on success, or ``None``
        when any step raises; in that case the error and a hint about the
        key file are printed instead of propagating the exception.
    """
    try:
        # Load the service-account credentials from the working directory.
        service_account = credentials.Certificate("serviceAccountKey.json")
        # Register the app only when firebase_admin has none recorded yet
        # (checked through its private ``_apps`` attribute).
        app_already_registered = bool(firebase_admin._apps)
        if not app_already_registered:
            firebase_admin.initialize_app(service_account)
        firestore_client = firestore.client()
        print("βœ… Firebase connection initialized from file.")
    except Exception as err:
        print(f"❌ Could not initialize Firebase from file. Error: {err}")
        print(" - Make sure 'serviceAccountKey.json' has been uploaded to the terminal.")
        return None
    else:
        return firestore_client
def _internship_text(record):
    """Build the sentence that is embedded for one internship record.

    Missing 'title' / 'description' become empty strings and a missing
    'skills' field becomes an empty list, so one incomplete document does
    not abort the whole run with a KeyError.
    """
    skills = record.get('skills')
    if skills is None:
        skills = []
    title = record.get('title', '')
    description = record.get('description', '')
    return f"{title}. {description}. Skills: {', '.join(map(str, skills))}"


def _internship_metadata(record):
    """Return a copy of ``record`` whose 'skills' value is a JSON string.

    A copy is returned so the fetched record itself is left unmodified.
    """
    skills = record.get('skills')
    if skills is None:
        skills = []
    return {**record, 'skills': json.dumps(skills)}


def populate_vector_db():
    """
    Reads internships from Firestore, generates embeddings, and populates ChromaDB.

    Steps, in order:
      1. Connect to Firestore via ``initialize_firebase_with_file``; return
         immediately if that yields ``None``.
      2. Open the persistent Chroma store at ``/data/chroma_db`` and the
         ``internships`` collection, deleting every item it currently holds.
      3. Read all documents of the Firestore ``internships`` collection;
         print a message and return if there are none.
      4. Embed one text per document (title, description, skills) with
         ``SentenceEncoder`` and add ids, embeddings and metadata to Chroma.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    db = initialize_firebase_with_file()
    if db is None:
        return

    # 1. Initialize other clients
    encoder = SentenceEncoder()
    chroma_client = chromadb.PersistentClient(path="/data/chroma_db")
    collection = chroma_client.get_or_create_collection(name="internships")

    # 2. Clear existing data
    existing_count = collection.count()
    if existing_count > 0:
        print(f"ℹ️ Clearing {existing_count} existing items from ChromaDB.")
        # include=[] asks Chroma for the ids only, instead of also loading
        # every stored document and metadata dict just to delete them.
        collection.delete(ids=collection.get(include=[])['ids'])

    # 3. Fetch data from Firestore
    print("πŸ“š Reading internship data from Firestore...")
    internships = []
    for doc in db.collection('internships').stream():
        record = doc.to_dict() or {}
        # to_dict() carries only the document's fields; when no 'id' field
        # was stored, fall back to the Firestore document id.
        if record.get('id') is None:
            record['id'] = doc.id
        internships.append(record)
    if not internships:
        print("❌ No internship data found in Firestore.")
        return

    # 4. Generate embeddings
    print(f"🧠 Generating embeddings for {len(internships)} internships...")
    texts = [_internship_text(record) for record in internships]
    embeddings = encoder.encode(texts, show_progress_bar=True).tolist()
    # Chroma requires string ids; str() is a no-op for ids that already are.
    ids = [str(record['id']) for record in internships]
    # NOTE(review): the skills list is stored as a JSON string because Chroma
    # metadata values must be scalars; any other non-scalar field in a
    # document would still be rejected - confirm against the Firestore schema.
    metadatas = [_internship_metadata(record) for record in internships]

    # 5. Add to ChromaDB
    print("βž• Adding data to ChromaDB...")
    collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas)
    print(f"βœ… Successfully populated ChromaDB with {collection.count()} items.")
# Script entry point: rebuild the vector store only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    populate_vector_db()