|
|
import json |
|
|
import chromadb |
|
|
import firebase_admin |
|
|
from firebase_admin import credentials, firestore |
|
|
from encoder import SentenceEncoder |
|
|
|
|
|
def initialize_firebase_with_file(key_path: str = "serviceAccountKey.json"):
    """Initialize Firebase from a service-account key file.

    Args:
        key_path: Path to the service-account JSON key. Defaults to
            ``serviceAccountKey.json`` in the current working directory.

    Returns:
        The client returned by ``firestore.client()``, or ``None`` when
        initialization fails for any reason. The error is printed rather
        than raised so that callers can simply check for ``None``.
    """
    try:
        cred = credentials.Certificate(key_path)
        # Only register the default app once; ``_apps`` is firebase_admin's
        # (private) registry of already-initialized apps.
        if not firebase_admin._apps:
            firebase_admin.initialize_app(cred)
        db = firestore.client()
    except Exception as e:
        # Top-level boundary of a CLI script: report the failure and let the
        # caller decide what to do with the ``None`` result.
        print(f"❌ Could not initialize Firebase from file. Error: {e}")
        print(f" - Make sure '{key_path}' has been uploaded to the terminal.")
        return None
    else:
        print("✅ Firebase connection initialized from file.")
        return db
|
|
|
|
|
def _embedding_text(internship: dict) -> str:
    """Build the sentence that is embedded for one internship record."""
    # Missing or null fields degrade to empty text instead of raising.
    skills = internship.get('skills') or []
    title = internship.get('title', '')
    description = internship.get('description', '')
    return f"{title}. {description}. Skills: {', '.join(map(str, skills))}"


def _chroma_metadata(internship: dict) -> dict:
    """Return a copy of the record with ``skills`` serialized as a JSON string."""
    # Work on a copy so the record fetched from Firestore is not mutated.
    metadata = dict(internship)
    metadata['skills'] = json.dumps(internship.get('skills') or [])
    return metadata


def populate_vector_db(chroma_path: str = "/data/chroma_db") -> None:
    """Read internships from Firestore, embed them, and (re)populate ChromaDB.

    The existing contents of the ``internships`` Chroma collection are
    cleared only after the replacement data has been read and embedded, so
    an empty Firestore collection or a failure while embedding leaves the
    previous index untouched.

    Args:
        chroma_path: Directory of the persistent ChromaDB store.

    Returns:
        None. Progress and errors are reported via ``print``; the function
        returns early if Firebase cannot be initialized or Firestore holds
        no internships.
    """
    db = initialize_firebase_with_file()
    if db is None:
        return

    encoder = SentenceEncoder()
    chroma_client = chromadb.PersistentClient(path=chroma_path)
    collection = chroma_client.get_or_create_collection(name="internships")

    print("📖 Reading internship data from Firestore...")
    internships = []
    for doc in db.collection('internships').stream():
        record = doc.to_dict()
        # to_dict() holds only the document's fields; fall back to the
        # Firestore document ID when no explicit 'id' field is stored.
        record.setdefault('id', doc.id)
        internships.append(record)

    if not internships:
        print("❌ No internship data found in Firestore.")
        return

    print(f"🧠 Generating embeddings for {len(internships)} internships...")
    texts = [_embedding_text(record) for record in internships]
    embeddings = encoder.encode(texts, show_progress_bar=True).tolist()
    # Chroma IDs are strings; a numeric 'id' field is converted here.
    ids = [str(record['id']) for record in internships]
    # NOTE(review): only 'skills' is flattened; other non-scalar fields in a
    # record would presumably be rejected by Chroma — confirm the schema.
    metadatas = [_chroma_metadata(record) for record in internships]

    # Replace the old contents only now that the new data is ready.
    existing_count = collection.count()
    if existing_count > 0:
        print(f"ℹ️ Clearing {existing_count} existing items from ChromaDB.")
        collection.delete(ids=collection.get()['ids'])

    print("➕ Adding data to ChromaDB...")
    collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas)
    print(f"✅ Successfully populated ChromaDB with {collection.count()} items.")
|
|
|
|
|
# Script entry point: rebuild the vector index when run directly.
if __name__ == "__main__":
    populate_vector_db()