# ai-service/scripts/train_creative.py
"""Ingest creative training files (.txt, .json, .csv) from
data/creative_training into the "creative_mind" vector collection."""

import os
import sys
import json
import csv

# Path setup: make the ai-service root importable regardless of the cwd
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from core.rag.store import VectorStore
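
# The .txt branch below currently stores each file as one document. If the
# guides grow large, a splitter along these lines could be used instead.
# This is a hypothetical helper sketched for illustration only: chunk_size
# and overlap are arbitrary choices, and nothing calls it yet.
def chunk_text(text, chunk_size=1000, overlap=100):
    """Split text into overlapping character chunks for embedding."""
    chunks = []
    step = chunk_size - overlap
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk.strip():
            chunks.append(chunk)
    return chunks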
def ingest_creative_data():
    print("\n🚀 Starting ADVANCED Creative Director Training...")

    # 1. Initialize the vector store
    store = VectorStore(collection_name="creative_mind")
    data_folder = os.path.join(parent_dir, "data", "creative_training")

    documents = []
    metadatas = []
    ids = []

    if not os.path.exists(data_folder):
        print(f"❌ Folder missing: {data_folder}")
        return
    # 2. Iterate over files
    files_found = 0
    for filename in os.listdir(data_folder):
        file_path = os.path.join(data_folder, filename)
        try:
            # --- CASE A: TEXT FILES ---
            if filename.endswith(".txt"):
                print(f"  📄 Reading Text: {filename}")
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                if content.strip():
                    # Large files could be split with chunk_text (above);
                    # for now each file is stored as a single document.
                    documents.append(content)
                    metadatas.append({"source": filename, "type": "text_guide"})
                    ids.append(f"{filename}_full")
                    files_found += 1
            # --- CASE B: JSON FILES (GitHub style) ---
            elif filename.endswith(".json"):
                print(f"  ✨ Reading JSON: {filename}")
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                # If the file is a list of objects (common in datasets)
                if isinstance(data, list):
                    for idx, item in enumerate(data):
                        # Convert each entry to a text string so it can be
                        # embedded; non-dict entries are stored verbatim.
                        if isinstance(item, dict):
                            text_repr = (
                                f"Strategy: {item.get('title', 'Tip')}\n"
                                f"Details: {item.get('content', item)}"
                            )
                        else:
                            text_repr = str(item)
                        documents.append(text_repr)
                        metadatas.append({"source": filename, "type": "json_entry"})
                        ids.append(f"{filename}_{idx}")
                    files_found += 1
            # --- CASE C: CSV FILES (Excel style) ---
            elif filename.endswith(".csv"):
                print(f"  📊 Reading CSV: {filename}")
                # newline="" is the documented way to open files for csv
                with open(file_path, "r", encoding="utf-8", newline="") as f:
                    reader = csv.DictReader(f)
                    for idx, row in enumerate(reader):
                        # Turn each row dict into a readable string.
                        # Example row: {'hook': 'Stop!', 'niche': 'Tech'}
                        # Result: "hook: Stop!\nniche: Tech"
                        text_repr = "\n".join(f"{k}: {v}" for k, v in row.items())
                        documents.append(text_repr)
                        metadatas.append({"source": filename, "type": "csv_row"})
                        ids.append(f"{filename}_{idx}")
                files_found += 1
        except Exception as e:
            print(f"  ⚠️ Error processing {filename}: {e}")
    # 3. Save to the database
    if documents:
        print(f"\n  🧠 Embedding {len(documents)} data points into the vector DB...")
        # Process in batches in case the dataset is large
        batch_size = 50
        for i in range(0, len(documents), batch_size):
            end = min(i + batch_size, len(documents))
            print(f"    - Batch {i} to {end}...")
            store.add_text(documents[i:end], metadatas[i:end], ids[i:end])
        print(f"✅ Training Complete! Scanned {files_found} files.")
    else:
        print("⚠️ No valid data found. Add .txt, .json, or .csv files.")


if __name__ == "__main__":
    ingest_creative_data()
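
# Example usage, assuming the repository layout implied by the paths above:
#
#   ai-service/
#     core/rag/store.py          <- provides VectorStore
#     data/creative_training/    <- drop .txt / .json / .csv files here
#     scripts/train_creative.py  <- this script
#
#   $ python scripts/train_creative.py
#
# Everything found is embedded into the "creative_mind" collection via
# VectorStore.add_text(documents, metadatas, ids), as called above.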