# ai-service/scripts/train_creative.py
import os
import sys
import json
import csv
# Path setup
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)
from core.rag.store import VectorStore
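
# The .txt branch below notes that large files could be split into chunks
# before embedding. A minimal sketch of such a helper (hypothetical and not
# called by default; the chunk_size/overlap values are illustrative, not tuned):
def chunk_text(text, chunk_size=1000, overlap=100):
    """Split text into overlapping character chunks for embedding."""
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        if end == len(text):
            break
        # Step back by `overlap` so context carries across chunk borders
        start = end - overlap
    return chunks
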
def ingest_creative_data():
print("\nπ Starting ADVANCED Creative Director Training...")
# 1. Initialize DB
store = VectorStore(collection_name="creative_mind")
data_folder = os.path.join(parent_dir, "data", "creative_training")
documents = []
metadatas = []
ids = []
if not os.path.exists(data_folder):
print(f"β Folder missing: {data_folder}")
return
    # 2. Iterate over files
    files_found = 0
    for filename in os.listdir(data_folder):
        file_path = os.path.join(data_folder, filename)

        try:
            # --- CASE A: TEXT FILES ---
            if filename.endswith(".txt"):
                print(f" 📄 Reading Text: {filename}")
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()

                if content.strip():
                    # Large text files could be split into chunks here
                    # (see the chunk_text sketch above); keeping it simple for now
                    documents.append(content)
                    metadatas.append({"source": filename, "type": "text_guide"})
                    ids.append(f"{filename}_full")
                    files_found += 1
            # --- CASE B: JSON FILES (GitHub Style) ---
            elif filename.endswith(".json"):
                print(f" ✨ Reading JSON: {filename}")
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)

                # If the file holds a list of objects (common in datasets)
                if isinstance(data, list):
                    for idx, item in enumerate(data):
                        # Convert each JSON object to a text string so it can be embedded
                        text_repr = f"Strategy: {item.get('title', 'Tip')}\nDetails: {item.get('content', item)}"
                        documents.append(text_repr)
                        metadatas.append({"source": filename, "type": "json_entry"})
                        ids.append(f"{filename}_{idx}")
                    files_found += 1
# --- CASE C: CSV FILES (Excel Style) ---
elif filename.endswith(".csv"):
print(f" π Reading CSV: {filename}")
with open(file_path, "r", encoding="utf-8") as f:
reader = csv.DictReader(f)
for idx, row in enumerate(reader):
# Dictionary ko readable string banao
# Example row: {'hook': 'Stop!', 'niche': 'Tech'}
# Result: "hook: Stop! \n niche: Tech"
text_repr = "\n".join([f"{k}: {v}" for k, v in row.items()])
documents.append(text_repr)
metadatas.append({"source": filename, "type": "csv_row"})
ids.append(f"{filename}_{idx}")
files_found += 1
        except Exception as e:
            print(f" ⚠️ Error processing {filename}: {e}")
    # 3. Save to Database
    if documents:
        print(f"\n 🧠 Embedding {len(documents)} data points into Vector DB...")

        # Process in batches in case the dataset is large
        batch_size = 50
        for i in range(0, len(documents), batch_size):
            end = min(i + batch_size, len(documents))
            print(f" - Batch {i} to {end}...")
            store.add_text(documents[i:end], metadatas[i:end], ids[i:end])

        print(f"✅ Training Complete! Scanned {files_found} files.")
    else:
        print("⚠️ No valid data found. Add .txt, .json, or .csv files.")
if __name__ == "__main__":
    ingest_creative_data()