# ai-service/scripts/train_creative.py
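"""Ingest creative training data into the vector store.

Reads .txt, .json, and .csv files from data/creative_training, converts each
file (or each JSON entry / CSV row) into a text document with metadata, and
embeds everything into the "creative_mind" collection in batches.
"""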

import os
import sys
import json
import csv

# Path setup
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from core.rag.store import VectorStore

def ingest_creative_data():
    print("\nπŸš€ Starting ADVANCED Creative Director Training...")
    
    # 1. Initialize DB
    store = VectorStore(collection_name="creative_mind")
    data_folder = os.path.join(parent_dir, "data", "creative_training")
    
    documents = []
    metadatas = []
    ids = []
    
    if not os.path.exists(data_folder):
        print(f"❌ Folder missing: {data_folder}")
        return

    # 2. Iterate over files
    files_found = 0
    for filename in os.listdir(data_folder):
        file_path = os.path.join(data_folder, filename)
        
        try:
            # --- CASE A: TEXT FILES ---
            if filename.endswith(".txt"):
                print(f"   πŸ“„ Reading Text: {filename}")
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                    if content.strip():
                        # Large text files could be split into chunks;
                        # keeping it simple with one document per file for now.
                        documents.append(content)
                        metadatas.append({"source": filename, "type": "text_guide"})
                        ids.append(f"{filename}_full")
                        files_found += 1

            # --- CASE B: JSON FILES (GitHub Style) ---
            elif filename.endswith(".json"):
                print(f"   ✨ Reading JSON: {filename}")
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                    # If the file contains a list of objects (common for datasets)
                    if isinstance(data, list):
                        for idx, item in enumerate(data):
                            # Convert each entry to a text string so it can be embedded;
                            # fall back to str() for entries that are not dicts.
                            if isinstance(item, dict):
                                text_repr = f"Strategy: {item.get('title', 'Tip')}\nDetails: {item.get('content', item)}"
                            else:
                                text_repr = str(item)
                            documents.append(text_repr)
                            metadatas.append({"source": filename, "type": "json_entry"})
                            ids.append(f"{filename}_{idx}")
                    files_found += 1

            # --- CASE C: CSV FILES (Excel Style) ---
            elif filename.endswith(".csv"):
                print(f"   πŸ“Š Reading CSV: {filename}")
                with open(file_path, "r", encoding="utf-8") as f:
                    reader = csv.DictReader(f)
                    for idx, row in enumerate(reader):
                        # Turn the row dict into a readable string.
                        # Example row: {'hook': 'Stop!', 'niche': 'Tech'}
                        # Result: "hook: Stop!\nniche: Tech"
                        text_repr = "\n".join([f"{k}: {v}" for k, v in row.items()])
                        documents.append(text_repr)
                        metadatas.append({"source": filename, "type": "csv_row"})
                        ids.append(f"{filename}_{idx}")
                    files_found += 1

        except Exception as e:
            print(f"   ⚠️ Error processing {filename}: {e}")

    # 3. Save to Database
    if documents:
        print(f"\n   🧠 Embedding {len(documents)} data points into Vector DB...")
        # Insert in batches in case the dataset is large
        batch_size = 50
        for i in range(0, len(documents), batch_size):
            end = min(i + batch_size, len(documents))
            print(f"      - Batch {i} to {end}...")
            store.add_text(documents[i:end], metadatas[i:end], ids[i:end])
            
        print(f"βœ… Training Complete! Scanned {files_found} files.")
    else:
        print("⚠️ No valid data found. Add .txt, .json, or .csv files.")

if __name__ == "__main__":
    ingest_creative_data()
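
# Note: VectorStore from core.rag.store is assumed to expose roughly the
# interface used above (inferred from these calls, not from its actual definition):
#
#   store = VectorStore(collection_name="creative_mind")
#   store.add_text(documents, metadatas, ids)  # three parallel lists of equal length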