import os
import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from supabase import create_client, Client
from dotenv import load_dotenv

# Load environment variables from a .env file
load_dotenv() 

# --- Database Connection ---
# Make sure these are in your .env file for the ai-service
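# Example .env (the values below are placeholders, not real credentials):
#   SUPABASE_URL=https://your-project.supabase.co
#   SUPABASE_SERVICE_KEY=your-service-role-key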
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY") # Use service key for scripts
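# Note: the service-role key bypasses Row Level Security, so run this script only
# in a trusted server-side environment and never bundle it into client code.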

if not SUPABASE_URL or not SUPABASE_SERVICE_KEY:
    raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in environment variables.")

supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)

def fetch_training_data():
    """Fetches completed campaign and payment data for training."""
    print("Fetching training data from Supabase...")
    
    # We use 'completed' status campaigns as the basis for training
    response = supabase.table("campaigns").select("id, budget, assigned_to").eq("status", "completed").execute()
    
    if not response.data:
        print("No completed campaigns found to train the model.")
        return None
        
    campaigns_df = pd.DataFrame(response.data)
    
    # Now, get the total actual payout for each campaign
    campaign_ids = campaigns_df['id'].tolist()
    
    # Ensure campaign_ids is not empty to avoid a Supabase error
    if not campaign_ids:
        return None

    payments_response = supabase.table("payments").select("campaign_id, amount").in_("campaign_id", campaign_ids).execute()
    payments_df = pd.DataFrame(payments_response.data)

    # Guard against an empty result; calling groupby on an empty DataFrame would raise a KeyError
    if payments_df.empty:
        print("No payments found for the completed campaigns.")
        return None

    # Group by campaign_id and sum the payouts
    payout_summary = payments_df.groupby('campaign_id')['amount'].sum().reset_index()
    payout_summary = payout_summary.rename(columns={'amount': 'actual_payout'})
    
    # Merge the payout data with the campaign data
    training_data = pd.merge(campaigns_df, payout_summary, left_on='id', right_on='campaign_id')
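    # Note: pd.merge defaults to an inner join, so completed campaigns with no
    # recorded payments are silently excluded from the training set.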
    
    print(f"Successfully fetched and merged data for {len(training_data)} campaigns.")
    return training_data

def train_model():
    """Trains and saves the payout forecasting model."""
    df = fetch_training_data()
    if df is None or df.empty:
        print("Model training skipped due to lack of data.")
        return

    # Our features (X) and target (y)
    features = ['budget'] 
    target = 'actual_payout'
    
    X = df[features]
    y = df[target]
    
    if len(df) < 5:
        print("Not enough data to split for testing. Training on all available data.")
        X_train, y_train = X, y
        X_test, y_test = X, y # Use the same for scoring, not ideal but necessary
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print("Training a Linear Regression model...")
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    score = model.score(X_test, y_test)
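    # Note: when there are fewer than 5 rows, X_test is the training data itself,
    # so this R^2 is an in-sample score and will be optimistic.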
    print(f"Model trained with R^2 score: {score:.2f}")
    
    # Save the trained model one directory up from `training`, in the sibling `models/` directory
    script_dir = os.path.dirname(__file__) # This is the /training directory
    models_dir = os.path.join(script_dir, '..', 'models')
    os.makedirs(models_dir, exist_ok=True) # Ensure the /models directory exists
    model_path = os.path.join(models_dir, 'payout_forecaster_v1.joblib')
    
    joblib.dump(model, model_path)
    
    print(f"Model successfully saved to: {model_path}")

if __name__ == "__main__":
    train_model()