"""Train and save the payout forecasting model from completed campaigns.

The script reads completed campaigns and their payments from Supabase, fits a
linear regression that predicts the total payout from the campaign budget, and
stores the fitted model as a joblib file in the sibling ``models`` directory.
"""

import os
from typing import Optional

import joblib
import pandas as pd
from dotenv import load_dotenv
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from supabase import Client, create_client

# Load environment variables from a .env file
load_dotenv()

# --- Database Connection ---
# Make sure these are in your .env file for the ai-service
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")  # Use service key for scripts

if not SUPABASE_URL or not SUPABASE_SERVICE_KEY:
    raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in environment variables.")

supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)

# --- Training Configuration ---
FEATURES = ['budget']
TARGET = 'actual_payout'
TEST_SIZE = 0.2
# train_test_split holds out ceil(TEST_SIZE * n) rows. With 5 rows that is a
# single test row, and R^2 is undefined for fewer than two samples, so a
# separate test set is only carved out from 6 rows upwards.
MIN_ROWS_FOR_SPLIT = 6


def fetch_training_data() -> Optional[pd.DataFrame]:
    """Fetch completed campaigns joined with their total actual payout.

    Returns:
        A DataFrame with the campaign columns (``id``, ``budget``,
        ``assigned_to``) plus ``campaign_id`` and ``actual_payout`` (the sum
        of all payment amounts for the campaign). Campaigns without any
        payment are left out because the merge is an inner join. Returns
        ``None`` when there are no completed campaigns or no payments for
        them.
    """
    print("Fetching training data from Supabase...")

    # We use 'completed' status campaigns as the basis for training
    # NOTE(review): the API may cap the number of rows returned per request;
    # confirm whether pagination is needed for large tables.
    response = (
        supabase.table("campaigns")
        .select("id, budget, assigned_to")
        .eq("status", "completed")
        .execute()
    )
    if not response.data:
        print("No completed campaigns found to train the model.")
        return None

    campaigns_df = pd.DataFrame(response.data)
    campaign_ids = campaigns_df['id'].tolist()

    # Now, get the total actual payout for each campaign
    payments_response = (
        supabase.table("payments")
        .select("campaign_id, amount")
        .in_("campaign_id", campaign_ids)
        .execute()
    )
    if not payments_response.data:
        # An empty result would produce a DataFrame without columns, and the
        # groupby below would fail with a KeyError on 'campaign_id'.
        print("No payments found for the completed campaigns.")
        return None

    payments_df = pd.DataFrame(payments_response.data)

    # Group by campaign_id and sum the payouts
    payout_summary = (
        payments_df.groupby('campaign_id')['amount']
        .sum()
        .reset_index()
        .rename(columns={'amount': 'actual_payout'})
    )

    # Merge the payout data with the campaign data
    training_data = pd.merge(campaigns_df, payout_summary, left_on='id', right_on='campaign_id')

    print(f"Successfully fetched and merged data for {len(training_data)} campaigns.")
    return training_data


def train_model() -> None:
    """Train the payout forecasting model and save it to disk.

    Fits a ``LinearRegression`` on the features in ``FEATURES`` to predict
    ``TARGET``, prints the R^2 score, and writes the model to
    ``../models/payout_forecaster_v1.joblib`` relative to this script.
    Training is skipped (with a message) when no usable data is available.
    """
    df = fetch_training_data()

    if df is None or df.empty:
        print("Model training skipped due to lack of data.")
        return

    # Rows with a missing feature or target would make the regression fit
    # raise on NaN input, so they are removed up front.
    usable = df.dropna(subset=FEATURES + [TARGET])
    dropped = len(df) - len(usable)
    if dropped:
        print(f"Dropped {dropped} campaigns with missing budget or payout values.")
    if usable.empty:
        print("Model training skipped due to lack of data.")
        return

    # Our features (X) and target (y)
    X = usable[FEATURES]
    y = usable[TARGET]

    if len(usable) < MIN_ROWS_FOR_SPLIT:
        print("Not enough data to split for testing. Training on all available data.")
        X_train, y_train = X, y
        X_test, y_test = X, y  # Use the same for scoring, not ideal but necessary
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=42
        )

    print("Training a Linear Regression model...")
    model = LinearRegression()
    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)
    print(f"Model trained with R^2 score: {score:.2f}")

    # Path up one directory from this script's directory to `models`
    script_dir = os.path.dirname(__file__)  # This is the /training directory
    models_dir = os.path.join(script_dir, '..', 'models')
    os.makedirs(models_dir, exist_ok=True)  # Ensure the /models directory exists

    model_path = os.path.join(models_dir, 'payout_forecaster_v1.joblib')
    joblib.dump(model, model_path)
    print(f"Model successfully saved to: {model_path}")


if __name__ == "__main__":
    train_model()