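# File: reachify-ai-service/training/train_payout_forecaster.py
# Page residue from the repository file viewer (not part of the program):
# uploader avatar "amitbhatt6075", commit 0914e96 "Complete fresh start - FINAL UPLOAD",
# "raw" / "history blame" links, file size 3.57 kB.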
import os
import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from supabase import create_client, Client
from dotenv import load_dotenv
# Pull settings from a local .env file into the process environment.
load_dotenv()

# --- Database Connection ---
# Both values are expected in the ai-service .env file.
SUPABASE_URL = os.environ.get("SUPABASE_URL")
# Scripts authenticate with the service key.
SUPABASE_SERVICE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")

# Fail fast at import time when either setting is missing or empty.
if not (SUPABASE_URL and SUPABASE_SERVICE_KEY):
    raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in environment variables.")

# Module-level client shared by the functions in this script.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
def fetch_training_data():
    """Fetch completed campaigns and their summed payments for training.

    Queries the ``campaigns`` table for rows whose ``status`` equals
    ``"completed"``, then the ``payments`` table for rows whose
    ``campaign_id`` is one of those campaigns. Payment ``amount`` values are
    summed per campaign and inner-joined back onto the campaign rows.

    Returns:
        A DataFrame holding the selected campaign columns (``id``, ``budget``,
        ``assigned_to``) plus ``campaign_id`` and ``actual_payout``, with one
        row per completed campaign that has at least one payment. ``None``
        when there are no completed campaigns or no payments for them.
    """
    print("Fetching training data from Supabase...")

    # Completed campaigns are the basis for training.
    # NOTE(review): if the API caps rows per request, large tables may be
    # silently truncated here and below -- confirm whether pagination is needed.
    response = (
        supabase.table("campaigns")
        .select("id, budget, assigned_to")
        .eq("status", "completed")
        .execute()
    )
    if not response.data:
        print("No completed campaigns found to train the model.")
        return None

    campaigns_df = pd.DataFrame(response.data)
    # Non-empty because response.data was non-empty above.
    campaign_ids = campaigns_df['id'].tolist()

    payments_response = (
        supabase.table("payments")
        .select("campaign_id, amount")
        .in_("campaign_id", campaign_ids)
        .execute()
    )
    # An empty result would build a DataFrame without a 'campaign_id' column,
    # and the groupby below would raise KeyError; report "no data" instead.
    if not payments_response.data:
        print("No payments found for the completed campaigns.")
        return None

    payments_df = pd.DataFrame(payments_response.data)
    # Make sure the sum is arithmetic even if amounts arrive as strings;
    # values that cannot be parsed become NaN, which sum() skips.
    payments_df['amount'] = pd.to_numeric(payments_df['amount'], errors='coerce')

    # Total actual payout per campaign.
    payout_summary = (
        payments_df.groupby('campaign_id')['amount']
        .sum()
        .reset_index()
        .rename(columns={'amount': 'actual_payout'})
    )

    # Inner join: campaigns without any payment are left out of the result.
    training_data = pd.merge(
        campaigns_df, payout_summary, left_on='id', right_on='campaign_id'
    )

    print(f"Successfully fetched and merged data for {len(training_data)} campaigns.")
    return training_data
def train_model():
    """Train a linear payout forecaster and save it with joblib.

    Fits ``LinearRegression`` with ``budget`` as the only feature and
    ``actual_payout`` as the target, using the data returned by
    ``fetch_training_data()``. The fitted model is written to
    ``payout_forecaster_v1.joblib`` in a ``models`` directory that sits next
    to this script's directory (created if missing).

    Training is skipped, with a printed message, when no usable rows are
    available. Returns ``None`` in every case.
    """
    df = fetch_training_data()
    if df is None or df.empty:
        print("Model training skipped due to lack of data.")
        return

    # Our features (X) and target (y)
    features = ['budget']
    target = 'actual_payout'

    # LinearRegression rejects NaN input, so coerce the columns to numbers
    # and drop rows where the feature or the target is missing/unparseable.
    columns = features + [target]
    df = df.copy()
    df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')
    df = df.dropna(subset=columns)
    if df.empty:
        print("Model training skipped due to lack of data.")
        return

    X = df[features]
    y = df[target]

    if len(df) < 5:
        print("Not enough data to split for testing. Training on all available data.")
        X_train, y_train = X, y
        X_test, y_test = X, y  # Scored on the training data; not ideal but necessary
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

    print("Training a Linear Regression model...")
    model = LinearRegression()
    model.fit(X_train, y_train)

    # R^2 is not defined for fewer than two evaluation samples (it would be
    # reported as nan), so only score when the number is meaningful.
    if len(X_test) >= 2:
        score = model.score(X_test, y_test)
        print(f"Model trained with R^2 score: {score:.2f}")
    else:
        print("Model trained; R^2 score is undefined for fewer than two evaluation samples.")

    # Resolve ../models relative to this script, independent of the current
    # working directory, and normalise away the '..' segment.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    models_dir = os.path.normpath(os.path.join(script_dir, '..', 'models'))
    os.makedirs(models_dir, exist_ok=True)  # Ensure the models directory exists

    model_path = os.path.join(models_dir, 'payout_forecaster_v1.joblib')
    joblib.dump(model, model_path)
    print(f"Model successfully saved to: {model_path}")
if __name__ == "__main__":
    # Run the training pipeline only when executed as a script, not on import.
    train_model()