|
|
import os |
|
|
import joblib |
|
|
import pandas as pd |
|
|
from sklearn.linear_model import LinearRegression |
|
|
from sklearn.model_selection import train_test_split |
|
|
from supabase import create_client, Client |
|
|
from dotenv import load_dotenv |
|
|
|
|
|
|
|
|
# Load variables from a .env file (when present) into the process environment
# before the settings below are read.
load_dotenv()

# Supabase connection settings, taken from the environment.
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")

# Fail fast at import time when either setting is missing or empty.
if not (SUPABASE_URL and SUPABASE_SERVICE_KEY):
    raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in environment variables.")

# Module-level client shared by the data-fetching code in this file.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
|
|
|
|
|
def fetch_training_data():
    """Fetch completed campaigns and their total payouts for training.

    Queries the ``campaigns`` table for rows whose ``status`` is
    ``"completed"``, then the ``payments`` table for those campaign ids,
    sums the payment amounts per campaign and joins the totals onto the
    campaign rows.

    Returns:
        A DataFrame with the selected campaign columns (``id``, ``budget``,
        ``assigned_to``) plus ``campaign_id`` and ``actual_payout``. Only
        campaigns with at least one payment are included (inner join).
        ``None`` when there are no completed campaigns or no payments for
        them.
    """
    print("Fetching training data from Supabase...")

    response = (
        supabase.table("campaigns")
        .select("id, budget, assigned_to")
        .eq("status", "completed")
        .execute()
    )
    if not response.data:
        print("No completed campaigns found to train the model.")
        return None

    campaigns_df = pd.DataFrame(response.data)
    campaign_ids = campaigns_df['id'].tolist()

    payments_response = (
        supabase.table("payments")
        .select("campaign_id, amount")
        .in_("campaign_id", campaign_ids)
        .execute()
    )
    # An empty result would produce a DataFrame without a 'campaign_id'
    # column, and the groupby below would raise KeyError.
    if not payments_response.data:
        print("No payments found for completed campaigns.")
        return None

    payments_df = pd.DataFrame(payments_response.data)

    # Total paid out per campaign becomes the regression target.
    payout_summary = (
        payments_df.groupby('campaign_id')['amount']
        .sum()
        .reset_index()
        .rename(columns={'amount': 'actual_payout'})
    )

    # Default inner merge: campaigns without any payment are left out.
    training_data = pd.merge(campaigns_df, payout_summary, left_on='id', right_on='campaign_id')

    print(f"Successfully fetched and merged data for {len(training_data)} campaigns.")
    return training_data
|
|
|
|
|
def train_model():
    """Train the payout forecasting model and save it to disk.

    Fetches the training data, fits a ``LinearRegression`` that predicts
    ``actual_payout`` from ``budget``, prints an R^2 score when it can be
    computed, and writes the fitted model with ``joblib`` to
    ``../models/payout_forecaster_v1.joblib`` relative to this script.

    Returns:
        None. Training is skipped (with a message) when no usable rows are
        available.
    """
    df = fetch_training_data()
    if df is None or df.empty:
        print("Model training skipped due to lack of data.")
        return

    features = ['budget']
    target = 'actual_payout'

    # LinearRegression.fit rejects missing values, so rows with a missing
    # feature or target are dropped instead of crashing the run.
    df = df.dropna(subset=features + [target])
    if df.empty:
        print("Model training skipped due to lack of data.")
        return

    X = df[features]
    y = df[target]

    if len(df) < 5:
        print("Not enough data to split for testing. Training on all available data.")
        X_train, y_train = X, y
        # No held-out set: any score reported below is measured on the
        # training rows themselves.
        X_test, y_test = X, y
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Training a Linear Regression model...")
    model = LinearRegression()
    model.fit(X_train, y_train)

    # R^2 is undefined for fewer than two samples (scikit-learn returns NaN
    # with a warning), so only report it when it is meaningful.
    if len(X_test) >= 2:
        score = model.score(X_test, y_test)
        print(f"Model trained with R^2 score: {score:.2f}")
    else:
        print("Model trained. R^2 score skipped: fewer than two evaluation samples.")

    # Resolve the script location to an absolute path so the saved (and
    # printed) model path does not depend on how the script was invoked.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    models_dir = os.path.join(script_dir, '..', 'models')
    os.makedirs(models_dir, exist_ok=True)
    model_path = os.path.join(models_dir, 'payout_forecaster_v1.joblib')

    joblib.dump(model, model_path)

    print(f"Model successfully saved to: {model_path}")
|
|
|
|
|
# Script entry point: train only when executed directly, not when imported.
if __name__ == "__main__":
    train_model()