Spaces:

amitbhatt6075
/

reachify-ai-service

Running

App Files Files Community

reachify-ai-service / training /train_payout_forecaster.py

amitbhatt6075

Complete fresh start - FINAL UPLOAD

0914e96 16 days ago

raw

history blame

3.57 kB

	import os
	import joblib
	import pandas as pd
	from sklearn.linear_model import LinearRegression
	from sklearn.model_selection import train_test_split
	from supabase import create_client, Client
	from dotenv import load_dotenv

	# Read settings from a local .env file into the process environment.
	load_dotenv()

	# --- Database Connection ---
	# Both values are expected in the .env file of the ai-service.
	SUPABASE_URL = os.environ.get("SUPABASE_URL")
	# Scripts authenticate with the service key.
	SUPABASE_SERVICE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")

	# Fail fast at import time when either setting is missing or empty.
	if not (SUPABASE_URL and SUPABASE_SERVICE_KEY):
	    raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in environment variables.")

	# Module-level client shared by the functions below.
	supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)

	def fetch_training_data() -> "pd.DataFrame | None":
	    """Fetch completed campaigns joined with their total payouts.

	    Queries the ``campaigns`` table for rows whose ``status`` is
	    ``completed``, sums ``payments.amount`` per campaign, and inner-joins
	    the totals onto the campaign rows. Campaigns without any payment row
	    are therefore absent from the result.

	    Returns:
	        A DataFrame holding the selected campaign columns (``id``,
	        ``budget``, ``assigned_to``) plus ``campaign_id`` and
	        ``actual_payout``, or ``None`` when there are no completed
	        campaigns or no payments recorded for them.
	    """
	    print("Fetching training data from Supabase...")

	    # We use 'completed' status campaigns as the basis for training
	    # NOTE(review): the API may cap the rows returned per request -- confirm
	    # whether pagination is needed for large tables.
	    response = (
	        supabase.table("campaigns")
	        .select("id, budget, assigned_to")
	        .eq("status", "completed")
	        .execute()
	    )

	    if not response.data:
	        print("No completed campaigns found to train the model.")
	        return None

	    campaigns_df = pd.DataFrame(response.data)

	    # response.data is non-empty here, so there is at least one id to query.
	    campaign_ids = campaigns_df['id'].tolist()

	    # Now, get the actual payments made for those campaigns
	    payments_response = (
	        supabase.table("payments")
	        .select("campaign_id, amount")
	        .in_("campaign_id", campaign_ids)
	        .execute()
	    )

	    # An empty result would produce a DataFrame without a 'campaign_id'
	    # column, and the groupby below would raise KeyError.
	    if not payments_response.data:
	        print("No payments found for the completed campaigns.")
	        return None

	    payments_df = pd.DataFrame(payments_response.data)

	    # Group by campaign_id and sum the payouts
	    payout_summary = (
	        payments_df.groupby('campaign_id')['amount']
	        .sum()
	        .reset_index()
	        .rename(columns={'amount': 'actual_payout'})
	    )

	    # Merge the payout data with the campaign data (inner join)
	    training_data = pd.merge(
	        campaigns_df,
	        payout_summary,
	        how='inner',
	        left_on='id',
	        right_on='campaign_id',
	    )

	    print(f"Successfully fetched and merged data for {len(training_data)} campaigns.")
	    return training_data

	def train_model():
	    """Train the payout forecasting model and save it to disk.

	    Fits a ``LinearRegression`` that predicts ``actual_payout`` from
	    ``budget`` using the frame returned by ``fetch_training_data``, prints
	    the R^2 score, and writes the fitted model to
	    ``../models/payout_forecaster_v1.joblib`` relative to this script.

	    Returns:
	        None. Training is skipped (with a printed message) when no usable
	        rows are available.
	    """
	    df = fetch_training_data()
	    if df is None or df.empty:
	        print("Model training skipped due to lack of data.")
	        return

	    # Our features (X) and target (y)
	    features = ['budget']
	    target = 'actual_payout'

	    # LinearRegression rejects NaN and non-numeric values, so coerce the
	    # columns we use and drop rows that are incomplete afterwards.
	    used_columns = features + [target]
	    df = df.copy()
	    df[used_columns] = df[used_columns].apply(pd.to_numeric, errors='coerce')
	    df = df.dropna(subset=used_columns)
	    if df.empty:
	        print("Model training skipped due to lack of data.")
	        return

	    X = df[features]
	    y = df[target]

	    # With test_size=0.2 the held-out set has ceil(0.2 * n) rows. R^2 is
	    # undefined for fewer than two samples, so only split from 6 rows up
	    # (5 rows would leave a single test row).
	    min_rows_for_split = 6
	    if len(df) < min_rows_for_split:
	        print("Not enough data to split for testing. Training on all available data.")
	        X_train, y_train = X, y
	        X_test, y_test = X, y  # Use the same for scoring, not ideal but necessary
	    else:
	        X_train, X_test, y_train, y_test = train_test_split(
	            X, y, test_size=0.2, random_state=42
	        )

	    print("Training a Linear Regression model...")
	    model = LinearRegression()
	    model.fit(X_train, y_train)

	    # Scoring a single sample would yield NaN plus a warning; report it plainly.
	    if len(X_test) < 2:
	        print("Model trained; R^2 score is undefined for fewer than two samples.")
	    else:
	        score = model.score(X_test, y_test)
	        print(f"Model trained with R^2 score: {score:.2f}")

	    # Path up one directory from `training` to `models`. abspath keeps this
	    # independent of the working directory the script was launched from.
	    script_dir = os.path.dirname(os.path.abspath(__file__))  # the /training directory
	    models_dir = os.path.normpath(os.path.join(script_dir, '..', 'models'))
	    os.makedirs(models_dir, exist_ok=True)  # Ensure the /models directory exists
	    model_path = os.path.join(models_dir, 'payout_forecaster_v1.joblib')

	    joblib.dump(model, model_path)

	    print(f"Model successfully saved to: {model_path}")

	# Run the training pipeline only when this file is executed as a script.
	if __name__ == "__main__":
	    train_model()