Spaces:

amitbhatt6075
/

reachify-ai-service

Running

App Files Files Community

reachify-ai-service / training /train_thunderbird_market_predictor.py

amitbhatt6075

feat(thunderbird): Add market intelligence module with new model and APIs

01c71d2 9 days ago

raw

history blame

3.59 kB

	import os
	import pandas as pd
	from sklearn.model_selection import train_test_split
	from sklearn.ensemble import GradientBoostingRegressor
	from sklearn.metrics import mean_squared_error
	from sklearn.preprocessing import OneHotEncoder
	import joblib

	# --- CONFIGURATION ---

	# Directory that holds this script; the data/ and models/ folders are
	# resolved relative to its parent directory.
	_SCRIPT_DIR = os.path.dirname(__file__)

	# CSV with the training rows read by train_model().
	INPUT_FILE = os.path.join(
	    _SCRIPT_DIR, os.pardir, 'data', 'thunderbird_market_trends.csv'
	)

	# Destination of the joblib bundle written by train_model().
	MODEL_OUTPUT_FILE = os.path.join(
	    _SCRIPT_DIR, os.pardir, 'models', 'thunderbird_market_predictor_v1.joblib'
	)

	# --- MAIN SCRIPT ---

	def train_model():
	    """
	    Train the Project Thunderbird market predictor and save it to disk.

	    Reads the CSV at ``INPUT_FILE``, one-hot encodes the ``niche`` column,
	    fits a ``GradientBoostingRegressor`` that predicts
	    ``successful_campaigns`` from ``trend_score`` plus the encoded niche
	    columns, prints the mean squared error on a 20% hold-out split, and
	    dumps ``{'model': model, 'encoder': encoder}`` to ``MODEL_OUTPUT_FILE``
	    with joblib.

	    Progress and errors are reported with ``print``. The function returns
	    early, without raising, when the input file is missing or empty, when a
	    required column is absent, or when fewer than two usable rows remain.
	    A failure while saving is printed rather than raised.

	    Returns:
	        None
	    """
	    print("--- Starting Project Thunderbird Model Training ---")

	    # 1. Load Data
	    try:
	        df = pd.read_csv(INPUT_FILE)
	    except FileNotFoundError:
	        print(f"❌ Error: Training data file not found at '{INPUT_FILE}'.")
	        print(" Please run `scripts/export_thunderbird_training_data.py` first.")
	        return
	    except pd.errors.EmptyDataError:
	        # A zero-byte file has no header for pandas to parse.
	        print(f"❌ Error: Training data file at '{INPUT_FILE}' is empty.")
	        return
	    print(f"✅ Successfully loaded training data from '{INPUT_FILE}'")

	    # Fail with a readable message instead of a KeyError further down.
	    required_columns = ['trend_score', 'niche', 'successful_campaigns']
	    missing_columns = [col for col in required_columns if col not in df.columns]
	    if missing_columns:
	        print(f"❌ Error: Training data is missing required column(s): {missing_columns}")
	        return

	    # GradientBoostingRegressor does not accept NaN in the features or the
	    # target, so drop rows where either numeric column is missing. The index
	    # is reset so the row-wise concat below stays aligned.
	    complete_rows = df.dropna(subset=['trend_score', 'successful_campaigns'])
	    dropped_count = len(df) - len(complete_rows)
	    if dropped_count:
	        print(f"⚠️ Dropped {dropped_count} row(s) with a missing trend_score or successful_campaigns value.")
	    df = complete_rows.reset_index(drop=True)

	    # An 80/20 split needs at least one training row and one test row.
	    if len(df) < 2:
	        print(f"❌ Error: Need at least 2 usable rows to train, found {len(df)}.")
	        return

	    # 2. Prepare Data (Feature Engineering)
	    print("\n🚀 Preparing data for the model...")
	    # 'niche' is categorical and the model needs numbers, so it is one-hot
	    # encoded. handle_unknown='ignore' lets the saved encoder cope with
	    # niches it did not see during fitting.
	    encoder = OneHotEncoder(handle_unknown='ignore')
	    niche_encoded = encoder.fit_transform(df[['niche']]).toarray()

	    # Wrap the encoded matrix in a DataFrame that shares df's index so the
	    # column-wise concat lines rows up explicitly.
	    niche_df = pd.DataFrame(
	        niche_encoded,
	        columns=encoder.get_feature_names_out(['niche']),
	        index=df.index,
	    )

	    # 'month' is not used as a feature; the features are the market trend
	    # score and the encoded niche columns.
	    X = pd.concat([df[['trend_score']], niche_df], axis=1)

	    # Target: the number of successful campaigns.
	    y = df['successful_campaigns']

	    print(f"✅ Data prepared. Features: {X.columns.tolist()}")

	    # 3. Split data for training and testing (fixed seed for repeatability)
	    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
	    print(f"\n🚀 Splitting data: {len(X_train)} rows for training, {len(X_test)} rows for testing.")

	    # 4. Train the Model
	    print("\n🧠 Training the Gradient Boosting Regressor model...")
	    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
	    model.fit(X_train, y_train)
	    print("✅ Model training complete.")

	    # 5. Evaluate the Model on the held-out rows
	    predictions = model.predict(X_test)
	    mse = mean_squared_error(y_test, predictions)
	    print(f"\n📊 Model evaluation (Mean Squared Error): {mse:.2f}")
	    print(" (Lower is better. A small number means the model's predictions are close to the real values).")

	    # 6. Save the Trained Model and the Encoder
	    print("\n💾 Saving the trained model and encoder...")
	    try:
	        # Create the output directory first: on a fresh checkout it may not
	        # exist yet, and the dump would fail after training has finished.
	        output_dir = os.path.dirname(MODEL_OUTPUT_FILE)
	        if output_dir:
	            os.makedirs(output_dir, exist_ok=True)

	        # The encoder is stored next to the model because prediction code
	        # must transform 'niche' exactly the way training did.
	        model_and_encoder = {
	            'model': model,
	            'encoder': encoder,
	        }
	        joblib.dump(model_and_encoder, MODEL_OUTPUT_FILE)
	        print("✅ Success! Model has been saved to:")
	        print(f" {MODEL_OUTPUT_FILE}")
	    except Exception as e:
	        # Deliberately broad: any save failure is reported to the user
	        # instead of aborting with a traceback.
	        print(f"\n❌ Error saving model file: {e}")


	if __name__ == "__main__":
	    # Run the training pipeline only when this file is executed as a
	    # script, not when it is imported as a module.
	    train_model()