"""Train the campaign performance model and save it to disk.

Reads ``../data/dummy_campaigns.csv`` (relative to this file), fits a
pipeline consisting of one-hot encoding for the categorical columns followed
by a ``GradientBoostingRegressor`` that predicts ``final_reach``, and writes
the fitted pipeline to ``../models/performance_predictor_v1.joblib``.
"""
import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

print("--- Starting Performance Model Training ---")

# Load the data.
data_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'dummy_campaigns.csv')
df = pd.read_csv(data_path)
print(f"Loaded {len(df)} rows from {data_path}")

# Define the features (X) and the target (y).
# We want to predict 'final_reach' from these features.
features = ['category', 'budget', 'location', 'platform']
target = 'final_reach'

X = df[features]
y = df[target]

# Identify the categorical features.
# 'budget' is a numerical feature, so it is left out of this list.
categorical_features = ['category', 'location', 'platform']

# Build the preprocessing step.
# OneHotEncoder converts categorical data into a numerical format;
# handle_unknown='ignore' keeps prediction from failing on categories that
# were not present in the training data.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',  # leave the 'budget' column as it is
)

# Full ML pipeline: 1. preprocess, 2. train the model.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42)),
])

# Train the model.
print("Training the Gradient Boosting Regressor model...")
model_pipeline.fit(X, y)
print("--- Model training complete! ---")

# Save the trained model.
model_path = os.path.join(os.path.dirname(__file__), '..', 'models', 'performance_predictor_v1.joblib')
# joblib.dump does not create missing parent directories, so make sure the
# output directory exists; otherwise a fresh checkout would finish training
# and then fail at the save step.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model_pipeline, model_path)
print(f"--- Model saved successfully to {model_path} ---")