# ...existing code...
import json
from pathlib import Path

import joblib
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# ...existing code...
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV

# Prefer a project-provided metrics_report; fall back to a minimal local one.
# The broad `except Exception` is deliberate best-effort: any failure while
# importing the project modules falls through to the next option.
try:
    from ingest import metrics_report  # type: ignore
except Exception:
    try:
        from preprocess import metrics_report  # type: ignore
    except Exception:
        # Minimal fallback implementation returning a dict compatible with
        # the summary DataFrame built at the end of this script.
        from sklearn.metrics import (
            log_loss,
            accuracy_score,
            brier_score_loss,
            roc_auc_score,
        )

        def metrics_report(y_true, y_proba, name="model"):
            """Compute a small set of classification metrics.

            Args:
                y_true: True labels (array-like).
                y_proba: Predicted probabilities. A 1-D array or a single
                    column is treated as binary positive-class probability;
                    a 2-D array with several columns is treated as multiclass.
                name: Identifier stored under the "model" key.

            Returns:
                dict with keys "model", "accuracy", "log_loss", "roc_auc" and
                "brier". Any metric that cannot be computed is NaN.
            """
            y_true = np.asarray(y_true)
            proba = np.asarray(y_proba)
            try:
                if proba.ndim == 1 or proba.shape[1] == 1:
                    # Binary: threshold the positive-class probability at 0.5.
                    proba_flat = proba.ravel()
                    y_pred = (proba_flat >= 0.5).astype(int)
                    if y_true.size and proba_flat.size:
                        ll = float(log_loss(y_true, proba_flat))
                    else:
                        ll = float("nan")
                    try:
                        auc = float(roc_auc_score(y_true, proba_flat))
                    except Exception:
                        auc = float("nan")
                    try:
                        brier = float(brier_score_loss(y_true, proba_flat))
                    except Exception:
                        brier = float("nan")
                else:
                    # Multiclass: predicted class is the argmax column index.
                    y_pred = proba.argmax(axis=1)
                    try:
                        ll = float(log_loss(y_true, proba))
                    except Exception:
                        ll = float("nan")
                    try:
                        auc = float(
                            roc_auc_score(y_true, proba, multi_class="ovr")
                        )
                    except Exception:
                        auc = float("nan")
                    # Multiclass Brier as the mean squared error against a
                    # one-hot encoding; y_true is used directly as row index
                    # into the identity matrix, so non-integer labels yield NaN.
                    try:
                        n_classes = proba.shape[1]
                        one_hot = np.eye(n_classes)[y_true]
                        brier = float(
                            np.mean(np.sum((proba - one_hot) ** 2, axis=1))
                        )
                    except Exception:
                        brier = float("nan")
            except Exception:
                y_pred = np.zeros_like(y_true)
                ll = auc = brier = float("nan")

            acc = (
                float(accuracy_score(y_true, y_pred))
                if y_true.size
                else float("nan")
            )
            return {
                "model": name,
                "accuracy": acc,
                "log_loss": ll,
                "roc_auc": auc,
                "brier": brier,
            }


def load_processed_data(target_col="target", data_dir=Path("data/processed")):
    """Load the processed dataset as a DataFrame.

    Sources are tried in order:
      1) ``load_processed_data()`` exposed by the local ``ingest`` module,
         then by the local ``preprocess`` module;
      2) the first existing file among a fixed list of common names under
         ``data_dir`` (parquet or csv).

    Args:
        target_col: Unused; kept for backward compatibility with callers.
        data_dir: Directory searched for dataset files (str or Path).

    Returns:
        pd.DataFrame: The processed dataset.

    Raises:
        FileNotFoundError: If no module loader returned a DataFrame and no
            candidate file exists.
    """
    import importlib

    data_dir = Path(data_dir)  # accept plain strings as well as Path objects

    # 1) try local modules; any failure (missing module, missing attribute,
    #    loader error) is treated as "not available" and the next is tried.
    for module_name in ("ingest", "preprocess"):
        try:
            loader = importlib.import_module(module_name).load_processed_data
            loaded = loader()
        except Exception:
            continue
        if isinstance(loaded, pd.DataFrame):
            return loaded

    # 2) look for common files
    candidates = [
        data_dir / "processed.parquet",
        data_dir / "dataset.parquet",
        data_dir / "processed.csv",
        data_dir / "dataset.csv",
        data_dir / "train.parquet",
        data_dir / "train.csv",
    ]
    for fp in candidates:
        if fp.exists():
            if fp.suffix == ".parquet":
                return pd.read_parquet(fp)
            return pd.read_csv(fp)

    raise FileNotFoundError(
        f"No processed data found. Checked modules ingest/preprocess and files under {data_dir}. "
        "Add a processed dataset or expose load_processed_data() in ingest/preprocess."
    )


# Load data and build train/valid/test splits
df = load_processed_data()

# Infer the target column: first match among the known candidate names.
TARGET = None
for candidate in ("target", "label", "y", "outcome"):
    if candidate in df.columns:
        TARGET = candidate
        break
if TARGET is None:
    raise KeyError("No target column found. Expected one of: target,label,y,outcome")

# If the dataset already includes a 'split' column with values
# 'train'/'valid'/'test', use it.
if "split" in df.columns:
    train_df = df[df["split"] == "train"].drop(columns=["split"])
    valid_df = df[df["split"] == "valid"].drop(columns=["split"])
    test_df = df[df["split"] == "test"].drop(columns=["split"])
else:
    # Create splits: hold out 20% for test, then 20% of the remainder for
    # validation, i.e. train/valid/test = 64%/16%/20% (approx).
    train_val, test_df = train_test_split(
        df, test_size=0.20, stratify=df[TARGET], random_state=42
    )
    train_df, valid_df = train_test_split(
        train_val, test_size=0.20, stratify=train_val[TARGET], random_state=42
    )

# Feature columns: everything except the target and the bookkeeping 'split'
# column. 'split' has been dropped from the split frames above, so keeping it
# here would make the column selection below raise KeyError.
X_cols = [c for c in df.columns if c not in (TARGET, "split")]
WINDOW = int(df.attrs.get("WINDOW", 1)) if hasattr(df, "attrs") else 1

X_train = train_df[X_cols]
y_train = train_df[TARGET]
X_valid = valid_df[X_cols]
y_valid = valid_df[TARGET]
X_test = test_df[X_cols]
y_test = test_df[TARGET]

# ...existing code...
xgb = XGBClassifier(
    n_estimators=6000,
    max_depth=50,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    num_class=3,
    reg_lambda=1.0,
    random_state=42,
    tree_method="hist",
)
xgb.fit(X_train, y_train)
proba_xgb_valid = xgb.predict_proba(X_valid)
proba_xgb_test = xgb.predict_proba(X_test)
m_xgb_valid = metrics_report(y_valid, proba_xgb_valid, "xgb_valid")
m_xgb_test = metrics_report(y_test, proba_xgb_test, "xgb_test")

# Isotonic calibration, fitted on the validation split on top of the
# already-trained model.
# NOTE(review): recent scikit-learn releases appear to deprecate cv="prefit"
# in favour of wrapping the estimator in FrozenEstimator - confirm against the
# installed version before upgrading.
cal_xgb = CalibratedClassifierCV(xgb, method="isotonic", cv="prefit")
cal_xgb.fit(X_valid, y_valid)
proba_xgb_cal_valid = cal_xgb.predict_proba(X_valid)
proba_xgb_cal_test = cal_xgb.predict_proba(X_test)
m_xgb_cal_valid = metrics_report(y_valid, proba_xgb_cal_valid, "xgb_cal_valid")
m_xgb_cal_test = metrics_report(y_test, proba_xgb_cal_test, "xgb_cal_test")

# Platt (optional)
cal_xgb_platt = CalibratedClassifierCV(xgb, method="sigmoid", cv="prefit")
cal_xgb_platt.fit(X_valid, y_valid)
proba_xgb_platt_test = cal_xgb_platt.predict_proba(X_test)
m_xgb_platt_test = metrics_report(y_test, proba_xgb_platt_test, "xgb_platt_test")

# -----------------------------
# 8) LightGBM + Calibration
# -----------------------------
lgbm = LGBMClassifier(
    n_estimators=12000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multiclass",
    class_weight=None,
    random_state=42,
)
lgbm.fit(X_train, y_train)
proba_lgb_valid = lgbm.predict_proba(X_valid)
proba_lgb_test = lgbm.predict_proba(X_test)
m_lgb_valid = metrics_report(y_valid, proba_lgb_valid, "lgb_valid")
m_lgb_test = metrics_report(y_test, proba_lgb_test, "lgb_test")

cal_lgb = CalibratedClassifierCV(lgbm, method="isotonic", cv="prefit")
cal_lgb.fit(X_valid, y_valid)
proba_lgb_cal_valid = cal_lgb.predict_proba(X_valid)
proba_lgb_cal_test = cal_lgb.predict_proba(X_test)
m_lgb_cal_valid = metrics_report(y_valid, proba_lgb_cal_valid, "lgb_cal_valid")
m_lgb_cal_test = metrics_report(y_test, proba_lgb_cal_test, "lgb_cal_test")


class PriorProbaPredictor(BaseEstimator, ClassifierMixin):
    """Predict class probabilities equal to the class distribution in training data."""

    def fit(self, X, y):
        """Record the distinct labels in ``y`` and their relative frequencies.

        ``X`` is ignored. Returns ``self``.
        """
        y = np.asarray(y)
        classes, counts = np.unique(y, return_counts=True)
        self.classes_ = classes
        self.class_proba_ = counts / counts.sum()
        return self

    def predict_proba(self, X):
        """Return the stored class frequencies repeated once per row of ``X``.

        The result has shape (n_samples, n_classes), following ``classes_`` order.
        """
        n = len(X)
        return np.tile(self.class_proba_, (n, 1))

    def predict(self, X):
        """Return the most frequent training label for every row of ``X``."""
        proba = self.predict_proba(X)
        # Map argmax column indices back to the actual class labels.
        return self.classes_[proba.argmax(axis=1)]


odds = PriorProbaPredictor()
odds.fit(X_train, y_train)
proba_odds_valid = odds.predict_proba(X_valid)
proba_odds_test = odds.predict_proba(X_test)
m_odds_valid = metrics_report(y_valid, proba_odds_valid, "odds_valid")
m_odds_test = metrics_report(y_test, proba_odds_test, "odds_test")

# ...existing code...
# -----------------------------
# 9) Summary table
# -----------------------------
summary = pd.DataFrame([
    m_odds_valid, m_odds_test,
    m_xgb_valid, m_xgb_test,
    m_xgb_cal_valid, m_xgb_cal_test,
    m_xgb_platt_test,
    m_lgb_valid, m_lgb_test,
    m_lgb_cal_valid, m_lgb_cal_test,
]).sort_values("model").reset_index(drop=True)

# Rich display when IPython is available, plain text otherwise.
try:
    from IPython.display import display as _display
    _display(summary)
except Exception:
    print(summary.to_string(index=False))

# Make sure the output directories exist before anything is written to them.
Path("./evaluation").mkdir(parents=True, exist_ok=True)
Path("./model").mkdir(parents=True, exist_ok=True)

# Optional: save metrics
summary.to_csv("./evaluation/baseline_metrics.csv", index=False)
print("Saved: baseline_metrics.csv")

# Choose the models to use for inference (the calibrated ones are recommended).
joblib.dump(cal_xgb, "./model/model_xgb_isotonic.joblib")
joblib.dump(cal_lgb, "./model/model_lgb_isotonic.joblib")

# Store the feature columns and important parameters.
with open("feature_columns.json", "w", encoding="utf-8") as f:
    json.dump({"X_cols": X_cols, "WINDOW": int(WINDOW)}, f, ensure_ascii=False, indent=2)

print("Saved: model_xgb_isotonic.joblib, model_lgb_isotonic.joblib, feature_columns.json")