# ...existing code...
import joblib, json
from pathlib import Path
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# ...existing code...
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
try:
    from ingest import metrics_report  # type: ignore
except Exception:
    try:
        from preprocess import metrics_report  # type: ignore
    except Exception:
        # Minimal fallback implementation returning a dict compatible with the
        # existing summary DataFrame usage.
        from sklearn.metrics import log_loss, accuracy_score, brier_score_loss, roc_auc_score

        def metrics_report(y_true, y_proba, name="model"):
            """
            Minimal metrics report compatible with the rest of the script.
            Returns a dict with at least the key "model" so it can be placed
            into the summary DataFrame.
            """
            y_true = np.asarray(y_true)
            proba = np.asarray(y_proba)
            # Handle binary vs. multiclass probabilities.
            try:
                if proba.ndim == 1 or proba.shape[1] == 1:
                    # Binary: proba holds the probability of the positive class.
                    proba_flat = proba.ravel()
                    y_pred = (proba_flat >= 0.5).astype(int)
                    ll = float(log_loss(y_true, proba_flat)) if y_true.size and proba_flat.size else float("nan")
                    try:
                        auc = float(roc_auc_score(y_true, proba_flat))
                    except Exception:
                        auc = float("nan")
                    try:
                        brier = float(brier_score_loss(y_true, proba_flat))
                    except Exception:
                        brier = float("nan")
                else:
                    # Multiclass: one column of probabilities per class.
                    y_pred = proba.argmax(axis=1)
                    try:
                        ll = float(log_loss(y_true, proba))
                    except Exception:
                        ll = float("nan")
                    try:
                        auc = float(roc_auc_score(y_true, proba, multi_class="ovr"))
                    except Exception:
                        auc = float("nan")
                    # Multiclass Brier score: mean squared error against one-hot labels.
                    try:
                        n_classes = proba.shape[1]
                        one_hot = np.eye(n_classes)[y_true]
                        brier = float(np.mean(np.sum((proba - one_hot) ** 2, axis=1)))
                    except Exception:
                        brier = float("nan")
            except Exception:
                y_pred = np.zeros_like(y_true)
                ll = auc = brier = float("nan")
            acc = float(accuracy_score(y_true, y_pred)) if y_true.size else float("nan")
            return {
                "model": name,
                "accuracy": acc,
                "log_loss": ll,
                "roc_auc": auc,
                "brier": brier,
            }
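# Quick sanity check of the fallback (toy data, illustrative only — not part
# of the training pipeline):
#
#   metrics_report([0, 1, 1], np.array([0.1, 0.9, 0.8]), "toy")
#   -> {"model": "toy", "accuracy": 1.0, "log_loss": ..., "roc_auc": ..., "brier": ...}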
def load_processed_data(target_col="target", data_dir=Path("data/processed")):
    """
    Try multiple ways to obtain a processed dataframe:
      1) call load_processed_data() from the local ingest or preprocess modules
      2) look for common filenames under data/processed (parquet/csv)
    Returns: df (pd.DataFrame). Note: target_col is currently unused; the
    target column is inferred after loading.
    """
    # 1) Try local modules.
    try:
        from ingest import load_processed_data as _lp  # type: ignore
        df = _lp()
        if isinstance(df, pd.DataFrame):
            return df
    except Exception:
        pass
    try:
        from preprocess import load_processed_data as _lp2  # type: ignore
        df = _lp2()
        if isinstance(df, pd.DataFrame):
            return df
    except Exception:
        pass
    # 2) Look for common files.
    candidates = [
        data_dir / "processed.parquet",
        data_dir / "dataset.parquet",
        data_dir / "processed.csv",
        data_dir / "dataset.csv",
        data_dir / "train.parquet",
        data_dir / "train.csv",
    ]
    for fp in candidates:
        if fp.exists():
            if fp.suffix == ".parquet":
                return pd.read_parquet(fp)
            return pd.read_csv(fp)
    raise FileNotFoundError(
        f"No processed data found. Checked modules ingest/preprocess and files under {data_dir}. "
        "Add a processed dataset or expose load_processed_data() in ingest/preprocess."
    )
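# The loader assumes a flat tabular file. A minimal layout it would accept
# (hypothetical column names, for illustration only):
#
#   feat_1, feat_2, ..., target     # or label / y / outcome, inferred below
#   0.12,   3.4,    ..., 1
#
# An optional 'split' column with values train/valid/test is also honored below.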
# Load data and build train/valid/test splits
df = load_processed_data()

# Infer the target column.
TARGET = None
for candidate in ("target", "label", "y", "outcome"):
    if candidate in df.columns:
        TARGET = candidate
        break
if TARGET is None:
    raise KeyError("No target column found. Expected one of: target, label, y, outcome")

# If the dataset already includes a 'split' column with values 'train'/'valid'/'test', use it.
if "split" in df.columns:
    train_df = df[df["split"] == "train"].drop(columns=["split"])
    valid_df = df[df["split"] == "valid"].drop(columns=["split"])
    test_df = df[df["split"] == "test"].drop(columns=["split"])
else:
    # Create stratified splits: train/valid/test = 64%/16%/20%.
    train_val, test_df = train_test_split(df, test_size=0.20, stratify=df[TARGET], random_state=42)
    train_df, valid_df = train_test_split(train_val, test_size=0.20, stratify=train_val[TARGET], random_state=42)

# Exclude 'split' as well: it was dropped from the split dataframes above,
# so keeping it in X_cols would raise a KeyError on indexing.
X_cols = [c for c in df.columns if c not in (TARGET, "split")]
WINDOW = int(df.attrs.get("WINDOW", 1)) if hasattr(df, "attrs") else 1

X_train = train_df[X_cols]
y_train = train_df[TARGET]
X_valid = valid_df[X_cols]
y_valid = valid_df[TARGET]
X_test = test_df[X_cols]
y_test = test_df[TARGET]
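# Optional diagnostic: confirm split sizes and class balance before training.
print(f"splits: train={len(X_train)}, valid={len(X_valid)}, test={len(X_test)}")
print("train class distribution:\n", y_train.value_counts(normalize=True))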
# ...existing code...
xgb = XGBClassifier(
    n_estimators=6000,
    max_depth=50,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    num_class=3,
    reg_lambda=1.0,
    random_state=42,
    tree_method="hist",
)
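# With n_estimators this high, early stopping against the validation set is
# usually worthwhile. A sketch (recent xgboost versions take
# early_stopping_rounds in the constructor; the exact API varies by version):
#
#   xgb = XGBClassifier(..., early_stopping_rounds=200)
#   xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)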
xgb.fit(X_train, y_train)
proba_xgb_valid = xgb.predict_proba(X_valid)
proba_xgb_test = xgb.predict_proba(X_test)
m_xgb_valid = metrics_report(y_valid, proba_xgb_valid, "xgb_valid")
m_xgb_test = metrics_report(y_test, proba_xgb_test, "xgb_test")

# Isotonic calibration on the held-out validation set.
# Note: recent scikit-learn releases deprecate cv="prefit" in favor of
# wrapping the fitted model in sklearn.frozen.FrozenEstimator.
cal_xgb = CalibratedClassifierCV(xgb, method="isotonic", cv="prefit")
cal_xgb.fit(X_valid, y_valid)
proba_xgb_cal_valid = cal_xgb.predict_proba(X_valid)
proba_xgb_cal_test = cal_xgb.predict_proba(X_test)
m_xgb_cal_valid = metrics_report(y_valid, proba_xgb_cal_valid, "xgb_cal_valid")
m_xgb_cal_test = metrics_report(y_test, proba_xgb_cal_test, "xgb_cal_test")

# Platt scaling (optional alternative calibration).
cal_xgb_platt = CalibratedClassifierCV(xgb, method="sigmoid", cv="prefit")
cal_xgb_platt.fit(X_valid, y_valid)
proba_xgb_platt_test = cal_xgb_platt.predict_proba(X_test)
m_xgb_platt_test = metrics_report(y_test, proba_xgb_platt_test, "xgb_platt_test")
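# Choosing between the two: isotonic regression is non-parametric and can
# overfit a small calibration set, while sigmoid (Platt) is lower-variance.
# Compare log_loss / brier on the test rows of the summary table below.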
# -----------------------------
# 8) LightGBM + Calibration
# -----------------------------
lgbm = LGBMClassifier(
    n_estimators=12000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multiclass",
    class_weight=None,
    random_state=42,
)
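# As with XGBoost above, early stopping is advisable at 12000 estimators.
# A sketch using LightGBM's callback API:
#
#   import lightgbm
#   lgbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
#            callbacks=[lightgbm.early_stopping(200)])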
lgbm.fit(X_train, y_train)
proba_lgb_valid = lgbm.predict_proba(X_valid)
proba_lgb_test = lgbm.predict_proba(X_test)
m_lgb_valid = metrics_report(y_valid, proba_lgb_valid, "lgb_valid")
m_lgb_test = metrics_report(y_test, proba_lgb_test, "lgb_test")

cal_lgb = CalibratedClassifierCV(lgbm, method="isotonic", cv="prefit")
cal_lgb.fit(X_valid, y_valid)
proba_lgb_cal_valid = cal_lgb.predict_proba(X_valid)
proba_lgb_cal_test = cal_lgb.predict_proba(X_test)
m_lgb_cal_valid = metrics_report(y_valid, proba_lgb_cal_valid, "lgb_cal_valid")
m_lgb_cal_test = metrics_report(y_test, proba_lgb_cal_test, "lgb_cal_test")
from sklearn.base import BaseEstimator, ClassifierMixin

class PriorProbaPredictor(BaseEstimator, ClassifierMixin):
    """Predict class probabilities equal to the class distribution in the training data."""

    def fit(self, X, y):
        y = np.asarray(y)
        classes, counts = np.unique(y, return_counts=True)
        self.classes_ = classes
        self.class_proba_ = counts / counts.sum()
        return self

    def predict_proba(self, X):
        n = len(X)
        # Return an array of shape (n_samples, n_classes) following classes_ order.
        return np.tile(self.class_proba_, (n, 1))

    def predict(self, X):
        proba = self.predict_proba(X)
        # Map the argmax index back to the actual class labels; bare argmax is
        # only correct when the labels happen to be 0..n_classes-1.
        return self.classes_[proba.argmax(axis=1)]
# Class-prior baseline ("odds"): any useful model should beat these metrics.
odds = PriorProbaPredictor()
odds.fit(X_train, y_train)
proba_odds_valid = odds.predict_proba(X_valid)
proba_odds_test = odds.predict_proba(X_test)
m_odds_valid = metrics_report(y_valid, proba_odds_valid, "odds_valid")
m_odds_test = metrics_report(y_test, proba_odds_test, "odds_test")
# ...existing code...
# -----------------------------
# 9) Summary table
# -----------------------------
summary = pd.DataFrame([
    m_odds_valid, m_odds_test,
    m_xgb_valid, m_xgb_test, m_xgb_cal_valid, m_xgb_cal_test, m_xgb_platt_test,
    m_lgb_valid, m_lgb_test, m_lgb_cal_valid, m_lgb_cal_test,
]).sort_values("model").reset_index(drop=True)

try:
    from IPython.display import display as _display
    _display(summary)
except Exception:
    print(summary.to_string(index=False))

# Optional: save metrics (create the output directory first).
Path("evaluation").mkdir(parents=True, exist_ok=True)
summary.to_csv("./evaluation/baseline_metrics.csv", index=False)
print("Saved: ./evaluation/baseline_metrics.csv")
| Path(".").mkdir(exist_ok=True) | |
| # เลือกโมเดลที่ต้องการใช้ inference (แนะนำตัวที่ calibrated แล้ว) | |
| joblib.dump(cal_xgb, "./model/model_xgb_isotonic.joblib") | |
| joblib.dump(cal_lgb, "./model/model_lgb_isotonic.joblib") | |
| # เก็บคอลัมน์ฟีเจอร์ และพารามิเตอร์สำคัญ | |
| with open("feature_columns.json", "w", encoding="utf-8") as f: | |
| json.dump({"X_cols": X_cols, "WINDOW": int(WINDOW)}, f, ensure_ascii=False, indent=2) | |
| print("Saved: model_xgb_isotonic.joblib, model_lgb_isotonic.joblib, feature_columns.json") |