Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import pandas as pd | |
| from collections import defaultdict | |
| from datetime import datetime | |
| from typing import Optional, Tuple, Dict, List | |
| from unidecode import unidecode | |
| # --- Team name cleaner (shared) --- | |
def clean_team(s: str) -> str:
    """Normalise a team name so the same club matches across data sources.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Any other value is converted to ``str``, transliterated with
    ``unidecode``, stripped of leading/trailing whitespace, and has internal
    whitespace runs collapsed to single spaces.
    """
    if not pd.isna(s):
        transliterated = unidecode(str(s))
        # str.split() with no argument drops leading/trailing whitespace and
        # splits on any whitespace run, so re-joining both trims and collapses.
        return " ".join(transliterated.split())
    return s
| # --- Odds -> implied probabilities (normalized) --- | |
def implied_from_odds(odds_tuple: Tuple[float, float, float]) -> Optional[Tuple[float, float, float]]:
    """Convert decimal (home, draw, away) odds to normalised implied probabilities.

    Each probability is the inverse of its odds divided by the sum of the three
    inverses, so the returned values sum to 1.

    Args:
        odds_tuple: Exactly three decimal odds in (home, draw, away) order.

    Returns:
        ``(p_home, p_draw, p_away)`` as Python floats, or ``None`` when any of
        the odds is missing/NaN, infinite, or not strictly greater than 1.0.

    Raises:
        ValueError: If ``odds_tuple`` does not contain exactly three items
            (raised by the tuple unpacking).
    """
    h, d, a = odds_tuple
    odds = np.array([h, d, a], dtype=float)
    # NaN compares False against everything, so a plain ``<= 1.0`` guard would
    # let NaN odds through and yield NaN probabilities; check finiteness first.
    if not np.all(np.isfinite(odds)) or odds.min() <= 1.0:
        return None
    inv = 1.0 / odds
    p = inv / inv.sum()  # all odds > 1, so the sum is strictly positive
    return float(p[0]), float(p[1]), float(p[2])
| # --- Elo snapshot from historical df (up to date-1) --- | |
def compute_elo_snapshot(df_hist: pd.DataFrame, base_elo: float = 1500.0, K: float = 20.0, home_adv: float = 60.0) -> Dict[str, float]:
    """Replay historical matches in date order and return final Elo ratings.

    Args:
        df_hist: Matches with columns ``date``, ``home``, ``away`` and ``ftr``
            (full-time result: ``"H"``, ``"D"`` or ``"A"``).
        base_elo: Rating assigned to a team the first time it is seen.
        K: Update step size applied to (actual - expected) score.
        home_adv: Rating points added to the home side when computing the
            expected score (not stored in the rating itself).

    Returns:
        Mapping of team name to rating after the last match in ``df_hist``.
        Rows whose ``ftr`` is not one of ``"H"``/``"D"``/``"A"`` (for example
        missing results) are ignored rather than being scored as away wins.
    """
    home_score = {"H": 1.0, "D": 0.5, "A": 0.0}
    elo = defaultdict(lambda: base_elo)
    # Stable sort keeps same-day matches in their input order, so the result
    # is deterministic for a given frame.
    dfh = df_hist.sort_values("date", kind="mergesort")
    for h, a, ftr in zip(dfh["home"], dfh["away"], dfh["ftr"]):
        oh = home_score.get(ftr)
        if oh is None:
            # Unknown / missing result: there is nothing to learn from this row.
            continue
        eh, ea = elo[h], elo[a]
        ph = 1.0 / (1.0 + 10 ** (-((eh + home_adv) - ea) / 400))
        # Zero-sum update: whatever the home side gains, the away side loses.
        delta = K * (oh - ph)
        elo[h] = eh + delta
        elo[a] = ea - delta
    return dict(elo)
| # --- Build rolling features for a single team from history --- | |
def _role_rolling_means(df_hist, side_col, team_name, window, stat_sources, win_code):
    """Mean stats over the team's last ``window`` matches played in one role."""
    feature_names = [name for name, _ in stat_sources] + ["points"]
    played = df_hist[df_hist[side_col] == team_name]
    # Stable sort so matches sharing a date keep their input order.
    recent = played.sort_values("date", kind="mergesort").tail(window)
    if recent.empty:
        return {f"roll_{name}": np.nan for name in feature_names}

    feats = {}
    for name, src in stat_sources:
        # Optional stat columns that the data set lacks yield NaN.
        feats[f"roll_{name}"] = recent[src].mean() if src in recent.columns else np.nan
    ftr = recent["ftr"]
    # 3 points for a win in this role, 1 for a draw, 0 for anything else.
    points = np.where(ftr == win_code, 3, np.where(ftr == "D", 1, 0))
    feats["roll_points"] = points.mean()
    return feats


def team_rolling_features(df_hist: pd.DataFrame, team_name: str, window: int = 6):
    """Build rolling-average features for one team, split by home/away role.

    Only the rows involving ``team_name`` are touched, instead of expanding
    every match of every team into per-team records first.

    Args:
        df_hist: Matches with required columns ``date``, ``home``, ``away``,
            ``fthg``, ``ftag``, ``ftr``. The stat columns ``hs``/``as``,
            ``hst``/``ast``, ``hc``/``ac``, ``hy``/``ay``, ``hr``/``ar`` are
            optional; a missing one produces NaN features.
        team_name: Team to compute features for (compared with ``==`` against
            the ``home`` / ``away`` columns).
        window: Number of most recent matches (per role) to average over.

    Returns:
        ``(feats_home, feats_away)``: two dicts keyed ``roll_gf``, ``roll_ga``,
        ``roll_shots_f``, ``roll_shots_a``, ``roll_sot_f``, ``roll_sot_a``,
        ``roll_corn_f``, ``roll_corn_a``, ``roll_y_f``, ``roll_r_f``,
        ``roll_points``. ``feats_home`` averages the team's home matches and
        ``feats_away`` its away matches; a role with no matches gets all-NaN
        values. Returns ``(None, None)`` when ``df_hist`` has no rows.

    Raises:
        KeyError: If a required column is absent from a non-empty ``df_hist``.
    """
    if len(df_hist) == 0:
        return None, None
    # Goals are mandatory; fail loudly instead of silently emitting NaN.
    for required in ("fthg", "ftag"):
        if required not in df_hist.columns:
            raise KeyError(required)

    # (feature name, source column) seen from the team's point of view:
    # "_f" is "for" the team, "_a" is "against" it.
    as_home = [
        ("gf", "fthg"), ("ga", "ftag"),
        ("shots_f", "hs"), ("shots_a", "as"),
        ("sot_f", "hst"), ("sot_a", "ast"),
        ("corn_f", "hc"), ("corn_a", "ac"),
        ("y_f", "hy"), ("r_f", "hr"),
    ]
    as_away = [
        ("gf", "ftag"), ("ga", "fthg"),
        ("shots_f", "as"), ("shots_a", "hs"),
        ("sot_f", "ast"), ("sot_a", "hst"),
        ("corn_f", "ac"), ("corn_a", "hc"),
        ("y_f", "ay"), ("r_f", "ar"),
    ]
    feats_home = _role_rolling_means(df_hist, "home", team_name, window, as_home, "H")
    feats_away = _role_rolling_means(df_hist, "away", team_name, window, as_away, "A")
    return feats_home, feats_away
| # --- Build full feature vector for a fixture --- | |
def build_features_for_fixture(
    home_team: str,
    away_team: str,
    date_str: str,
    df_all: pd.DataFrame,
    X_cols: List[str],
    window: int = 6,
    odds_tuple: Optional[Tuple[float, float, float]] = None,
    feat_df_for_medians: Optional[pd.DataFrame] = None,
):
    """Assemble the model input row for a single upcoming fixture.

    Only matches strictly before ``date_str`` are used, so the features never
    see the fixture itself.

    Args:
        home_team: Home team name (normalised with ``clean_team``).
        away_team: Away team name (normalised with ``clean_team``).
        date_str: Match date in ``YYYY-MM-DD`` format.
        df_all: Historical matches; its ``date`` column is compared against the
            parsed match date, and ``ftr`` is read for the draw rate.
        X_cols: Feature names, in the order the output vector must follow.
        window: Rolling window passed to ``team_rolling_features``.
        odds_tuple: Optional decimal (home, draw, away) odds. When omitted,
            outcome probabilities are approximated from the Elo difference and
            the historical draw rate.
        feat_df_for_medians: Optional frame whose column medians replace
            missing feature values. Missing values fall back to 0.0 when the
            frame is absent, lacks the column, or the median is itself NaN.

    Returns:
        ``(x, context)`` where ``x`` is a float array of shape
        ``(1, len(X_cols))`` containing no NaN, and ``context`` holds
        ``p_odds_H``, ``p_odds_D``, ``p_odds_A``, ``elo_home``, ``elo_away``.

    Raises:
        ValueError: If ``date_str`` is malformed, no history precedes the match
            date, or ``odds_tuple`` is rejected by ``implied_from_odds``.
    """
    home = clean_team(home_team)
    away = clean_team(away_team)
    match_date = datetime.strptime(date_str, "%Y-%m-%d")

    df_hist = df_all[df_all["date"] < match_date].copy()
    if df_hist.empty:
        raise ValueError("No historical data found before match date. Try a later date or load more seasons.")

    # Elo snapshot
    elo_map = compute_elo_snapshot(df_hist)
    elo_home = float(elo_map.get(home, 1500.0))
    elo_away = float(elo_map.get(away, 1500.0))
    elo_diff = elo_home - elo_away

    # Rolling features by role: the home side's home form, the away side's away form.
    roll_cols = ("gf", "ga", "shots_f", "shots_a", "sot_f", "sot_a",
                 "corn_f", "corn_a", "y_f", "r_f", "points")
    feats_home_role, _ = team_rolling_features(df_hist, home, window)
    _, feats_away_role = team_rolling_features(df_hist, away, window)
    if feats_home_role is None:
        feats_home_role = {f"roll_{c}": np.nan for c in roll_cols}
    if feats_away_role is None:
        feats_away_role = {f"roll_{c}": np.nan for c in roll_cols}

    # Odds -> probs
    if odds_tuple is not None:
        probs = implied_from_odds(odds_tuple)
        if probs is None:
            raise ValueError("Invalid odds provided. Use decimal odds > 1.0")
        pH, pD, pA = probs
    else:
        # Proxy from Elo + average draw rate
        draw_rate = (df_hist["ftr"] == "D").mean()
        draw_rate = float(draw_rate) if not np.isnan(draw_rate) and draw_rate > 0 else 0.25
        # NOTE(review): unlike compute_elo_snapshot, this expected score does not
        # add a home-advantage offset to elo_diff - confirm that is intended.
        ph_nodraw = 1.0 / (1.0 + 10 ** (-elo_diff / 400.0))
        pa_nodraw = 1.0 - ph_nodraw
        pH = ph_nodraw * (1.0 - draw_rate)
        pA = pa_nodraw * (1.0 - draw_rate)
        pD = draw_rate
        total = pH + pD + pA
        pH, pD, pA = pH / total, pD / total, pA / total

    # Assemble features in X_cols order
    feat_row: Dict[str, float] = {}
    for name, value in feats_home_role.items():
        feat_row[f"home_{name}"] = value
    for name, value in feats_away_role.items():
        feat_row[f"away_{name}"] = value
    feat_row["elo_home"] = elo_home
    feat_row["elo_away"] = elo_away
    feat_row["elo_diff"] = elo_diff
    feat_row["p_odds_H"] = float(pH)
    feat_row["p_odds_D"] = float(pD)
    feat_row["p_odds_A"] = float(pA)

    x = np.array([feat_row.get(c, np.nan) for c in X_cols], dtype=float).reshape(1, -1)

    # Impute only the entries that are actually missing.
    for j in np.flatnonzero(np.isnan(x[0])):
        col = X_cols[int(j)]
        fill = 0.0
        if feat_df_for_medians is not None and col in feat_df_for_medians.columns:
            median = float(feat_df_for_medians[col].median())
            # An all-NaN column has a NaN median; keep the 0.0 fallback so that
            # NaN never reaches the model.
            if not np.isnan(median):
                fill = median
        x[0, int(j)] = fill

    context = {
        "p_odds_H": pH,
        "p_odds_D": pD,
        "p_odds_A": pA,
        "elo_home": elo_home,
        "elo_away": elo_away,
    }
    return x, context