"""Feature construction helpers for football match outcome prediction.

The module turns a historical results table into the feature vector of a
single upcoming fixture: Elo ratings, per-role rolling averages and implied
outcome probabilities.

The history frame is expected to provide the columns ``date``, ``home``,
``away``, ``fthg``, ``ftag`` and ``ftr`` ("H"/"D"/"A"); the match statistic
columns ``hs``, ``as``, ``hst``, ``ast``, ``hc``, ``ac``, ``hy``, ``ay``,
``hr`` and ``ar`` are optional.
"""
import unicodedata
from collections import defaultdict
from datetime import datetime
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

try:
    from unidecode import unidecode
except ImportError:  # optional dependency: degrade to a stdlib approximation
    def unidecode(text: str) -> str:
        """Strip combining accents from *text* (stdlib stand-in for unidecode).

        Unlike the real ``unidecode`` this does not transliterate characters
        that have no Unicode decomposition; those are kept unchanged.
        """
        decomposed = unicodedata.normalize("NFKD", text)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


# Statistics that get a ``roll_<name>`` feature, in output order.
_ROLL_STATS = (
    "gf", "ga", "shots_f", "shots_a", "sot_f", "sot_a",
    "corn_f", "corn_a", "y_f", "r_f", "points",
)

# Source column of every rolling statistic, seen from each side of a match.
_HOME_SOURCE_COLS = {
    "gf": "fthg", "ga": "ftag",
    "shots_f": "hs", "shots_a": "as",
    "sot_f": "hst", "sot_a": "ast",
    "corn_f": "hc", "corn_a": "ac",
    "y_f": "hy", "r_f": "hr",
}
_AWAY_SOURCE_COLS = {
    "gf": "ftag", "ga": "fthg",
    "shots_f": "as", "shots_a": "hs",
    "sot_f": "ast", "sot_a": "hst",
    "corn_f": "ac", "corn_a": "hc",
    "y_f": "ay", "r_f": "ar",
}

# Goal columns are mandatory; the remaining statistics may be absent.
_REQUIRED_STATS = ("gf", "ga")

# League points earned per full-time result, seen from each side.
_HOME_POINTS = {"H": 3.0, "D": 1.0, "A": 0.0}
_AWAY_POINTS = {"A": 3.0, "D": 1.0, "H": 0.0}

# Score of the home side per full-time result, as used by the Elo update.
_HOME_SCORE = {"H": 1.0, "D": 0.5, "A": 0.0}


# --- Team name cleaner (shared) ---
def clean_team(s: str) -> str:
    """Return *s* transliterated to ASCII with whitespace collapsed.

    Missing values (as detected by ``pd.isna``) are returned unchanged.
    """
    if pd.isna(s):
        return s
    # split() drops leading/trailing whitespace and collapses inner runs.
    return " ".join(unidecode(str(s)).split())


# --- Odds -> implied probabilities (normalized) ---
def implied_from_odds(
    odds_tuple: Tuple[float, float, float],
) -> Optional[Tuple[float, float, float]]:
    """Convert decimal (home, draw, away) odds to normalized probabilities.

    The inverse odds are rescaled to sum to one, which removes the
    bookmaker margin proportionally.

    Returns:
        ``(p_home, p_draw, p_away)``, or ``None`` when any odd is missing,
        non-finite or not greater than 1.0.

    Raises:
        ValueError: if *odds_tuple* does not hold exactly three values.
    """
    h, d, a = odds_tuple
    odds = np.array([h, d, a], dtype=float)
    # NaN compares False against everything, so it has to be rejected
    # explicitly or it would flow through as NaN probabilities.
    if not np.all(np.isfinite(odds)) or odds.min() <= 1.0:
        return None
    inverse = 1.0 / odds
    probs = inverse / inverse.sum()
    return float(probs[0]), float(probs[1]), float(probs[2])


# --- Elo snapshot from historical df (up to date-1) ---
def compute_elo_snapshot(
    df_hist: pd.DataFrame,
    base_elo: float = 1500.0,
    K: float = 20.0,
    home_adv: float = 60.0,
) -> Dict[str, float]:
    """Replay *df_hist* in date order and return the final Elo per team.

    Args:
        df_hist: Matches with columns ``date``, ``home``, ``away``, ``ftr``.
        base_elo: Rating assigned to a team on its first appearance.
        K: Update step size.
        home_adv: Rating points added to the home side when computing the
            expected score.

    Returns:
        Mapping of team name to rating after the last match.
    """
    elo = defaultdict(lambda: base_elo)
    # Stable sort keeps the input order of same-day matches deterministic.
    ordered = df_hist.sort_values("date", kind="stable")
    for home, away, result in zip(ordered["home"], ordered["away"], ordered["ftr"]):
        rating_home, rating_away = elo[home], elo[away]
        score_home = _HOME_SCORE.get(result)
        if score_home is None:
            # Missing or unrecognised result: leave both ratings untouched
            # rather than scoring the match as an away win.
            continue
        expected_home = 1.0 / (
            1.0 + 10 ** (-((rating_home + home_adv) - rating_away) / 400)
        )
        delta = K * (score_home - expected_home)
        # Zero-sum update: the away side loses what the home side gains.
        elo[home] = rating_home + delta
        elo[away] = rating_away - delta
    return dict(elo)


def _nan_rolling_features() -> Dict[str, float]:
    """Return a rolling-feature dict with every value set to NaN."""
    return {f"roll_{stat}": np.nan for stat in _ROLL_STATS}


def _role_rolling_features(
    games: pd.DataFrame,
    source_cols: Dict[str, str],
    points_by_result: Dict[str, float],
    window: int,
) -> Dict[str, float]:
    """Average the last *window* rows of *games* for one home/away role."""
    if games.empty:
        return _nan_rolling_features()
    recent = games.sort_values("date", kind="stable").tail(window)
    feats: Dict[str, float] = {}
    for stat, col in source_cols.items():
        if stat in _REQUIRED_STATS or col in recent.columns:
            feats[f"roll_{stat}"] = recent[col].mean()
        else:
            # Optional statistic that this data set does not provide.
            feats[f"roll_{stat}"] = np.nan
    # Results outside H/D/A map to NaN and are ignored by mean().
    feats["roll_points"] = recent["ftr"].map(points_by_result).astype(float).mean()
    return feats


# --- Build rolling features for a single team from history ---
def team_rolling_features(
    df_hist: pd.DataFrame,
    team_name: str,
    window: int = 6,
) -> Tuple[Optional[Dict[str, float]], Optional[Dict[str, float]]]:
    """Compute rolling means of a team's recent home and away matches.

    Args:
        df_hist: Historical matches (see the module docstring for columns).
        team_name: Team to summarise, spelled as in ``home``/``away``.
        window: Number of most recent matches per role to average.

    Returns:
        ``(feats_home, feats_away)``: two dicts keyed ``roll_<stat>``, built
        from the team's home matches and away matches respectively. A role
        without matches yields NaN for every statistic. ``(None, None)`` is
        returned when *df_hist* has no rows at all.
    """
    if len(df_hist) == 0:
        return None, None
    # Only the requested team's rows are needed; filter before any work.
    home_games = df_hist[df_hist["home"] == team_name]
    away_games = df_hist[df_hist["away"] == team_name]
    feats_home = _role_rolling_features(
        home_games, _HOME_SOURCE_COLS, _HOME_POINTS, window
    )
    feats_away = _role_rolling_features(
        away_games, _AWAY_SOURCE_COLS, _AWAY_POINTS, window
    )
    return feats_home, feats_away


# --- Build full feature vector for a fixture ---
def build_features_for_fixture(
    home_team: str,
    away_team: str,
    date_str: str,
    df_all: pd.DataFrame,
    X_cols: List[str],
    window: int = 6,
    odds_tuple: Optional[Tuple[float, float, float]] = None,
    feat_df_for_medians: Optional[pd.DataFrame] = None,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """Assemble the feature row of one fixture from earlier matches only.

    Args:
        home_team: Home side; normalised with ``clean_team`` before lookup.
        away_team: Away side; normalised with ``clean_team`` before lookup.
        date_str: Match date formatted ``YYYY-MM-DD``. Only rows of *df_all*
            dated strictly before it are used.
        df_all: Full match history.
        X_cols: Feature names defining the order of the output columns.
        window: Rolling window length passed to ``team_rolling_features``.
        odds_tuple: Optional decimal (home, draw, away) odds. When omitted,
            probabilities are approximated from the Elo difference and the
            historical draw rate.
        feat_df_for_medians: Optional frame whose column medians replace
            missing feature values; 0.0 is used when no median is available.

    Returns:
        ``(x, context)`` where ``x`` is a ``(1, len(X_cols))`` float array
        and ``context`` holds the outcome probabilities and both Elo ratings.

    Raises:
        ValueError: if no match precedes the date, if the odds are invalid,
            or if *date_str* does not match the expected format.
    """
    home = clean_team(home_team)
    away = clean_team(away_team)
    match_date = datetime.strptime(date_str, "%Y-%m-%d")

    df_hist = df_all[df_all["date"] < match_date]
    if df_hist.empty:
        raise ValueError(
            "No historical data found before match date. "
            "Try a later date or load more seasons."
        )

    # Elo snapshot
    elo_map = compute_elo_snapshot(df_hist)
    elo_home = float(elo_map.get(home, 1500.0))
    elo_away = float(elo_map.get(away, 1500.0))
    elo_diff = elo_home - elo_away

    # Rolling features by role: home side at home, away side away.
    home_role_feats, _ = team_rolling_features(df_hist, home, window)
    _, away_role_feats = team_rolling_features(df_hist, away, window)
    if home_role_feats is None:
        home_role_feats = _nan_rolling_features()
    if away_role_feats is None:
        away_role_feats = _nan_rolling_features()

    # Odds -> probs
    if odds_tuple is not None:
        probs = implied_from_odds(odds_tuple)
        if probs is None:
            raise ValueError("Invalid odds provided. Use decimal odds > 1.0")
        p_home, p_draw, p_away = probs
    else:
        # Proxy from Elo + average draw rate
        draw_rate = float((df_hist["ftr"] == "D").mean())
        if np.isnan(draw_rate) or draw_rate <= 0:
            draw_rate = 0.25
        # NOTE(review): unlike compute_elo_snapshot this expectation applies
        # no home advantage; kept as-is so feature values do not shift.
        p_home_nodraw = 1.0 / (1.0 + 10 ** (-elo_diff / 400.0))
        p_home = p_home_nodraw * (1.0 - draw_rate)
        p_away = (1.0 - p_home_nodraw) * (1.0 - draw_rate)
        p_draw = draw_rate
        total = p_home + p_draw + p_away
        p_home, p_draw, p_away = p_home / total, p_draw / total, p_away / total

    # Assemble features in X_cols order
    feat_row: Dict[str, float] = {}
    for name, value in home_role_feats.items():
        feat_row[f"home_{name}"] = value
    for name, value in away_role_feats.items():
        feat_row[f"away_{name}"] = value
    feat_row["elo_home"] = elo_home
    feat_row["elo_away"] = elo_away
    feat_row["elo_diff"] = elo_diff
    feat_row["p_odds_H"] = float(p_home)
    feat_row["p_odds_D"] = float(p_draw)
    feat_row["p_odds_A"] = float(p_away)

    x = np.array(
        [feat_row.get(col, np.nan) for col in X_cols], dtype=float
    ).reshape(1, -1)

    # Impute only the entries that are actually missing.
    for j in np.flatnonzero(np.isnan(x[0])):
        col = X_cols[j]
        fill = 0.0
        if feat_df_for_medians is not None and col in feat_df_for_medians.columns:
            median = float(feat_df_for_medians[col].median())
            # An all-NaN reference column has a NaN median; keep 0.0 then.
            if not np.isnan(median):
                fill = median
        x[0, j] = fill

    context = {
        "p_odds_H": p_home,
        "p_odds_D": p_draw,
        "p_odds_A": p_away,
        "elo_home": elo_home,
        "elo_away": elo_away,
    }
    return x, context