# Match-Prediction / inference_utils.py
# Teera's picture
# Upload 8 files
# 46b7fa3 verified
import numpy as np
import pandas as pd
from collections import defaultdict
from datetime import datetime
from typing import Optional, Tuple, Dict, List
from unidecode import unidecode
# --- Team name cleaner (shared) ---
def clean_team(s: str) -> str:
    """Return a normalized team name.

    The name is transliterated with ``unidecode`` and its whitespace is
    normalized: leading/trailing whitespace is dropped and every internal
    run of whitespace becomes a single space.

    Values that ``pd.isna`` reports as missing are returned unchanged.
    """
    if pd.isna(s):
        return s
    transliterated = unidecode(str(s))
    # split() with no arguments discards leading/trailing whitespace and
    # treats any run of whitespace as one separator.
    tokens = transliterated.split()
    return " ".join(tokens)
# --- Odds -> implied probabilities (normalized) ---
def implied_from_odds(odds_tuple: Tuple[float, float, float]) -> Optional[Tuple[float, float, float]]:
    """Convert decimal odds into implied probabilities that sum to 1.

    Args:
        odds_tuple: Three decimal odds in (home, draw, away) order.

    Returns:
        ``(p_home, p_draw, p_away)`` obtained by inverting each quote and
        dividing by the sum of the inverses, or ``None`` when the odds are
        unusable: any quote is NaN or not strictly greater than 1.0, or the
        inverses sum to zero.
    """
    h, d, a = odds_tuple
    # `o > 1.0` is False for NaN, so missing quotes are rejected here together
    # with odds <= 1.0 (a plain min() comparison lets NaN slip through).
    if not all(o > 1.0 for o in (h, d, a)):
        return None
    inv = np.array([1.0 / h, 1.0 / d, 1.0 / a], dtype=float)
    total = inv.sum()
    # Only reachable when every quote is infinite (all inverses are 0).
    if total <= 0:
        return None
    p = inv / total
    return float(p[0]), float(p[1]), float(p[2])
# --- Elo snapshot from historical df (up to date-1) ---
def compute_elo_snapshot(df_hist: pd.DataFrame, base_elo: float = 1500.0, K: float = 20.0, home_adv: float = 60.0) -> Dict[str, float]:
    """Replay the matches in ``df_hist`` in date order and return final Elo ratings.

    Args:
        df_hist: Match history with ``date``, ``home``, ``away`` and ``ftr``
            columns.
        base_elo: Rating given to a team the first time it appears.
        K: Update step size.
        home_adv: Rating points added to the home side when computing the
            expected result.

    Returns:
        A plain dict mapping each team seen in ``df_hist`` to its rating after
        the last match.

    Note:
        ``ftr == "H"`` scores 1 for the home side and ``"D"`` scores 0.5; any
        other value is scored as an away win.
    """
    ratings = defaultdict(lambda: base_elo)
    home_score_by_result = {"H": 1.0, "D": 0.5}
    ordered = df_hist.sort_values("date").reset_index(drop=True)

    for match in ordered.to_dict("records"):
        home_team = match["home"]
        away_team = match["away"]
        rating_home = ratings[home_team]
        rating_away = ratings[away_team]

        # Expected home score from the rating gap, tilted by home advantage.
        gap = (rating_home + home_adv) - rating_away
        expected_home = 1.0 / (1.0 + 10 ** (-gap / 400))

        # Anything that is not "H" or "D" counts as a home loss.
        actual_home = home_score_by_result.get(match["ftr"], 0.0)

        ratings[home_team] = rating_home + K * (actual_home - expected_home)
        ratings[away_team] = rating_away + K * ((1.0 - actual_home) - (1.0 - expected_home))

    return dict(ratings)
# --- Build rolling features for a single team from history ---
# Optional per-match stat columns behind each rolled feature:
# (feature suffix, source column when the team is home, source column when away).
_OPTIONAL_ROLL_SOURCES = (
    ("shots_f", "hs", "as"),
    ("shots_a", "as", "hs"),
    ("sot_f", "hst", "ast"),
    ("sot_a", "ast", "hst"),
    ("corn_f", "hc", "ac"),
    ("corn_a", "ac", "hc"),
    ("y_f", "hy", "ay"),
    ("r_f", "hr", "ar"),
)

# Output keys, in the order they are emitted.
_ROLL_KEYS = (
    ("roll_gf", "roll_ga")
    + tuple(f"roll_{stat}" for stat, _, _ in _OPTIONAL_ROLL_SOURCES)
    + ("roll_points",)
)


def _role_rolling_means(matches: pd.DataFrame, as_home: bool, window: int) -> Dict[str, float]:
    """Average one team's last ``window`` matches in a single role (home or away)."""
    if len(matches) == 0:
        return dict.fromkeys(_ROLL_KEYS, np.nan)

    # Stable sort keeps the input order for matches sharing a date.
    recent = matches.sort_values("date", kind="stable").tail(window)

    if as_home:
        gf_col, ga_col, win_code = "fthg", "ftag", "H"
    else:
        gf_col, ga_col, win_code = "ftag", "fthg", "A"

    feats = {"roll_gf": recent[gf_col].mean(), "roll_ga": recent[ga_col].mean()}
    for stat, home_col, away_col in _OPTIONAL_ROLL_SOURCES:
        src = home_col if as_home else away_col
        # Stat columns are optional; a missing column yields NaN.
        feats[f"roll_{stat}"] = recent[src].mean() if src in recent.columns else np.nan

    # 3 points for a win in this role, 1 for a draw, 0 otherwise.
    result = recent["ftr"]
    points = pd.Series(np.where(result == win_code, 3, np.where(result == "D", 1, 0)))
    feats["roll_points"] = points.mean()
    return feats


def team_rolling_features(
    df_hist: pd.DataFrame, team_name: str, window: int = 6
) -> Tuple[Optional[Dict[str, float]], Optional[Dict[str, float]]]:
    """Compute rolling-average features for one team, split by home/away role.

    Only the matches involving ``team_name`` are examined. For each role the
    last ``window`` matches (by ``date``) played in that role are averaged.

    Args:
        df_hist: Match history. Must contain ``date``, ``home``, ``away``,
            ``fthg``, ``ftag`` and ``ftr``. The stat columns ``hs``/``as``,
            ``hst``/``ast``, ``hc``/``ac``, ``hy``/``ay`` and ``hr``/``ar``
            are optional.
        team_name: Team to compute features for; compared with ``==`` against
            the ``home`` and ``away`` columns.
        window: Number of most recent matches per role to average.

    Returns:
        ``(feats_home, feats_away)``: two dicts keyed ``roll_gf``, ``roll_ga``,
        ``roll_shots_f``, ``roll_shots_a``, ``roll_sot_f``, ``roll_sot_a``,
        ``roll_corn_f``, ``roll_corn_a``, ``roll_y_f``, ``roll_r_f`` and
        ``roll_points``. Every value is NaN for a role the team has never
        played in. Returns ``(None, None)`` when ``df_hist`` has no rows.
    """
    if len(df_hist) == 0:
        return None, None

    home_matches = df_hist[df_hist["home"] == team_name]
    away_matches = df_hist[df_hist["away"] == team_name]
    feats_home = _role_rolling_means(home_matches, True, window)
    feats_away = _role_rolling_means(away_matches, False, window)
    return feats_home, feats_away
# --- Build full feature vector for a fixture ---
def build_features_for_fixture(
    home_team: str,
    away_team: str,
    date_str: str,
    df_all: pd.DataFrame,
    X_cols: List[str],
    window: int = 6,
    odds_tuple: Optional[Tuple[float, float, float]] = None,
    feat_df_for_medians: Optional[pd.DataFrame] = None,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """Assemble the single-row feature vector for one fixture.

    Args:
        home_team: Home team name (normalized with ``clean_team``).
        away_team: Away team name (normalized with ``clean_team``).
        date_str: Match date formatted as ``YYYY-MM-DD``.
        df_all: Match history; only rows whose ``date`` is strictly before
            the match date are used.
        X_cols: Feature names, in the order the output vector must follow.
        window: Number of recent matches per role used for rolling averages.
        odds_tuple: Optional decimal odds ``(home, draw, away)``. When omitted,
            outcome probabilities are approximated from the Elo difference and
            the historical draw rate.
        feat_df_for_medians: Optional frame whose column medians replace
            missing feature values. Columns that are absent, or whose median
            is itself NaN, fall back to 0.0.

    Returns:
        ``(x, context)`` where ``x`` is a float array of shape
        ``(1, len(X_cols))`` with no NaN left in it, and ``context`` holds
        ``p_odds_H``, ``p_odds_D``, ``p_odds_A``, ``elo_home`` and ``elo_away``.

    Raises:
        ValueError: If ``date_str`` does not match ``%Y-%m-%d``, if no history
            exists before the match date, or if ``odds_tuple`` is rejected by
            ``implied_from_odds``.
    """
    home = clean_team(home_team)
    away = clean_team(away_team)
    match_date = datetime.strptime(date_str, "%Y-%m-%d")

    # Strictly earlier rows only, so nothing dated on the match day is used.
    df_hist = df_all[df_all["date"] < match_date].copy()
    if df_hist.empty:
        raise ValueError("No historical data found before match date. Try a later date or load more seasons.")

    # Elo snapshot
    elo_map = compute_elo_snapshot(df_hist)
    elo_home = float(elo_map.get(home, 1500.0))
    elo_away = float(elo_map.get(away, 1500.0))
    elo_diff = elo_home - elo_away

    # Rolling features by role: the home side's home form, the away side's away form.
    home_role_feats, _ = team_rolling_features(df_hist, home, window)
    _, away_role_feats = team_rolling_features(df_hist, away, window)
    roll_cols = ["gf", "ga", "shots_f", "shots_a", "sot_f", "sot_a", "corn_f", "corn_a", "y_f", "r_f", "points"]
    if home_role_feats is None:
        home_role_feats = {f"roll_{c}": np.nan for c in roll_cols}
    if away_role_feats is None:
        away_role_feats = {f"roll_{c}": np.nan for c in roll_cols}

    # Odds -> probs
    if odds_tuple is not None:
        probs = implied_from_odds(odds_tuple)
        if probs is None:
            raise ValueError("Invalid odds provided. Use decimal odds > 1.0")
        pH, pD, pA = probs
    else:
        # Proxy from Elo + average draw rate
        draws = (df_hist["ftr"] == "D").mean()
        draws = float(draws) if not np.isnan(draws) and draws > 0 else 0.25
        elo_scale = 400.0
        # Uses the raw rating difference; no home-advantage offset is added here.
        ph_nodraw = 1.0 / (1.0 + 10 ** (-(elo_diff) / elo_scale))
        pa_nodraw = 1.0 - ph_nodraw
        pH = ph_nodraw * (1.0 - draws)
        pA = pa_nodraw * (1.0 - draws)
        pD = draws
        total = pH + pD + pA
        pH, pD, pA = pH / total, pD / total, pA / total

    # Assemble features in X_cols order
    feat_row: Dict[str, float] = {}
    for name, value in home_role_feats.items():
        feat_row[f"home_{name}"] = value
    for name, value in away_role_feats.items():
        feat_row[f"away_{name}"] = value
    feat_row["elo_home"] = elo_home
    feat_row["elo_away"] = elo_away
    feat_row["elo_diff"] = elo_diff
    feat_row["p_odds_H"] = float(pH)
    feat_row["p_odds_D"] = float(pD)
    feat_row["p_odds_A"] = float(pA)

    x_vals = [feat_row.get(c, np.nan) for c in X_cols]
    x = np.array(x_vals, dtype=float).reshape(1, -1)

    # Impute only the entries that are actually missing.
    for j, c in enumerate(X_cols):
        if not np.isnan(x[0, j]):
            continue
        fill = 0.0  # fallback zeros
        if feat_df_for_medians is not None and c in feat_df_for_medians.columns:
            med = float(feat_df_for_medians[c].median())
            # An all-NaN reference column has a NaN median; keep the zero
            # fallback rather than writing NaN back into the vector.
            if not np.isnan(med):
                fill = med
        x[0, j] = fill

    context = {
        "p_odds_H": pH,
        "p_odds_D": pD,
        "p_odds_A": pA,
        "elo_home": elo_home,
        "elo_away": elo_away,
    }
    return x, context