import io
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from unidecode import unidecode

# -----------------------------
# 1) Download EPL seasons
# -----------------------------
BASE_URL = "https://www.football-data.co.uk/mmz4281/{scode}/E0.csv"


def season_code(y1: int) -> str:
    """Return the four-digit season code used in the download URL.

    The code is the last two digits of the starting year followed by the
    last two digits of the next year, e.g. 1993 -> '9394', 2024 -> '2425'.
    """
    return f"{y1 % 100:02d}{(y1 + 1) % 100:02d}"


def try_read_csv_bytes(content: bytes) -> pd.DataFrame:
    """Parse raw CSV bytes into a DataFrame, trying several encodings.

    Returns an empty DataFrame when the bytes cannot be parsed with any of
    the encodings, so callers can test ``df.empty`` instead of catching.
    """
    # "latin-1" maps every byte to a code point, so it never fails to decode;
    # it acts as the catch-all after strict UTF-8. ("ISO-8859-1" is the same
    # codec under another name, so listing it as well adds nothing.)
    for enc in ("utf-8", "latin-1"):
        try:
            return pd.read_csv(io.BytesIO(content), encoding=enc)
        except (
            UnicodeDecodeError,
            pd.errors.ParserError,
            pd.errors.EmptyDataError,
        ):
            # Decoding/parsing problem with this encoding: try the next one.
            continue
    return pd.DataFrame()


def fetch_season(y1: int, verbose: bool = True) -> pd.DataFrame:
    """Download one season's CSV and return it as a DataFrame.

    Args:
        y1: Starting year of the season (e.g. 1993 for 1993-94).
        verbose: When true, print a line explaining why a season was skipped.

    Returns:
        The parsed season with three extra columns (``SeasonFirstYear``,
        ``Season``, ``SeasonCode``), or an empty DataFrame when the request
        fails, the response is not a usable file, or the CSV cannot be parsed.
    """
    scode = season_code(y1)
    label = f"{y1}-{(y1 + 1) % 100:02d}"
    url = BASE_URL.format(scode=scode)

    try:
        r = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # A timeout or connection error for one season should not abort the
        # whole multi-season download; treat it like any other skipped season.
        if verbose:
            print(f"[skip] {label} ({scode}) request failed: {exc}")
        return pd.DataFrame()

    # Non-200 responses and bodies shorter than 100 bytes are treated as
    # "season not available".
    if r.status_code != 200 or len(r.content) < 100:
        if verbose:
            print(f"[skip] {label} ({scode}) not available")
        return pd.DataFrame()

    df = try_read_csv_bytes(r.content)
    if df.empty:
        if verbose:
            print(f"[warn] parse error {scode}")
        return pd.DataFrame()

    df["SeasonFirstYear"] = y1
    df["Season"] = label
    df["SeasonCode"] = scode
    return df


def load_epl_data(
    start_y1: int = 1993,
    end_y1: int | None = None,
    verbose: bool = False,
) -> pd.DataFrame:
    """Download and concatenate EPL seasons into a single DataFrame.

    - start_y1: first season starting year (inclusive), e.g., 1993
    - end_y1: last season starting year (inclusive). Defaults to current
      year + 1 to include latest.
    - verbose: print per-season logs

    Seasons that cannot be fetched or parsed are skipped.

    Raises:
        RuntimeError: If no season at all could be loaded.
    """
    if end_y1 is None:
        end_y1 = datetime.now().year + 1

    frames = []
    for y in tqdm(range(start_y1, end_y1 + 1)):
        df = fetch_season(y, verbose=verbose)
        if not df.empty:
            frames.append(df)

    data_raw = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if data_raw.empty:
        raise RuntimeError(
            "No data downloaded. Try expanding start_y1 backward or check internet."
        )
    return data_raw


if __name__ == "__main__":
    # Example usage when running this file directly
    data_raw = load_epl_data(
        start_y1=1993, end_y1=datetime.now().year + 1, verbose=True
    )
    print("Seasons loaded:", sorted(data_raw["Season"].unique()))