Spaces:
Sleeping
Sleeping
| import io | |
| from collections import defaultdict | |
| from datetime import datetime | |
| import numpy as np | |
| import pandas as pd | |
| import requests | |
| from unidecode import unidecode | |
| from tqdm import tqdm | |
# -----------------------------
# 1) Download EPL seasons
# -----------------------------
# URL template for a single season's CSV file. The `{scode}` placeholder is
# filled in by fetch_season() with the value returned by season_code(),
# e.g. "9394" for the season starting in 1993.
# NOTE(review): "E0" is presumably the site's file name for the Premier
# League division -- confirm against the data provider's documentation.
BASE_URL = "https://www.football-data.co.uk/mmz4281/{scode}/E0.csv"
def season_code(y1):
    """Return the season code for the season that starts in year ``y1``.

    The code is the last two characters of the starting year followed by
    the zero-padded last two digits of the following year, for example
    1993 -> '9394', 1999 -> '9900', 2024 -> '2425'.
    """
    start_part = str(y1)[-2:]
    end_part = format((y1 + 1) % 100, "02d")
    return start_part + end_part
def try_read_csv_bytes(content):
    """Parse raw CSV bytes into a DataFrame, trying several encodings in turn.

    Each candidate encoding is attempted in order and the first successful
    parse is returned. Any exception raised by an attempt is swallowed and
    the next encoding is tried. If every attempt fails, an empty DataFrame
    is returned so the caller can detect the failure via ``.empty``.
    """
    # NOTE(review): "latin-1" and "ISO-8859-1" are aliases of the same Python
    # codec, so the third attempt repeats the second one.
    candidate_encodings = ("utf-8", "latin-1", "ISO-8859-1")
    for codec in candidate_encodings:
        try:
            parsed = pd.read_csv(io.BytesIO(content), encoding=codec)
        except Exception:
            continue
        return parsed
    return pd.DataFrame()
def fetch_season(y1, verbose=True):
    """Download one season's CSV and return it as a DataFrame.

    Args:
        y1: Starting year of the season (e.g. 1993 for the 1993-94 season).
        verbose: When true, print a line explaining why a season was skipped.

    Returns:
        The parsed season with three columns added -- ``SeasonFirstYear``
        (``y1``), ``Season`` (e.g. "1993-94") and ``SeasonCode`` (e.g.
        "9394") -- or an empty DataFrame when the season could not be
        downloaded or parsed.
    """
    scode = season_code(y1)
    url = BASE_URL.format(scode=scode)
    try:
        r = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Connection errors and timeouts are handled like an unavailable
        # season, so one failed request does not abort a multi-season load;
        # the caller raises its own error if nothing at all was downloaded.
        if verbose:
            print(f"[skip] {y1}-{(y1+1)%100:02d} ({scode}) request failed: {exc}")
        return pd.DataFrame()
    # A non-200 status or a body shorter than 100 bytes is treated as
    # "this season is not published".
    if r.status_code != 200 or len(r.content) < 100:
        if verbose:
            print(f"[skip] {y1}-{(y1+1)%100:02d} ({scode}) not available")
        return pd.DataFrame()
    df = try_read_csv_bytes(r.content)
    if df.empty:
        if verbose:
            print(f"[warn] parse error {scode}")
        return pd.DataFrame()
    # Tag every row with its season so concatenated seasons stay separable.
    df["SeasonFirstYear"] = y1
    df["Season"] = f"{y1}-{str(y1+1)[-2:]}"
    df["SeasonCode"] = scode
    return df
def load_epl_data(start_y1: int = 1993, end_y1: int | None = None, verbose: bool = False) -> pd.DataFrame:
    """Fetch every requested EPL season and stack them into one DataFrame.

    Args:
        start_y1: Starting year of the first season to fetch (inclusive),
            e.g. 1993.
        end_y1: Starting year of the last season to fetch (inclusive).
            When omitted, the current calendar year plus one is used.
        verbose: Forwarded to ``fetch_season`` to print per-season logs.

    Returns:
        All successfully fetched seasons concatenated with a fresh index.

    Raises:
        RuntimeError: If no season produced any data.
    """
    last_y1 = datetime.now().year + 1 if end_y1 is None else end_y1

    season_frames = []
    for year in tqdm(range(start_y1, last_y1 + 1)):
        season_df = fetch_season(year, verbose=verbose)
        if season_df.empty:
            # Unavailable or unparseable seasons are simply left out.
            continue
        season_frames.append(season_df)

    if not season_frames:
        raise RuntimeError("No data downloaded. Try expanding start_y1 backward or check internet.")
    return pd.concat(season_frames, ignore_index=True)
if __name__ == "__main__":
    # Demo run when the file is executed as a script: fetch every season
    # from 1993 through next calendar year and list the ones that loaded.
    data_raw = load_epl_data(
        start_y1=1993,
        end_y1=datetime.now().year + 1,
        verbose=True,
    )
    seasons_loaded = sorted(data_raw["Season"].unique())
    print("Seasons loaded:", seasons_loaded)