""" FastAPI Serverless API for Cookie Classification Deploy this to Hugging Face Spaces for FREE serverless inference! """ from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel from typing import List, Optional from huggingface_hub import hf_hub_download import joblib import numpy as np import re import pandas as pd from scipy.sparse import hstack, csr_matrix import os # Initialize FastAPI app = FastAPI( title="Cookie Classifier API", description="Classify web cookies into privacy categories: Strictly Necessary, Functionality, Analytics, Advertising/Tracking", version="1.0.0" ) # Enable CORS for frontend access app.add_middleware( CORSMiddleware, allow_origins=["*"], # In production, specify your frontend domain allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # Class mapping CLASS_NAMES = { 0: "Strictly Necessary", 1: "Functionality", 2: "Analytics", 3: "Advertising/Tracking" } # Tracker tokens TRACKER_TOKENS = { "ga", "gid", "utm", "ad", "ads", "pixel", "trk", "track", "fbp", "fbc", "gclid", "sess", "session", "id", "uuid", "cid", "cmp", "campaign", "click", "impress" } # Global model storage model = None tfidf_word = None tfidf_char = None def extract_name_features(s: str): """Extract engineered features from cookie name""" if not isinstance(s, str): s = "" lower = s.lower() L = len(s) digits = sum(ch.isdigit() for ch in s) alphas = sum(ch.isalpha() for ch in s) underscores = lower.count("_") dashes = lower.count("-") dots = lower.count(".") prefix3 = lower[:3] if L >= 3 else lower suffix3 = lower[-3:] if L >= 3 else lower tokens = re.split(r"[^a-z0-9]+", lower) tokens = [t for t in tokens if t] uniq_tokens = len(set(tokens)) token_len_mean = np.mean([len(t) for t in tokens]) if tokens else 0.0 has_tracker = int(any(t in TRACKER_TOKENS for t in tokens)) camel = int(bool(re.search(r"[a-z][A-Z]", s))) snake = int("_" in s) has_hex = int(bool(re.search(r"\b[0-9a-f]{8,}\b", lower))) return { "len": L, "digits": digits, "alphas": alphas, "underscores": underscores, "dashes": dashes, "dots": dots, "prefix3": prefix3, "suffix3": suffix3, "uniq_tokens": uniq_tokens, "token_len_mean": float(token_len_mean), "has_tracker_token": has_tracker, "camelCase": camel, "snake_case": snake, "has_hex": has_hex } def build_name_features(series): """Build name features DataFrame""" X = pd.DataFrame([extract_name_features(x) for x in series.fillna("")]) for col in ["prefix3", "suffix3"]: top = X[col].value_counts().head(30).index X[col] = X[col].where(X[col].isin(top), "__other__") X = pd.get_dummies(X, columns=["prefix3", "suffix3"], drop_first=True) return X def preprocess_cookie(cookie_name: str): """Complete preprocessing for a single cookie name""" series = pd.Series([cookie_name]) # TF-IDF features Xw = tfidf_word.transform(series.fillna("").astype(str)) Xc = tfidf_char.transform(series.fillna("").astype(str)) Xtf = hstack([Xw, Xc]) # Name features Xname = build_name_features(series) Xname = Xname.select_dtypes(include=[np.number]).astype("float64") # Combine X_combined = hstack([Xtf, csr_matrix(Xname.values)]) return X_combined def preprocess_cookies_batch(cookie_names: List[str]): """Complete preprocessing for multiple cookie names (vectorized)""" series = pd.Series(cookie_names) # TF-IDF features (vectorized) Xw = tfidf_word.transform(series.fillna("").astype(str)) Xc = tfidf_char.transform(series.fillna("").astype(str)) Xtf = hstack([Xw, Xc]) # Name features (vectorized) Xname = build_name_features(series) 


def preprocess_cookies_batch(cookie_names: List[str]):
    """Complete preprocessing for multiple cookie names (vectorized)."""
    series = pd.Series(cookie_names)

    # TF-IDF features (vectorized over the whole batch)
    Xw = tfidf_word.transform(series.fillna("").astype(str))
    Xc = tfidf_char.transform(series.fillna("").astype(str))
    Xtf = hstack([Xw, Xc])

    # Name features (vectorized)
    Xname = build_name_features(series)
    Xname = Xname.select_dtypes(include=[np.number]).astype("float64")

    # Combine
    X_combined = hstack([Xtf, csr_matrix(Xname.values)])
    return X_combined


@app.on_event("startup")
async def load_model():
    """Load model and vectorizers on startup."""
    global model, tfidf_word, tfidf_char
    try:
        print("🔄 Loading model from Hugging Face...")

        # Download model
        model_path = hf_hub_download(
            repo_id="aqibtahir/cookie-classifier-lr-tfidf",
            filename="LR_TFIDF+NAME.joblib",
        )
        model = joblib.load(model_path)
        print("✓ Model loaded")

        # Load vectorizers
        print("🔄 Loading vectorizers...")
        tfidf_word_path = hf_hub_download(
            repo_id="aqibtahir/cookie-classifier-lr-tfidf",
            filename="tfidf_word.joblib",
        )
        tfidf_char_path = hf_hub_download(
            repo_id="aqibtahir/cookie-classifier-lr-tfidf",
            filename="tfidf_char.joblib",
        )
        tfidf_word = joblib.load(tfidf_word_path)
        tfidf_char = joblib.load(tfidf_char_path)
        print("✓ Vectorizers loaded")
        print("🎉 API ready to serve predictions!")
    except Exception as e:
        print(f"❌ Error during startup: {e}")
        import traceback
        traceback.print_exc()
        raise


# Request/Response models
class CookieRequest(BaseModel):
    cookie_name: str


class BatchCookieRequest(BaseModel):
    cookie_names: List[str]


class PredictionResponse(BaseModel):
    cookie_name: str
    category: str
    class_id: int
    confidence: Optional[float] = None


@app.get("/")
async def root():
    """Health check and API info."""
    return {
        "status": "online",
        "model": "Cookie Classifier - Logistic Regression (TF-IDF + name features)",
        "categories": list(CLASS_NAMES.values()),
        "endpoints": {
            "predict": "/predict",
            "batch": "/predict/batch",
            "docs": "/docs",
        },
    }


@app.post("/predict", response_model=PredictionResponse)
async def predict(request: CookieRequest):
    """
    Predict the cookie category for a single cookie name.

    Example:
    ```
    POST /predict
    {"cookie_name": "_ga"}
    ```
    """
    if model is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if tfidf_word is None or tfidf_char is None:
        raise HTTPException(
            status_code=503,
            detail=(
                "Vectorizers not available. Please upload tfidf_word.joblib "
                "and tfidf_char.joblib to the model repository"
            ),
        )
    try:
        # Preprocess and predict
        features = preprocess_cookie(request.cookie_name)
        prediction = model.predict(features)[0]
        class_id = int(prediction)

        # Derive a confidence score if the model exposes decision_function
        confidence = None
        try:
            decision = model.decision_function(features)[0]
            # Numerically stable softmax over the decision scores
            # (pseudo-probabilities, not calibrated probabilities)
            exp_scores = np.exp(decision - decision.max())
            scores = exp_scores / exp_scores.sum()
            confidence = float(scores[class_id])
        except Exception:
            pass

        return PredictionResponse(
            cookie_name=request.cookie_name,
            category=CLASS_NAMES[class_id],
            class_id=class_id,
            confidence=confidence,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")
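

# Example call against /predict (a sketch, assuming the server is running
# locally on port 7860 as in the __main__ block at the bottom of this file):
#
#   curl -X POST http://localhost:7860/predict \
#        -H "Content-Type: application/json" \
#        -d '{"cookie_name": "_ga"}'
#
# The response is shaped like PredictionResponse, e.g.
#   {"cookie_name": "_ga", "category": "<one of CLASS_NAMES>",
#    "class_id": <0-3>, "confidence": <float or null>}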


@app.post("/predict/batch")
async def predict_batch(request: BatchCookieRequest):
    """
    Predict categories for multiple cookie names (vectorized batch processing).

    Example:
    ```
    POST /predict/batch
    {"cookie_names": ["_ga", "sessionid", "utm_campaign"]}
    ```
    """
    if model is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if tfidf_word is None or tfidf_char is None:
        raise HTTPException(status_code=503, detail="Vectorizers not available")
    if not request.cookie_names:
        return {"predictions": []}

    try:
        # Vectorized preprocessing (process all cookies at once)
        features = preprocess_cookies_batch(request.cookie_names)

        # Batch prediction (single model call for all cookies)
        predictions = model.predict(features)

        # Derive confidence scores for all predictions at once
        try:
            decisions = model.decision_function(features)
            # Numerically stable row-wise softmax over the decision scores
            # (pseudo-probabilities, not calibrated probabilities)
            exp_scores = np.exp(decisions - decisions.max(axis=1, keepdims=True))
            probabilities = exp_scores / exp_scores.sum(axis=1, keepdims=True)
            confidences = [
                float(probabilities[i, pred]) for i, pred in enumerate(predictions)
            ]
        except Exception:
            confidences = [None] * len(predictions)

        # Build results
        results = []
        for cookie_name, prediction, confidence in zip(
            request.cookie_names, predictions, confidences
        ):
            class_id = int(prediction)
            results.append({
                "cookie_name": cookie_name,
                "category": CLASS_NAMES[class_id],
                "class_id": class_id,
                "confidence": confidence,
            })
        return {"predictions": results}
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Batch prediction error: {str(e)}")


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
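

# Example batch client (a sketch, assuming the `requests` package is
# installed and the server is running locally on port 7860; the payload
# shape matches BatchCookieRequest above):
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/predict/batch",
#       json={"cookie_names": ["_ga", "sessionid", "utm_campaign"]},
#   )
#   for item in resp.json()["predictions"]:
#       print(item["cookie_name"], "->", item["category"])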