| """ | |
| FastAPI Serverless API for Cookie Classification | |
| Deploy this to Hugging Face Spaces for FREE serverless inference! | |
| """ | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel | |
| from typing import List, Optional | |
| from huggingface_hub import hf_hub_download | |
| import joblib | |
| import numpy as np | |
| import re | |
| import pandas as pd | |
| from scipy.sparse import hstack, csr_matrix | |
| import os | |
# Initialize FastAPI
app = FastAPI(
    title="Cookie Classifier API",
    description="Classify web cookies into privacy categories: Strictly Necessary, Functionality, Analytics, Advertising/Tracking",
    version="1.0.0"
)

# Enable CORS for frontend access
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, specify your frontend domain
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Class mapping
CLASS_NAMES = {
    0: "Strictly Necessary",
    1: "Functionality",
    2: "Analytics",
    3: "Advertising/Tracking"
}

# Tracker tokens
TRACKER_TOKENS = {
    "ga", "gid", "utm", "ad", "ads", "pixel", "trk", "track", "fbp", "fbc",
    "gclid", "sess", "session", "id", "uuid", "cid", "cmp", "campaign",
    "click", "impress"
}

# Global model storage (populated at startup)
model = None
tfidf_word = None
tfidf_char = None

def extract_name_features(s: str):
    """Extract engineered features from a cookie name"""
    if not isinstance(s, str):
        s = ""
    lower = s.lower()
    L = len(s)
    digits = sum(ch.isdigit() for ch in s)
    alphas = sum(ch.isalpha() for ch in s)
    underscores = lower.count("_")
    dashes = lower.count("-")
    dots = lower.count(".")
    prefix3 = lower[:3] if L >= 3 else lower
    suffix3 = lower[-3:] if L >= 3 else lower
    tokens = re.split(r"[^a-z0-9]+", lower)
    tokens = [t for t in tokens if t]
    uniq_tokens = len(set(tokens))
    token_len_mean = np.mean([len(t) for t in tokens]) if tokens else 0.0
    has_tracker = int(any(t in TRACKER_TOKENS for t in tokens))
    camel = int(bool(re.search(r"[a-z][A-Z]", s)))
    snake = int("_" in s)
    has_hex = int(bool(re.search(r"\b[0-9a-f]{8,}\b", lower)))
    return {
        "len": L, "digits": digits, "alphas": alphas, "underscores": underscores,
        "dashes": dashes, "dots": dots, "prefix3": prefix3, "suffix3": suffix3,
        "uniq_tokens": uniq_tokens, "token_len_mean": float(token_len_mean),
        "has_tracker_token": has_tracker, "camelCase": camel, "snake_case": snake,
        "has_hex": has_hex
    }
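
# Illustrative example (not part of the pipeline): for s = "_ga_XH12" this
# returns len=8, digits=2, alphas=4, underscores=2, prefix3="_ga",
# suffix3="h12", uniq_tokens=2, token_len_mean=3.0, has_tracker_token=1
# (the "ga" token matches TRACKER_TOKENS), camelCase=0, snake_case=1, has_hex=0.
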
def build_name_features(series):
    """Build name features DataFrame"""
    X = pd.DataFrame([extract_name_features(x) for x in series.fillna("")])
    for col in ["prefix3", "suffix3"]:
        top = X[col].value_counts().head(30).index
        X[col] = X[col].where(X[col].isin(top), "__other__")
    X = pd.get_dummies(X, columns=["prefix3", "suffix3"], drop_first=True)
    return X
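
# NOTE: the top-30 prefix/suffix vocabulary and the resulting dummy columns are
# recomputed from the incoming batch rather than loaded from training, so the
# one-hot columns can differ from those seen at fit time; only the purely
# numeric engineered features are deterministic per cookie name.
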
def preprocess_cookie(cookie_name: str):
    """Complete preprocessing for a single cookie name"""
    # Delegates to the batch path with a one-element list (identical pipeline)
    return preprocess_cookies_batch([cookie_name])

def preprocess_cookies_batch(cookie_names: List[str]):
    """Complete preprocessing for multiple cookie names (vectorized)"""
    series = pd.Series(cookie_names)
    # TF-IDF features (word- and char-level, vectorized)
    Xw = tfidf_word.transform(series.fillna("").astype(str))
    Xc = tfidf_char.transform(series.fillna("").astype(str))
    Xtf = hstack([Xw, Xc])
    # Engineered name features (numeric columns only)
    Xname = build_name_features(series)
    Xname = Xname.select_dtypes(include=[np.number]).astype("float64")
    # Combine the sparse TF-IDF block with the dense name features
    X_combined = hstack([Xtf, csr_matrix(Xname.values)])
    return X_combined
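
# Quick local sanity check (illustrative sketch; assumes the fitted vectorizers
# have already been loaded into the globals above):
#   X = preprocess_cookies_batch(["_ga", "sessionid"])
#   X.shape  # (2, n_tfidf_word + n_tfidf_char + n_name_features)
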
@app.on_event("startup")
async def load_model():
    """Load model and vectorizers on startup"""
    global model, tfidf_word, tfidf_char
    try:
        print("🔄 Loading model from Hugging Face...")
        # Download model
        model_path = hf_hub_download(
            repo_id="aqibtahir/cookie-classifier-lr-tfidf",
            filename="LR_TFIDF+NAME.joblib"
        )
        model = joblib.load(model_path)
        print("✓ Model loaded")
        # Load vectorizers
        print("🔄 Loading vectorizers...")
        tfidf_word_path = hf_hub_download(
            repo_id="aqibtahir/cookie-classifier-lr-tfidf",
            filename="tfidf_word.joblib"
        )
        tfidf_char_path = hf_hub_download(
            repo_id="aqibtahir/cookie-classifier-lr-tfidf",
            filename="tfidf_char.joblib"
        )
        tfidf_word = joblib.load(tfidf_word_path)
        tfidf_char = joblib.load(tfidf_char_path)
        print("✓ Vectorizers loaded")
        print("🎉 API ready to serve predictions!")
    except Exception as e:
        print(f"❌ Error during startup: {e}")
        import traceback
        traceback.print_exc()
        raise

# Request/Response models
class CookieRequest(BaseModel):
    cookie_name: str

class BatchCookieRequest(BaseModel):
    cookie_names: List[str]

class PredictionResponse(BaseModel):
    cookie_name: str
    category: str
    class_id: int
    confidence: Optional[float] = None

@app.get("/")
async def root():
    """Health check and API info"""
    return {
        "status": "online",
        "model": "Cookie Classifier - Logistic Regression (TF-IDF + name features)",
        "categories": list(CLASS_NAMES.values()),
        "endpoints": {
            "predict": "/predict",
            "batch": "/predict/batch",
            "docs": "/docs"
        }
    }

@app.post("/predict", response_model=PredictionResponse)
async def predict(request: CookieRequest):
    """
    Predict the cookie category for a single cookie name

    Example:
    ```
    POST /predict
    {"cookie_name": "_ga"}
    ```
    """
    if model is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if tfidf_word is None or tfidf_char is None:
        raise HTTPException(
            status_code=503,
            detail="Vectorizers not available. Please upload tfidf_word.joblib and tfidf_char.joblib to the model repository"
        )
    try:
        # Preprocess and predict
        features = preprocess_cookie(request.cookie_name)
        prediction = model.predict(features)[0]
        class_id = int(prediction)

        # Get a confidence score if the model exposes decision scores
        confidence = None
        try:
            decision = model.decision_function(features)[0]
            # Softmax-normalize decision scores into pseudo-probabilities,
            # shifting by the max score for numerical stability
            decision = decision - decision.max()
            scores = np.exp(decision) / np.exp(decision).sum()
            confidence = float(scores[class_id])
        except Exception:
            pass

        return PredictionResponse(
            cookie_name=request.cookie_name,
            category=CLASS_NAMES[class_id],
            class_id=class_id,
            confidence=confidence
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

@app.post("/predict/batch")
async def predict_batch(request: BatchCookieRequest):
    """
    Predict categories for multiple cookie names (vectorized batch processing)

    Example:
    ```
    POST /predict/batch
    {"cookie_names": ["_ga", "sessionid", "utm_campaign"]}
    ```
    """
    if model is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if tfidf_word is None or tfidf_char is None:
        raise HTTPException(
            status_code=503,
            detail="Vectorizers not available"
        )
    if not request.cookie_names:
        return {"predictions": []}
    try:
        # Vectorized preprocessing (process all cookies at once)
        features = preprocess_cookies_batch(request.cookie_names)

        # Batch prediction (single model call for all cookies)
        predictions = model.predict(features)

        # Get confidence scores for all predictions at once
        try:
            decisions = model.decision_function(features)
            # Softmax-normalize decision scores into pseudo-probabilities,
            # shifting by the row max for numerical stability
            exp_scores = np.exp(decisions - decisions.max(axis=1, keepdims=True))
            probabilities = exp_scores / exp_scores.sum(axis=1, keepdims=True)
            confidences = [float(probabilities[i, pred]) for i, pred in enumerate(predictions)]
        except Exception:
            confidences = [None] * len(predictions)

        # Build results
        results = []
        for cookie_name, prediction, confidence in zip(request.cookie_names, predictions, confidences):
            class_id = int(prediction)
            results.append({
                "cookie_name": cookie_name,
                "category": CLASS_NAMES[class_id],
                "class_id": class_id,
                "confidence": confidence
            })
        return {"predictions": results}
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Batch prediction error: {str(e)}")

| if __name__ == "__main__": | |
| import uvicorn | |
| uvicorn.run(app, host="0.0.0.0", port=7860) | |
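
# Example client call (illustrative sketch; substitute your own Space URL):
#
#   import requests
#   resp = requests.post(
#       "https://<your-space>.hf.space/predict",
#       json={"cookie_name": "_ga"},
#   )
#   print(resp.json())
#   # -> {"cookie_name": "_ga", "category": "...", "class_id": ..., "confidence": ...}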