aqibtahir committed on
Commit
12c511c
·
verified ·
1 Parent(s): 05ef637

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +264 -0
app.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI Serverless API for Cookie Classification
3
+ Deploy this to Hugging Face Spaces for FREE serverless inference!
4
+ """
5
+
6
+ from fastapi import FastAPI, HTTPException
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from pydantic import BaseModel
9
+ from typing import List, Optional
10
+ from huggingface_hub import hf_hub_download
11
+ import joblib
12
+ import numpy as np
13
+ import re
14
+ import pandas as pd
15
+ from scipy.sparse import hstack, csr_matrix
16
+ import os
17
+
18
# Initialize FastAPI
app = FastAPI(
    title="Cookie Classifier API",
    description="Classify web cookies into privacy categories: Strictly Necessary, Functionality, Analytics, Advertising/Tracking",
    version="1.0.0",
)

# Enable CORS for frontend access.
#
# Credentials are deliberately disabled: FastAPI's CORS documentation states
# that allow_origins / allow_methods / allow_headers must not be ["*"] when
# allow_credentials is True, and combining them effectively lets any website
# issue credentialed requests. None of the endpoints in this file read
# cookies or authorization headers, so wildcard access without credentials
# is sufficient.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, specify your frontend domain
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
33
+
34
# Maps the integer class id produced by the model to its display label.
# Ids are assigned positionally, starting at 0.
CLASS_NAMES = dict(
    enumerate(
        (
            "Strictly Necessary",
            "Functionality",
            "Analytics",
            "Advertising/Tracking",
        )
    )
)

# Lower-case tokens that, when present in a cookie name, set the
# "has_tracker_token" feature.
TRACKER_TOKENS = {
    "ga",
    "gid",
    "utm",
    "ad",
    "ads",
    "pixel",
    "trk",
    "track",
    "fbp",
    "fbc",
    "gclid",
    "sess",
    "session",
    "id",
    "uuid",
    "cid",
    "cmp",
    "campaign",
    "click",
    "impress",
}

# Module-level slots for the classifier and the two TF-IDF vectorizers.
# They stay None until the startup hook assigns them.
model = tfidf_word = tfidf_char = None
53
+
54
def extract_name_features(s: str):
    """Compute hand-engineered features for one cookie name.

    Any non-string input is treated as the empty string.

    Returns a dict whose keys, in order, are: len, digits, alphas,
    underscores, dashes, dots, prefix3, suffix3, uniq_tokens,
    token_len_mean, has_tracker_token, camelCase, snake_case, has_hex.
    ``prefix3`` / ``suffix3`` are strings; every other value is numeric.
    """
    name = s if isinstance(s, str) else ""
    lowered = name.lower()
    length = len(name)

    # Names shorter than three characters are used whole as prefix/suffix.
    if length >= 3:
        head, tail = lowered[:3], lowered[-3:]
    else:
        head = tail = lowered

    # Tokens are the non-empty runs of [a-z0-9] in the lower-cased name.
    tokens = [tok for tok in re.split(r"[^a-z0-9]+", lowered) if tok]
    if tokens:
        mean_token_len = np.mean([len(tok) for tok in tokens])
    else:
        mean_token_len = 0.0

    features = {}
    features["len"] = length
    features["digits"] = sum(1 for ch in name if ch.isdigit())
    features["alphas"] = sum(1 for ch in name if ch.isalpha())
    features["underscores"] = lowered.count("_")
    features["dashes"] = lowered.count("-")
    features["dots"] = lowered.count(".")
    features["prefix3"] = head
    features["suffix3"] = tail
    features["uniq_tokens"] = len(set(tokens))
    features["token_len_mean"] = float(mean_token_len)
    # 1 when at least one token is a known tracker token.
    features["has_tracker_token"] = int(any(tok in TRACKER_TOKENS for tok in tokens))
    # Case-sensitive check on the original name: lower-case letter followed
    # directly by an upper-case letter.
    features["camelCase"] = 1 if re.search(r"[a-z][A-Z]", name) else 0
    features["snake_case"] = 1 if "_" in name else 0
    # Word-bounded run of eight or more hex digits in the lower-cased name.
    features["has_hex"] = 1 if re.search(r"\b[0-9a-f]{8,}\b", lowered) else 0
    return features
84
+
85
def build_name_features(series):
    """Turn a Series of cookie names into a DataFrame of name features.

    Missing values are replaced with "" before feature extraction. For the
    ``prefix3`` and ``suffix3`` columns, only the 30 most frequent values in
    *this* series are kept; all others become "__other__". Both columns are
    then one-hot encoded with the first level dropped.

    NOTE(review): the kept categories are derived from the input itself, so
    a single-row series has exactly one level per column and, with
    ``drop_first=True``, yields no one-hot columns at all. Confirm this
    matches the feature layout the model was trained with.
    """
    categorical = ["prefix3", "suffix3"]

    rows = [extract_name_features(name) for name in series.fillna("")]
    frame = pd.DataFrame(rows)

    for column in categorical:
        frequent = frame[column].value_counts().head(30).index
        is_frequent = frame[column].isin(frequent)
        frame[column] = frame[column].where(is_frequent, "__other__")

    return pd.get_dummies(frame, columns=categorical, drop_first=True)
93
+
94
def preprocess_cookie(cookie_name: str):
    """Build the sparse feature row for a single cookie name.

    The row is the horizontal concatenation of, in order: word-level TF-IDF
    features, char-level TF-IDF features, and the numeric name features.
    Relies on the module-level ``tfidf_word`` / ``tfidf_char`` vectorizers
    being loaded.
    """
    names = pd.Series([cookie_name])
    text = names.fillna("").astype(str)

    # TF-IDF blocks (word first, then char).
    word_block = tfidf_word.transform(text)
    char_block = tfidf_char.transform(text)
    tfidf_block = hstack([word_block, char_block])

    # Engineered name features; only numeric-dtype columns are kept.
    # NOTE(review): whether one-hot columns survive this filter depends on
    # the dtype get_dummies produced — confirm the resulting width matches
    # what the model expects.
    name_frame = build_name_features(names)
    numeric_frame = name_frame.select_dtypes(include=[np.number]).astype("float64")

    return hstack([tfidf_block, csr_matrix(numeric_frame.values)])
110
+
111
@app.on_event("startup")
async def load_model():
    """Fetch the classifier and TF-IDF vectorizers when the app starts.

    The classifier is required: any error while downloading or loading it
    propagates. The two vectorizers are best-effort: on failure a warning is
    printed and they are left unset, in which case the prediction endpoints
    answer 503.
    """
    global model, tfidf_word, tfidf_char

    repo = "aqibtahir/cookie-classifier-lr-tfidf"

    def fetch(filename):
        # Returns the local path of the file downloaded from the model repo.
        return hf_hub_download(repo_id=repo, filename=filename)

    print("Loading model from Hugging Face...")

    # NOTE: joblib.load unpickles the file, so the repository must be trusted.
    model = joblib.load(fetch("LR_TFIDF+NAME.joblib"))

    # Vectorizers are expected in the same repository; both are downloaded
    # before either is loaded.
    try:
        word_path = fetch("tfidf_word.joblib")
        char_path = fetch("tfidf_char.joblib")
        tfidf_word = joblib.load(word_path)
        tfidf_char = joblib.load(char_path)
        print("✓ Model and vectorizers loaded successfully!")
    except Exception as e:
        print(f"⚠️ Warning: Could not load vectorizers: {e}")
        print("API will work with limited functionality")
141
+
142
+ # Request/Response models
143
class CookieRequest(BaseModel):
    # Request body for the single-cookie prediction endpoint.
    # cookie_name: the cookie name to classify (required).
    cookie_name: str
145
+
146
class BatchCookieRequest(BaseModel):
    # Request body for the batch prediction endpoint.
    # cookie_names: the cookie names to classify, one prediction per entry.
    cookie_names: List[str]
148
+
149
class PredictionResponse(BaseModel):
    # Result of classifying one cookie name.
    # cookie_name: the name that was classified.
    cookie_name: str
    # category: display label looked up from CLASS_NAMES.
    category: str
    # class_id: integer class id predicted by the model.
    class_id: int
    # confidence: pseudo-probability of the predicted class, or None when
    # it could not be computed.
    confidence: Optional[float] = None
154
+
155
@app.get("/")
async def root():
    """Health check and API info"""
    # Static description of the service; it does not report whether the
    # model or vectorizers actually loaded.
    endpoints = {
        "predict": "/predict",
        "batch": "/predict/batch",
        "docs": "/docs",
    }
    category_labels = list(CLASS_NAMES.values())

    # NOTE(review): "Linear Regression" is probably meant to be
    # "Logistic Regression" for a classifier — confirm before changing,
    # since clients may read this string.
    return {
        "status": "online",
        "model": "Cookie Classifier - Linear Regression",
        "categories": category_labels,
        "endpoints": endpoints,
    }
168
+
169
def _ensure_ready(vectorizer_detail: str) -> None:
    """Raise HTTP 503 unless the model and both vectorizers are loaded.

    ``vectorizer_detail`` is the message returned when a vectorizer is
    missing, so each endpoint keeps its own wording.
    """
    # Identity checks rather than truthiness: a loaded object that defines
    # __len__ or __bool__ could be falsy even though it is present.
    if model is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if tfidf_word is None or tfidf_char is None:
        raise HTTPException(status_code=503, detail=vectorizer_detail)


def _softmax_confidence(decision, class_id: int) -> Optional[float]:
    """Return the softmax pseudo-probability of ``class_id``.

    ``decision`` is one row of decision scores. Returns None when the
    scores are not a 1-D vector or ``class_id`` is not a valid index into
    it (e.g. a binary model that yields a single scalar score).
    """
    scores = np.asarray(decision, dtype="float64")
    if scores.ndim != 1 or not 0 <= class_id < scores.shape[0]:
        return None
    # Subtracting the max leaves the softmax unchanged but keeps np.exp
    # from overflowing to inf (which would produce nan).
    exp_scores = np.exp(scores - scores.max())
    return float(exp_scores[class_id] / exp_scores.sum())


def _classify_cookie(cookie_name: str) -> dict:
    """Classify one cookie name and return the prediction as a plain dict.

    Keys: cookie_name, category, class_id, confidence. Confidence is
    best-effort and is None when the model exposes no usable
    ``decision_function``. Assumes class ids index the decision-score
    columns directly.
    """
    features = preprocess_cookie(cookie_name)
    class_id = int(model.predict(features)[0])

    confidence = None
    decision_function = getattr(model, "decision_function", None)
    if decision_function is not None:
        try:
            confidence = _softmax_confidence(decision_function(features)[0], class_id)
        except Exception as exc:
            # Confidence is optional: report the problem and keep the
            # prediction instead of failing the request.
            print(f"Warning: could not compute confidence: {exc}")

    return {
        "cookie_name": cookie_name,
        "category": CLASS_NAMES[class_id],
        "class_id": class_id,
        "confidence": confidence,
    }


@app.post("/predict", response_model=PredictionResponse)
async def predict(request: CookieRequest):
    """
    Predict cookie category for a single cookie name

    Example:
    ```
    POST /predict
    {"cookie_name": "_ga"}
    ```
    """
    # 503 while the model or vectorizers are unavailable; 500 for any
    # failure during preprocessing or prediction.
    _ensure_ready(
        "Vectorizers not available. Please upload tfidf_word.joblib and tfidf_char.joblib to the model repository"
    )

    try:
        return PredictionResponse(**_classify_cookie(request.cookie_name))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}") from e


@app.post("/predict/batch")
async def predict_batch(request: BatchCookieRequest):
    """
    Predict categories for multiple cookie names

    Example:
    ```
    POST /predict/batch
    {"cookie_names": ["_ga", "sessionid", "utm_campaign"]}
    ```
    """
    # 503 while the model or vectorizers are unavailable; 500 if any single
    # name fails (the whole batch is rejected).
    _ensure_ready("Vectorizers not available")

    try:
        # Names are processed one at a time so each gets exactly the same
        # features as the single-cookie endpoint.
        predictions = [_classify_cookie(name) for name in request.cookie_names]
        return {"predictions": predictions}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Batch prediction error: {str(e)}") from e
261
+
262
if __name__ == "__main__":
    # Direct-execution entry point: serve the app with uvicorn on all
    # interfaces. Port 7860 is presumably chosen for Hugging Face Spaces —
    # confirm against the Space configuration.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )