Starburst15 committed on
Commit 97d5e2d · verified · 1 Parent(s): 0f3722b

Update src/streamlit_app.py

Files changed (1):
  1. src/streamlit_app.py +219 -268

src/streamlit_app.py CHANGED
@@ -1,294 +1,245 @@
- # =============================================================
- # 📘 USTP Student Handbook Assistant (2023 Edition)
- # =============================================================
- # Enhanced: dynamic model selection + real (printed) page numbering
-
  import os
- import glob
- import json
- import time
- from typing import List, Dict, Any
  import numpy as np
  import streamlit as st
- import PyPDF2
- import requests
  from dotenv import load_dotenv
  from huggingface_hub import InferenceClient, login
- from streamlit_chat import message as st_message
-
- # Optional: FAISS for fast vector search
- try:
-     import faiss
- except ImportError:
-     faiss = None
-
- # =============================================================
- # 🌐 Startup Fix for PermissionError
- # =============================================================
- os.environ["STREAMLIT_HOME"] = "/tmp/.streamlit"
- os.makedirs("/tmp/.streamlit", exist_ok=True)

- # =============================================================
- # ⚙️ Streamlit Page Setup
- # =============================================================
- st.set_page_config(page_title="📘 Handbook Assistant", page_icon="📘", layout="wide")
- st.title("📘 USTP Student Handbook Assistant (2023 Edition)")
- st.caption("Answers sourced only from the official *USTP Student Handbook 2023 Edition.pdf*.")

  load_dotenv()
- HF_TOKEN = os.getenv("HF_TOKEN")

  if not HF_TOKEN:
-     st.warning("⚠️ No Hugging Face API token found in .env file. Online models will be unavailable.")
  else:
-     try:
-         login(HF_TOKEN)
-     except Exception:
-         pass

- hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None

- # =============================================================
- # ⚙️ Sidebar Configuration
- # =============================================================
  with st.sidebar:
-     st.header("⚙️ Settings")
-
-     model_options = {
-         "Qwen 2.5 14B Instruct": "Qwen/Qwen2.5-14B-Instruct",
-         "Mistral 7B Instruct": "mistralai/Mistral-7B-Instruct-v0.3",
-         "Llama 3 8B Instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
-         "Mixtral 8x7B Instruct": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-         "Falcon 7B Instruct": "tiiuae/falcon-7b-instruct",
-     }
-     model_choice = st.selectbox("Select reasoning model", list(model_options.keys()), index=0)
-     DEFAULT_MODEL = model_options[model_choice]
-
-     st.markdown("---")
-     similarity_threshold = st.slider("Similarity threshold", 0.3, 1.0, 0.6, 0.01)
-     top_k = st.slider("Top K retrieved chunks", 1, 10, 4)
-     chunk_size_chars = st.number_input("Chunk size (chars)", 400, 2500, 1200, 100)
-     chunk_overlap = st.number_input("Chunk overlap (chars)", 20, 600, 150, 10)
-     front_matter_pages = st.number_input(
-         "Pages before main content (e.g. table of contents, cover)", min_value=0, max_value=50, value=12
      )
-     regenerate_index = st.button("🔁 Rebuild handbook index")
-
- # =============================================================
- # 📂 File Config
- # =============================================================
- INDEX_FILE = "handbook_faiss.index"
- META_FILE = "handbook_metadata.json"
- EMB_DIM_FILE = "handbook_emb_dim.json"
- EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
-
- # =============================================================
- # 🧩 Utility Functions
- # =============================================================
- def find_handbook() -> List[str]:
-     preferred = "USTP Student Handbook 2023 Edition.pdf"
-     pdfs = glob.glob("*.pdf")
-     for f in pdfs:
-         if preferred.lower() in f.lower():
-             st.success(f"📘 Found handbook: {f}")
-             return [f]
-     if pdfs:
-         st.warning(f"⚠️ Preferred handbook not found. Using {os.path.basename(pdfs[0])}.")
-         return [pdfs[0]]
-     st.error("❌ No PDF found in current folder.")
-     return []
-
-
- def load_pdf_texts(pdf_paths: List[str]) -> List[Dict[str, Any]]:
-     """Extract page text while adjusting page numbering to printed handbook numbers."""
-     pages = []
-     for path in pdf_paths:
-         with open(path, "rb") as f:
-             reader = PyPDF2.PdfReader(f)
-             for i, page in enumerate(reader.pages):
-                 text = page.extract_text() or ""
-                 if text.strip():
-                     # Adjust logical page number to printed numbering
-                     logical_page = i + 1
-                     printed_page = logical_page - front_matter_pages
-                     if printed_page < 1:
-                         printed_page = 1
-                     pages.append({
-                         "filename": os.path.basename(path),
-                         "page": printed_page,
-                         "text": text.strip()
-                     })
-     return pages
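- # Worked example of the mapping above: with the default front_matter_pages = 12,
- # PDF page 13 becomes printed page 1 (13 - 12), and PDF pages 1-12 (the front
- # matter itself) all clamp to printed page 1.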
-
-
- def chunk_text(pages: List[Dict[str, Any]], size: int, overlap: int) -> List[Dict[str, Any]]:
-     chunks = []
-     for p in pages:
-         text = p["text"]
-         start = 0
-         while start < len(text):
-             end = start + size
-             chunk = text[start:end]
-             chunks.append({
-                 "filename": p["filename"],
-                 "page": p["page"],
-                 "content": chunk.strip()
-             })
-             start += size - overlap
-     return chunks
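- # Worked example: with the sidebar defaults (size=1200, overlap=150) the window
- # advances by 1200 - 150 = 1050 characters per step, so consecutive chunks share
- # 150 characters of context across the page boundary of each slice.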
-
-
- def embed_texts(texts: List[str]) -> np.ndarray:
-     """Generate embeddings using Hugging Face feature extraction."""
-     if not HF_TOKEN or not hf_client:
-         st.error("❌ Missing Hugging Face token or client.")
-         return np.zeros((len(texts), 768))
      try:
-         embeddings = hf_client.feature_extraction(texts, model=EMBED_MODEL)
-         if isinstance(embeddings[0][0], list):
-             embeddings = [np.mean(np.array(e), axis=0) for e in embeddings]
-         return np.array(embeddings)
-     except Exception as e1:
-         st.warning(f"⚠️ feature_extraction failed, using REST API fallback: {e1}")
-         headers = {"Authorization": f"Bearer {HF_TOKEN}"}
-         resp = requests.post(
-             f"https://api-inference.huggingface.co/models/{EMBED_MODEL}",
-             headers=headers,
-             json={"inputs": texts}
-         )
-         data = resp.json()
-         if isinstance(data[0][0], list):
-             data = [np.mean(np.array(e), axis=0) for e in data]
-         return np.array(data)
-
-
- def build_faiss_index(chunks: List[Dict[str, Any]]):
-     """Build FAISS index for chunks."""
-     texts = [c["content"] for c in chunks]
-     embeddings = embed_texts(texts)
-     if embeddings.size == 0:
-         st.error("❌ Embedding generation failed.")
-         return
-     dim = embeddings.shape[1]
-     index = faiss.IndexFlatL2(dim)
-     index.add(embeddings.astype("float32"))
-     faiss.write_index(index, INDEX_FILE)
-     with open(META_FILE, "w") as f:
-         json.dump(chunks, f)
-     with open(EMB_DIM_FILE, "w") as f:
-         json.dump({"dim": dim}, f)
-     st.success(f"✅ Indexed {len(chunks)} chunks.")
-
-
- def load_faiss_index():
-     if not os.path.exists(INDEX_FILE) or not os.path.exists(META_FILE):
-         return None, None
-     index = faiss.read_index(INDEX_FILE)
-     with open(META_FILE) as f:
-         meta = json.load(f)
-     return index, meta
-
-
- def search_index(query: str, index, meta, top_k: int, threshold: float):
-     query_emb = embed_texts([query])
-     distances, indices = index.search(query_emb.astype("float32"), top_k)
-     results = []
-     for i, dist in zip(indices[0], distances[0]):
-         if i < len(meta):
-             r = meta[i]
-             r["distance"] = float(dist)
-             results.append(r)
-     return results
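- # Note: IndexFlatL2 returns squared L2 distances, so smaller values mean closer
- # matches; this version accepts `threshold` without applying it. A sketch of one
- # way it could be honored (illustrative, not in the original flow):
- #     results = [r for r in results if r["distance"] <= threshold]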
-
-
- def generate_answer(context: str, query: str) -> str:
-     """Generate model-based answer using selected open-source model."""
-     prompt = f"""
- You are a precise academic assistant specialized in university policy.
- Use only the *USTP Student Handbook 2023 Edition* below.
- If the answer is not in the text, reply:
- "The handbook does not specify that."
-
- ---
- 📘 Context:
- {context}
- ---
- 🧭 Question:
- {query}
- ---
- 🎯 Instructions:
- - Be factual and concise.
- - Cite the correct printed page number.
- - Never make assumptions.
  """
-
      try:
-         response = hf_client.text_generation(
-             model=DEFAULT_MODEL,
-             prompt=prompt,
-             max_new_tokens=400,
-             temperature=0.25
-         )
-         return response if isinstance(response, str) else str(response)
-     except Exception as e1:
-         try:
-             chat_response = hf_client.chat.completions.create(
-                 model=DEFAULT_MODEL,
-                 messages=[{"role": "user", "content": prompt}],
-                 max_tokens=400
              )
-             return chat_response.choices[0].message["content"]
-         except Exception as e2:
-             return f"⚠️ Error generating answer: {e2}"
-
-
- def ensure_index():
-     """Ensure FAISS index exists or rebuild."""
-     if regenerate_index or not os.path.exists(INDEX_FILE):
-         pdfs = find_handbook()
-         if not pdfs:
-             st.stop()
-         st.info("📄 Extracting handbook text...")
-         pages = load_pdf_texts(pdfs)
-         chunks = chunk_text(pages, chunk_size_chars, chunk_overlap)
-         build_faiss_index(chunks)
-     index, meta = load_faiss_index()
-     if index is None or meta is None:
-         st.error("❌ Could not load FAISS index.")
-         st.stop()
-     return index, meta
-
- # =============================================================
- # 💬 Chat Interface
- # =============================================================
- st.divider()
- st.subheader("💬 Ask about the Handbook")
-
- if "history" not in st.session_state:
-     st.session_state.history = []
-
- user_query = st.text_input("Enter your question:")
- index, meta = ensure_index()
-
- if st.button("Ask") and user_query.strip():
-     results = search_index(user_query, index, meta, top_k, similarity_threshold)
-     if not results:
-         st.warning("No relevant section found in the handbook.")
-     else:
-         context = "\n\n".join(
-             [f"(📄 Page {r['page']})\n{r['content']}" for r in results]
-         )
-         answer = generate_answer(context, user_query)
-         st.session_state.history.append({
-             "user": user_query,
-             "assistant": answer,
-             "timestamp": time.time()
-         })
-
- # ✅ Ensure unique keys to prevent StreamlitDuplicateElementId
- for i, chat in enumerate(st.session_state.history):
-     st_message(chat["user"], is_user=True, key=f"user_{i}")
-     st_message(chat["assistant"], key=f"assistant_{i}")
-
- st.caption("⚡ Powered by FAISS + Open Source Models + Accurate Page Referencing")

  import os
+ import pandas as pd
  import numpy as np
  import streamlit as st
  from dotenv import load_dotenv
  from huggingface_hub import InferenceClient, login
+ import google.generativeai as genai
+ from io import StringIO
+ import time
+ import requests

+ # ======================================================
+ # ⚙️ APP CONFIGURATION
+ # ======================================================
+ st.set_page_config(page_title="📊 Smart Data Analyst Pro", layout="wide")
+ st.title("📊 Smart Data Analyst Pro (Chat Mode)")
+ st.caption("Chat with your dataset — AI cleans, analyzes, and visualizes data. Hugging Face + Gemini compatible.")

+ # ======================================================
+ # 🔐 Load Environment Variables
+ # ======================================================
  load_dotenv()
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

  if not HF_TOKEN:
+     st.error("❌ Missing HF_TOKEN. Please set it in your .env file.")
  else:
+     login(token=HF_TOKEN)

+ if GEMINI_API_KEY:
+     genai.configure(api_key=GEMINI_API_KEY)
+ else:
+     st.warning("⚠️ Gemini API key missing. Gemini 2.5 Flash will not work.")

+ # ======================================================
+ # 🧠 MODEL SETUP
+ # ======================================================
  with st.sidebar:
+     st.header("⚙️ Model Settings")
+
+     CLEANER_MODEL = st.selectbox(
+         "Select Cleaner Model:",
+         [
+             "Qwen/Qwen2.5-Coder-14B",
+             "mistralai/Mistral-7B-Instruct-v0.3"
+         ],
+         index=0
      )
+
+     ANALYST_MODEL = st.selectbox(
+         "Select Analysis Model:",
+         [
+             "Gemini 2.5 Flash (Google)",
+             "Qwen/Qwen2.5-14B-Instruct",
+             "mistralai/Mistral-7B-Instruct-v0.3",
+             "HuggingFaceH4/zephyr-7b-beta"
+         ],
+         index=0
+     )

+     temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
+     max_tokens = st.slider("Max Tokens", 128, 4096, 1024)

+ hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
+ hf_analyst_client = None
+ if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
+     hf_analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)

+ # ======================================================
+ # 🧩 SAFE GENERATION FUNCTION
+ # ======================================================
+ def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512, retries=2):
+     """Try text generation, with retry + fallback on service errors."""
+     for attempt in range(retries + 1):
+         try:
+             resp = client.text_generation(
+                 prompt,
+                 temperature=temperature,
+                 max_new_tokens=max_tokens,
+                 return_full_text=False,
+             )
+             return resp.strip()
+         except Exception as e:
+             err = str(e)
+             # 🩹 FIX: Handle common server overloads gracefully
+             if "503" in err or "Service Temporarily Unavailable" in err:
+                 time.sleep(2)
+                 if attempt < retries:
+                     continue  # retry
+                 else:
+                     return "⚠️ The Hugging Face model is temporarily unavailable. Please try again or switch to Gemini."
+             elif "Supported task: conversational" in err:
+                 chat_resp = client.chat_completion(
+                     messages=[{"role": "user", "content": prompt}],
+                     max_tokens=max_tokens,
+                     temperature=temperature,
+                 )
+                 return chat_resp["choices"][0]["message"]["content"].strip()
+             else:
+                 raise e
+     return "⚠️ Failed after retries."
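+ # Usage sketch (illustrative values, not part of the app flow): the wrapper is
+ # called the same way for cleaning and analysis, e.g.
+ #     reply = safe_hf_generate(hf_cleaner_client, "Say hello.", temperature=0.2, max_tokens=64)
+ # A 503 triggers up to `retries` re-attempts with a 2-second pause; a
+ # "Supported task: conversational" error reroutes the same prompt through
+ # chat_completion instead of text_generation.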
+
+ # ======================================================
+ # 🧩 DATA CLEANING
+ # ======================================================
+ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
+     df = df.copy()
+     df.dropna(axis=1, how="all", inplace=True)
+     df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
+     for col in df.columns:
+         if df[col].dtype == "O":
+             if not df[col].mode().empty:
+                 df[col].fillna(df[col].mode()[0], inplace=True)
+             else:
+                 df[col].fillna("Unknown", inplace=True)
+         else:
+             df[col].fillna(df[col].median(), inplace=True)
+     df.drop_duplicates(inplace=True)
+     return df
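+ # Worked example (hypothetical column name): fallback_clean renames a column
+ # " Total Sales " to "total_sales", fills missing object values with the column
+ # mode (or "Unknown" for all-null columns), fills missing numerics with the
+ # median, and drops duplicate rows; a deterministic alternative for when the
+ # AI cleaning path is skipped or fails.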
+
+ def ai_clean_dataset(df: pd.DataFrame) -> tuple[pd.DataFrame, str]:
+     if len(df) > 50:
+         return df, "⚠️ AI cleaning skipped: dataset has more than 50 rows."
+     csv_text = df.to_csv(index=False)
+     prompt = f"""
+ You are a professional data cleaning assistant.
+ Clean and standardize the dataset below dynamically:
+ 1. Handle missing values
+ 2. Fix column name inconsistencies
+ 3. Convert data types (dates, numbers, categories)
+ 4. Remove irrelevant or duplicate rows
+ Return ONLY a valid CSV text (no markdown, no explanations).
+
+ Dataset:
+ {csv_text}
+ """
      try:
+         cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
+         cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
+         cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
+         cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
+         return cleaned_df, "✅ AI cleaning completed successfully."
+     except Exception as e:
+         return df, f"⚠️ AI cleaning failed: {str(e)}"
+
+ # ======================================================
+ # 🧩 DATA SUMMARY (Token-efficient)
+ # ======================================================
+ def summarize_for_analysis(df: pd.DataFrame, sample_rows=10) -> str:
+     summary = [f"Rows: {len(df)}, Columns: {len(df.columns)}"]
+     for col in df.columns:
+         non_null = int(df[col].notnull().sum())
+         if pd.api.types.is_numeric_dtype(df[col]):
+             desc = df[col].describe().to_dict()
+             summary.append(f"- {col}: mean={desc.get('mean', np.nan):.2f}, median={df[col].median():.2f}, non_null={non_null}")
+         else:
+             top = df[col].value_counts().head(3).to_dict()
+             summary.append(f"- {col}: top_values={top}, non_null={non_null}")
+     sample = df.head(sample_rows).to_csv(index=False)
+     summary.append("--- Sample Data ---")
+     summary.append(sample)
+     return "\n".join(summary)
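+ # Illustrative output shape (hypothetical column names and values):
+ #     Rows: 500, Columns: 2
+ #     - revenue: mean=120.50, median=98.00, non_null=495
+ #     - region: top_values={'NA': 210, 'EU': 180, 'APAC': 110}, non_null=500
+ #     --- Sample Data ---
+ #     (first 10 rows as CSV)
+ # Only per-column statistics plus the sample rows are sent to the model, which
+ # keeps the prompt small even for large datasets.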
+
+ # ======================================================
+ # 🧠 ANALYSIS FUNCTION
+ # ======================================================
+ def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
+     prompt_summary = summarize_for_analysis(df)
+     prompt = f"""
+ You are a professional data analyst.
+ Analyze the dataset '{dataset_name}' and answer the user's question.
+
+ --- DATA SUMMARY ---
+ {prompt_summary}
+
+ --- USER QUESTION ---
+ {user_query}
+
+ Respond with:
+ 1. Key insights and patterns
+ 2. Quantitative findings
+ 3. Notable relationships or anomalies
+ 4. Data-driven recommendations
  """
      try:
+         if ANALYST_MODEL == "Gemini 2.5 Flash (Google)":
+             response = genai.GenerativeModel("gemini-2.5-flash").generate_content(
+                 prompt,
+                 generation_config={
+                     "temperature": temperature,
+                     "max_output_tokens": max_tokens
+                 }
              )
+             return response.text if hasattr(response, "text") else "No valid text response."
+         else:
+             # 🩹 FIX: wrap in retry-aware generator
+             result = safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
+             # fallback to Gemini if Hugging Face failed entirely
+             if "temporarily unavailable" in result.lower() and GEMINI_API_KEY:
+                 alt = genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt)
+                 return f"🔄 Fallback to Gemini:\n\n{alt.text}"
+             return result
+     except Exception as e:
+         # 🩹 FIX: fallback if server rejects or 5xx
+         if "503" in str(e) and GEMINI_API_KEY:
+             response = genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt)
+             return f"🔄 Fallback to Gemini due to 503 error:\n\n{response.text}"
+         return f"⚠️ Analysis failed: {str(e)}"
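+ # Resolution order, as implemented above: Gemini when selected in the sidebar;
+ # otherwise the chosen Hugging Face model via safe_hf_generate, with Gemini as
+ # the fallback on persistent 503s when a key is configured. Example call with
+ # hypothetical arguments:
+ #     query_analysis_model(cleaned_df, "Which column drives revenue?", "sales.csv")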
+
+ # ======================================================
+ # 🚀 MAIN CHATBOT LOGIC
+ # ======================================================
+ uploaded = st.file_uploader("📎 Upload CSV or Excel file", type=["csv", "xlsx"])
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ if uploaded:
+     df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
+
+     with st.spinner("🧼 Cleaning your dataset..."):
+         cleaned_df, cleaning_status = ai_clean_dataset(df)
+
+     st.subheader("✅ Cleaning Status")
+     st.info(cleaning_status)
+     st.subheader("📊 Dataset Preview")
+     st.dataframe(cleaned_df.head(), use_container_width=True)
+
+     st.subheader("💬 Chat with Your Dataset")
+     for msg in st.session_state.messages:
+         with st.chat_message(msg["role"]):
+             st.markdown(msg["content"])
+
+     if user_query := st.chat_input("Ask something about your dataset..."):
+         st.session_state.messages.append({"role": "user", "content": user_query})
+         with st.chat_message("user"):
+             st.markdown(user_query)
+
+         with st.chat_message("assistant"):
+             with st.spinner("🤖 Analyzing..."):
+                 result = query_analysis_model(cleaned_df, user_query, uploaded.name)
+                 st.markdown(result)
+         st.session_state.messages.append({"role": "assistant", "content": result})
+ else:
+     st.info("📥 Upload a dataset to begin chatting with your AI analyst.")