Starburst15 commited on
Commit
793855f
·
verified ·
1 Parent(s): 8d8d767

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +34 -148
src/streamlit_app.py CHANGED
@@ -1,13 +1,20 @@
 
 
 
 
 
1
  import os
2
  import pandas as pd
3
- import numpy as np
4
  import streamlit as st
5
  from dotenv import load_dotenv
6
  from huggingface_hub import InferenceClient, login
7
  import google.generativeai as genai
8
- from io import StringIO
9
- import time
10
- import requests
 
 
 
11
 
12
  # ======================================================
13
  # โš™๏ธ APP CONFIGURATION
@@ -34,7 +41,7 @@ else:
34
  st.warning("โš ๏ธ Gemini API key missing. Gemini 2.5 Flash will not work.")
35
 
36
  # ======================================================
37
- # ๐Ÿง  MODEL SETUP
38
  # ======================================================
39
  with st.sidebar:
40
  st.header("โš™๏ธ Model Settings")
@@ -62,171 +69,40 @@ with st.sidebar:
62
  temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
63
  max_tokens = st.slider("Max Tokens", 128, 4096, 1024)
64
 
 
 
 
65
  hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
66
  hf_analyst_client = None
67
  if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
68
  hf_analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)
69
 
70
# ======================================================
# 🧩 SAFE GENERATION FUNCTION
# ======================================================
def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512, retries=2):
    """Generate text via a Hugging Face InferenceClient with retry + fallback.

    Tries ``text_generation`` first.  On a 503 / "Service Temporarily
    Unavailable" error it sleeps briefly and retries up to *retries* more
    times.  If the endpoint only supports the conversational task, the same
    prompt is re-sent through ``chat_completion``.  Any other error is
    re-raised to the caller.

    Args:
        client: ``huggingface_hub.InferenceClient`` to call.
        prompt: Prompt string to send to the model.
        temperature: Sampling temperature.
        max_tokens: Maximum number of new tokens to generate.
        retries: Extra attempts after the first failure on 503 errors.

    Returns:
        The generated text (stripped), or a warning string when the
        service stayed unavailable after all retries.
    """
    for attempt in range(retries + 1):
        try:
            resp = client.text_generation(
                prompt,
                temperature=temperature,
                max_new_tokens=max_tokens,
                return_full_text=False,
            )
            return resp.strip()
        except Exception as e:
            err = str(e)
            # Handle common server overloads gracefully instead of crashing
            # the Streamlit app.
            if "503" in err or "Service Temporarily Unavailable" in err:
                time.sleep(2)
                if attempt < retries:
                    continue  # retry
                return "⚠️ The Hugging Face model is temporarily unavailable. Please try again or switch to Gemini."
            elif "Supported task: conversational" in err:
                # Endpoint is chat-only: replay the prompt as a chat turn.
                chat_resp = client.chat_completion(
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature,
                )
                return chat_resp["choices"][0]["message"]["content"].strip()
            else:
                # FIX: bare `raise` preserves the original traceback;
                # `raise e` re-raised from this frame and truncated it.
                raise
    return "⚠️ Failed after retries."
103
-
104
# ======================================================
# 🧩 DATA CLEANING
# ======================================================
def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Deterministic, rule-based cleaning (no model calls).

    - Drops all-empty columns and duplicate rows.
    - Normalizes column names: stripped, spaces -> underscores, lowercase.
    - Fills missing object values with the column mode (or ``"Unknown"``
      when the column has no mode), and missing numeric values with the
      column median.

    Returns a cleaned copy; the caller's frame is not modified.
    """
    df = df.copy()
    df.dropna(axis=1, how="all", inplace=True)
    df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
    for col in df.columns:
        if df[col].dtype == "O":
            if not df[col].mode().empty:
                # FIX: assign the filled column back instead of
                # `df[col].fillna(..., inplace=True)` — inplace fillna on a
                # column selection is chained assignment, deprecated in
                # pandas 2.x and a no-op under copy-on-write.
                df[col] = df[col].fillna(df[col].mode()[0])
            else:
                df[col] = df[col].fillna("Unknown")
        else:
            df[col] = df[col].fillna(df[col].median())
    df.drop_duplicates(inplace=True)
    return df
121
-
122
def ai_clean_dataset(df: pd.DataFrame) -> "tuple[pd.DataFrame, str]":
    """Clean *df* by round-tripping it through the HF cleaner model.

    The entire CSV is embedded in the prompt, so datasets over 50 rows are
    skipped to bound token cost.  On any failure the original frame is
    returned untouched along with a status message.

    Returns:
        ``(cleaned_df, status_message)`` — note the original annotation
        ``-> (pd.DataFrame, str)`` was a tuple literal, not a type.
    """
    if len(df) > 50:
        return df, "⚠️ AI cleaning skipped: dataset has more than 50 rows."
    csv_text = df.to_csv(index=False)
    prompt = f"""
You are a professional data cleaning assistant.
Clean and standardize the dataset below dynamically:
1. Handle missing values
2. Fix column name inconsistencies
3. Convert data types (dates, numbers, categories)
4. Remove irrelevant or duplicate rows
Return ONLY a valid CSV text (no markdown, no explanations).

Dataset:
{csv_text}
"""
    try:
        cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
        # Strip markdown fences/headers the model may wrap around the CSV.
        cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
        cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
        cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
        return cleaned_df, "✅ AI cleaning completed successfully."
    except Exception as e:
        return df, f"⚠️ AI cleaning failed: {str(e)}"
146
-
147
# ======================================================
# 🧩 DATA SUMMARY (Token-efficient)
# ======================================================
def summarize_for_analysis(df: pd.DataFrame, sample_rows=10) -> str:
    """Build a compact text profile of *df* for embedding in an LLM prompt.

    Emits overall shape, one profile line per column (mean/median for
    numeric columns, top-3 value counts otherwise), and the first
    *sample_rows* rows as CSV.
    """
    summary = [f"Rows: {len(df)}, Columns: {len(df.columns)}"]
    for col in df.columns:
        non_null = int(df[col].notnull().sum())
        if pd.api.types.is_numeric_dtype(df[col]):
            # FIX: compute the mean directly instead of
            # `describe().to_dict().get('mean', np.nan)` — that form eagerly
            # evaluated `np.nan` (numpy was the only remaining use) and paid
            # for a full describe() just to read one statistic.
            summary.append(
                f"- {col}: mean={df[col].mean():.2f}, median={df[col].median():.2f}, non_null={non_null}"
            )
        else:
            top = df[col].value_counts().head(3).to_dict()
            summary.append(f"- {col}: top_values={top}, non_null={non_null}")
    sample = df.head(sample_rows).to_csv(index=False)
    summary.append("--- Sample Data ---")
    summary.append(sample)
    return "\n".join(summary)
164
-
165
# ======================================================
# 🧠 ANALYSIS FUNCTION
# ======================================================
def _gemini_generate(prompt, generation_config=None):
    """Run *prompt* through Gemini 2.5 Flash and return its text.

    Guards against responses with no ``.text`` attribute (e.g. blocked
    content) instead of raising.
    """
    model = genai.GenerativeModel("gemini-2.5-flash")
    if generation_config is not None:
        response = model.generate_content(prompt, generation_config=generation_config)
    else:
        response = model.generate_content(prompt)
    return response.text if hasattr(response, "text") else "No valid text response."


def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
    """Answer *user_query* about *df* with the sidebar-selected model.

    Routes to Gemini when ``ANALYST_MODEL`` selects it; otherwise queries
    the Hugging Face analyst client and falls back to Gemini (when an API
    key is configured) if the HF endpoint is unavailable or returns a 5xx.
    Reads the module-level ``ANALYST_MODEL``, ``temperature``,
    ``max_tokens``, ``hf_analyst_client`` and ``GEMINI_API_KEY``.
    """
    prompt_summary = summarize_for_analysis(df)
    prompt = f"""
You are a professional data analyst.
Analyze the dataset '{dataset_name}' and answer the user's question.

--- DATA SUMMARY ---
{prompt_summary}

--- USER QUESTION ---
{user_query}

Respond with:
1. Key insights and patterns
2. Quantitative findings
3. Notable relationships or anomalies
4. Data-driven recommendations
"""
    try:
        if ANALYST_MODEL == "Gemini 2.5 Flash (Google)":
            return _gemini_generate(
                prompt,
                generation_config={
                    "temperature": temperature,
                    "max_output_tokens": max_tokens,
                },
            )
        # Retry-aware HF call; may return a "temporarily unavailable" notice
        # instead of raising.
        result = safe_hf_generate(
            hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens
        )
        if "temporarily unavailable" in result.lower() and GEMINI_API_KEY:
            return f"🔄 Fallback to Gemini:\n\n{_gemini_generate(prompt)}"
        return result
    except Exception as e:
        # Last-resort fallback when the HF server rejects with a 503.
        if "503" in str(e) and GEMINI_API_KEY:
            return f"🔄 Fallback to Gemini due to 503 error:\n\n{_gemini_generate(prompt)}"
        return f"⚠️ Analysis failed: {str(e)}"
210
-
211
  # ======================================================
212
  # ๐Ÿš€ MAIN CHATBOT LOGIC
213
  # ======================================================
214
  uploaded = st.file_uploader("๐Ÿ“Ž Upload CSV or Excel file", type=["csv", "xlsx"])
 
215
  if "messages" not in st.session_state:
216
  st.session_state.messages = []
217
 
218
  if uploaded:
 
219
  df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
220
 
 
221
  with st.spinner("๐Ÿงผ Cleaning your dataset..."):
222
- cleaned_df, cleaning_status = ai_clean_dataset(df)
223
 
 
224
  st.subheader("โœ… Cleaning Status")
225
  st.info(cleaning_status)
 
226
  st.subheader("๐Ÿ“Š Dataset Preview")
227
  st.dataframe(cleaned_df.head(), use_container_width=True)
228
 
 
229
  st.subheader("๐Ÿ’ฌ Chat with Your Dataset")
 
230
  for msg in st.session_state.messages:
231
  with st.chat_message(msg["role"]):
232
  st.markdown(msg["content"])
@@ -238,8 +114,18 @@ if uploaded:
238
 
239
  with st.chat_message("assistant"):
240
  with st.spinner("๐Ÿค– Analyzing..."):
241
- result = query_analysis_model(cleaned_df, user_query, uploaded.name)
 
 
 
 
 
 
 
 
 
242
  st.markdown(result)
243
  st.session_state.messages.append({"role": "assistant", "content": result})
 
244
  else:
245
  st.info("๐Ÿ“ฅ Upload a dataset to begin chatting with your AI analyst.")
 
1
+ # ======================================================
2
+ # ๐Ÿ“Š Smart Data Analyst Pro (Chat Mode)
3
+ # Frontend & Orchestration โ€” Uses utils.py for backend logic
4
+ # ======================================================
5
+
6
  import os
7
  import pandas as pd
 
8
  import streamlit as st
9
  from dotenv import load_dotenv
10
  from huggingface_hub import InferenceClient, login
11
  import google.generativeai as genai
12
+
13
+ # ๐Ÿง  Import backend logic
14
+ from utils import (
15
+ ai_clean_dataset,
16
+ query_analysis_model,
17
+ )
18
 
19
  # ======================================================
20
  # โš™๏ธ APP CONFIGURATION
 
41
  st.warning("โš ๏ธ Gemini API key missing. Gemini 2.5 Flash will not work.")
42
 
43
  # ======================================================
44
+ # ๐Ÿง  MODEL SETTINGS (SIDEBAR)
45
  # ======================================================
46
  with st.sidebar:
47
  st.header("โš™๏ธ Model Settings")
 
69
  temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
70
  max_tokens = st.slider("Max Tokens", 128, 4096, 1024)
71
 
72
+ # ======================================================
73
+ # ๐Ÿงฉ MODEL CLIENTS
74
+ # ======================================================
75
  hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
76
  hf_analyst_client = None
77
  if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
78
  hf_analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  # ======================================================
81
  # ๐Ÿš€ MAIN CHATBOT LOGIC
82
  # ======================================================
83
  uploaded = st.file_uploader("๐Ÿ“Ž Upload CSV or Excel file", type=["csv", "xlsx"])
84
+
85
  if "messages" not in st.session_state:
86
  st.session_state.messages = []
87
 
88
  if uploaded:
89
+ # Load dataset
90
  df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
91
 
92
+ # ๐Ÿงผ AI-BASED CLEANING
93
  with st.spinner("๐Ÿงผ Cleaning your dataset..."):
94
+ cleaned_df, cleaning_status = ai_clean_dataset(df, hf_cleaner_client)
95
 
96
+ # Display cleaning info
97
  st.subheader("โœ… Cleaning Status")
98
  st.info(cleaning_status)
99
+
100
  st.subheader("๐Ÿ“Š Dataset Preview")
101
  st.dataframe(cleaned_df.head(), use_container_width=True)
102
 
103
+ # ๐Ÿ’ฌ Chat interface
104
  st.subheader("๐Ÿ’ฌ Chat with Your Dataset")
105
+
106
  for msg in st.session_state.messages:
107
  with st.chat_message(msg["role"]):
108
  st.markdown(msg["content"])
 
114
 
115
  with st.chat_message("assistant"):
116
  with st.spinner("๐Ÿค– Analyzing..."):
117
+ result = query_analysis_model(
118
+ cleaned_df,
119
+ user_query,
120
+ uploaded.name,
121
+ ANALYST_MODEL,
122
+ hf_client=hf_analyst_client,
123
+ temperature=temperature,
124
+ max_tokens=max_tokens,
125
+ gemini_api_key=GEMINI_API_KEY
126
+ )
127
  st.markdown(result)
128
  st.session_state.messages.append({"role": "assistant", "content": result})
129
+
130
  else:
131
  st.info("๐Ÿ“ฅ Upload a dataset to begin chatting with your AI analyst.")