Data_Analysis_Chatbot

Sleeping

App Files Community

Starburst15 committed on Oct 22

Commit

793855f

verified ·

1 Parent(s): 8d8d767

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +34 -148

src/streamlit_app.py CHANGED Viewed

@@ -1,13 +1,20 @@
 import os
 import pandas as pd
-import numpy as np
 import streamlit as st
 from dotenv import load_dotenv
 from huggingface_hub import InferenceClient, login
 import google.generativeai as genai
-from io import StringIO
-import time
-import requests
 # ======================================================
 # ⚙️ APP CONFIGURATION
@@ -34,7 +41,7 @@ else:
     st.warning("⚠️ Gemini API key missing. Gemini 2.5 Flash will not work.")
 # ======================================================
-# 🧠 MODEL SETUP
 # ======================================================
 with st.sidebar:
     st.header("⚙️ Model Settings")
@@ -62,171 +69,40 @@ with st.sidebar:
     temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
     max_tokens = st.slider("Max Tokens", 128, 4096, 1024)
 hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
 hf_analyst_client = None
 if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
     hf_analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)
-# ======================================================
-# 🧩 SAFE GENERATION FUNCTION
-# ======================================================
-def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512, retries=2):
-    """Try text generation, with retry + fallback on service errors."""
-    for attempt in range(retries + 1):
-        try:
-            resp = client.text_generation(
-                prompt,
-                temperature=temperature,
-                max_new_tokens=max_tokens,
-                return_full_text=False,
-            )
-            return resp.strip()
-        except Exception as e:
-            err = str(e)
-            # 🩹 FIX: Handle common server overloads gracefully
-            if "503" in err or "Service Temporarily Unavailable" in err:
-                time.sleep(2)
-                if attempt < retries:
-                    continue  # retry
-                else:
-                    return "⚠️ The Hugging Face model is temporarily unavailable. Please try again or switch to Gemini."
-            elif "Supported task: conversational" in err:
-                chat_resp = client.chat_completion(
-                    messages=[{"role": "user", "content": prompt}],
-                    max_tokens=max_tokens,
-                    temperature=temperature,
-                )
-                return chat_resp["choices"][0]["message"]["content"].strip()
-            else:
-                raise e
-    return "⚠️ Failed after retries."
-# ======================================================
-# 🧩 DATA CLEANING
-# ======================================================
-def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
-    df = df.copy()
-    df.dropna(axis=1, how="all", inplace=True)
-    df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
-    for col in df.columns:
-        if df[col].dtype == "O":
-            if not df[col].mode().empty:
-                df[col].fillna(df[col].mode()[0], inplace=True)
-            else:
-                df[col].fillna("Unknown", inplace=True)
-        else:
-            df[col].fillna(df[col].median(), inplace=True)
-    df.drop_duplicates(inplace=True)
-    return df
-def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
-    if len(df) > 50:
-        return df, "⚠️ AI cleaning skipped: dataset has more than 50 rows."
-    csv_text = df.to_csv(index=False)
-    prompt = f"""
-You are a professional data cleaning assistant.
-Clean and standardize the dataset below dynamically:
-1. Handle missing values
-2. Fix column name inconsistencies
-3. Convert data types (dates, numbers, categories)
-4. Remove irrelevant or duplicate rows
-Return ONLY a valid CSV text (no markdown, no explanations).
-Dataset:
-{csv_text}
-"""
-    try:
-        cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
-        cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
-        cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
-        cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
-        return cleaned_df, "✅ AI cleaning completed successfully."
-    except Exception as e:
-        return df, f"⚠️ AI cleaning failed: {str(e)}"
-# ======================================================
-# 🧩 DATA SUMMARY (Token-efficient)
-# ======================================================
-def summarize_for_analysis(df: pd.DataFrame, sample_rows=10) -> str:
-    summary = [f"Rows: {len(df)}, Columns: {len(df.columns)}"]
-    for col in df.columns:
-        non_null = int(df[col].notnull().sum())
-        if pd.api.types.is_numeric_dtype(df[col]):
-            desc = df[col].describe().to_dict()
-            summary.append(f"- {col}: mean={desc.get('mean', np.nan):.2f}, median={df[col].median():.2f}, non_null={non_null}")
-        else:
-            top = df[col].value_counts().head(3).to_dict()
-            summary.append(f"- {col}: top_values={top}, non_null={non_null}")
-    sample = df.head(sample_rows).to_csv(index=False)
-    summary.append("--- Sample Data ---")
-    summary.append(sample)
-    return "\n".join(summary)
-# ======================================================
-# 🧠 ANALYSIS FUNCTION
-# ======================================================
-def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
-    prompt_summary = summarize_for_analysis(df)
-    prompt = f"""
-You are a professional data analyst.
-Analyze the dataset '{dataset_name}' and answer the user's question.
---- DATA SUMMARY ---
-{prompt_summary}
---- USER QUESTION ---
-{user_query}
-Respond with:
-1. Key insights and patterns
-2. Quantitative findings
-3. Notable relationships or anomalies
-4. Data-driven recommendations
-"""
-    try:
-        if ANALYST_MODEL == "Gemini 2.5 Flash (Google)":
-            response = genai.GenerativeModel("gemini-2.5-flash").generate_content(
-                prompt,
-                generation_config={
-                    "temperature": temperature,
-                    "max_output_tokens": max_tokens
-                }
-            )
-            return response.text if hasattr(response, "text") else "No valid text response."
-        else:
-            # 🩹 FIX: wrap in retry-aware generator
-            result = safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
-            # fallback to Gemini if Hugging Face failed entirely
-            if "temporarily unavailable" in result.lower() and GEMINI_API_KEY:
-                alt = genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt)
-                return f"🔄 Fallback to Gemini:\n\n{alt.text}"
-            return result
-    except Exception as e:
-        # 🩹 FIX: fallback if server rejects or 5xx
-        if "503" in str(e) and GEMINI_API_KEY:
-            response = genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt)
-            return f"🔄 Fallback to Gemini due to 503 error:\n\n{response.text}"
-        return f"⚠️ Analysis failed: {str(e)}"
 # ======================================================
 # 🚀 MAIN CHATBOT LOGIC
 # ======================================================
 uploaded = st.file_uploader("📎 Upload CSV or Excel file", type=["csv", "xlsx"])
 if "messages" not in st.session_state:
     st.session_state.messages = []
 if uploaded:
     df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
     with st.spinner("🧼 Cleaning your dataset..."):
-        cleaned_df, cleaning_status = ai_clean_dataset(df)
     st.subheader("✅ Cleaning Status")
     st.info(cleaning_status)
     st.subheader("📊 Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
     st.subheader("💬 Chat with Your Dataset")
     for msg in st.session_state.messages:
         with st.chat_message(msg["role"]):
             st.markdown(msg["content"])
@@ -238,8 +114,18 @@ if uploaded:
         with st.chat_message("assistant"):
             with st.spinner("🤖 Analyzing..."):
-                result = query_analysis_model(cleaned_df, user_query, uploaded.name)
                 st.markdown(result)
                 st.session_state.messages.append({"role": "assistant", "content": result})
 else:
     st.info("📥 Upload a dataset to begin chatting with your AI analyst.")

+# ======================================================
+# 📊 Smart Data Analyst Pro (Chat Mode)
+# Frontend & Orchestration — Uses utils.py for backend logic
+# ======================================================
 import os
 import pandas as pd
 import streamlit as st
 from dotenv import load_dotenv
 from huggingface_hub import InferenceClient, login
 import google.generativeai as genai
+# 🧠 Import backend logic
+from utils import (
+    ai_clean_dataset,
+    query_analysis_model,
+)
 # ======================================================
 # ⚙️ APP CONFIGURATION
     st.warning("⚠️ Gemini API key missing. Gemini 2.5 Flash will not work.")
 # ======================================================
+# 🧠 MODEL SETTINGS (SIDEBAR)
 # ======================================================
 with st.sidebar:
     st.header("⚙️ Model Settings")
     temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
     max_tokens = st.slider("Max Tokens", 128, 4096, 1024)
+# ======================================================
+# 🧩 MODEL CLIENTS
+# ======================================================
 hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
 hf_analyst_client = None
 if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
     hf_analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)
 # ======================================================
 # 🚀 MAIN CHATBOT LOGIC
 # ======================================================
 uploaded = st.file_uploader("📎 Upload CSV or Excel file", type=["csv", "xlsx"])
 if "messages" not in st.session_state:
     st.session_state.messages = []
 if uploaded:
+    # Load dataset
     df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
+    # 🧼 AI-BASED CLEANING
     with st.spinner("🧼 Cleaning your dataset..."):
+        cleaned_df, cleaning_status = ai_clean_dataset(df, hf_cleaner_client)
+    # Display cleaning info
     st.subheader("✅ Cleaning Status")
     st.info(cleaning_status)
     st.subheader("📊 Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
+    # 💬 Chat interface
     st.subheader("💬 Chat with Your Dataset")
     for msg in st.session_state.messages:
         with st.chat_message(msg["role"]):
             st.markdown(msg["content"])
         with st.chat_message("assistant"):
             with st.spinner("🤖 Analyzing..."):
+                result = query_analysis_model(
+                    cleaned_df,
+                    user_query,
+                    uploaded.name,
+                    ANALYST_MODEL,
+                    hf_client=hf_analyst_client,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                    gemini_api_key=GEMINI_API_KEY
+                )
                 st.markdown(result)
                 st.session_state.messages.append({"role": "assistant", "content": result})
 else:
     st.info("📥 Upload a dataset to begin chatting with your AI analyst.")