Starburst15 committed on
Commit 97d5e2d · verified · 1 Parent(s): 0f3722b

Update src/streamlit_app.py

Files changed (1):
  1. src/streamlit_app.py +219 -268

src/streamlit_app.py CHANGED
@@ -1,294 +1,245 @@
- # =============================================================
- # 📘 USTP Student Handbook Assistant (2023 Edition)
- # =============================================================
- # Enhanced: dynamic model selection + real (printed) page numbering
-
  import os
- import glob
- import json
- import time
- from typing import List, Dict, Any
  import numpy as np
  import streamlit as st
- import PyPDF2
- import requests
  from dotenv import load_dotenv
  from huggingface_hub import InferenceClient, login
- from streamlit_chat import message as st_message
-
- # Optional: FAISS for fast vector search
- try:
-     import faiss
- except ImportError:
-     faiss = None
-
- # =============================================================
- # 🌐 Startup Fix for PermissionError
- # =============================================================
- os.environ["STREAMLIT_HOME"] = "/tmp/.streamlit"
- os.makedirs("/tmp/.streamlit", exist_ok=True)

- # =============================================================
- # ⚙️ Streamlit Page Setup
- # =============================================================
- st.set_page_config(page_title="📘 Handbook Assistant", page_icon="📘", layout="wide")
- st.title("📘 USTP Student Handbook Assistant (2023 Edition)")
- st.caption("Answers sourced only from the official *USTP Student Handbook 2023 Edition.pdf*.")

  load_dotenv()
- HF_TOKEN = os.getenv("HF_TOKEN")

  if not HF_TOKEN:
-     st.warning("⚠️ No Hugging Face API token found in .env file. Online models will be unavailable.")
  else:
-     try:
-         login(HF_TOKEN)
-     except Exception:
-         pass

- hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None

- # =============================================================
- # ⚙️ Sidebar Configuration
- # =============================================================
  with st.sidebar:
-     st.header("⚙️ Settings")
-
-     model_options = {
-         "Qwen 2.5 14B Instruct": "Qwen/Qwen2.5-14B-Instruct",
-         "Mistral 7B Instruct": "mistralai/Mistral-7B-Instruct-v0.3",
-         "Llama 3 8B Instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
-         "Mixtral 8x7B Instruct": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-         "Falcon 7B Instruct": "tiiuae/falcon-7b-instruct",
-     }
-     model_choice = st.selectbox("Select reasoning model", list(model_options.keys()), index=0)
-     DEFAULT_MODEL = model_options[model_choice]
-
-     st.markdown("---")
-     similarity_threshold = st.slider("Similarity threshold", 0.3, 1.0, 0.6, 0.01)
-     top_k = st.slider("Top K retrieved chunks", 1, 10, 4)
-     chunk_size_chars = st.number_input("Chunk size (chars)", 400, 2500, 1200, 100)
-     chunk_overlap = st.number_input("Chunk overlap (chars)", 20, 600, 150, 10)
-     front_matter_pages = st.number_input(
-         "Pages before main content (e.g. table of contents, cover)", min_value=0, max_value=50, value=12
      )
-     regenerate_index = st.button("🔁 Rebuild handbook index")
-
- # =============================================================
- # 📂 File Config
- # =============================================================
- INDEX_FILE = "handbook_faiss.index"
- META_FILE = "handbook_metadata.json"
- EMB_DIM_FILE = "handbook_emb_dim.json"
- EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
-
- # =============================================================
- # 🧩 Utility Functions
- # =============================================================
- def find_handbook() -> List[str]:
-     preferred = "USTP Student Handbook 2023 Edition.pdf"
-     pdfs = glob.glob("*.pdf")
-     for f in pdfs:
-         if preferred.lower() in f.lower():
-             st.success(f"📘 Found handbook: {f}")
-             return [f]
-     if pdfs:
-         st.warning(f"⚠️ Preferred handbook not found. Using {os.path.basename(pdfs[0])}.")
-         return [pdfs[0]]
-     st.error("❌ No PDF found in current folder.")
-     return []
-
-
- def load_pdf_texts(pdf_paths: List[str]) -> List[Dict[str, Any]]:
-     """Extract page text while adjusting page numbering to printed handbook numbers."""
-     pages = []
-     for path in pdf_paths:
-         with open(path, "rb") as f:
-             reader = PyPDF2.PdfReader(f)
-             for i, page in enumerate(reader.pages):
-                 text = page.extract_text() or ""
-                 if text.strip():
-                     # Adjust logical page number to printed numbering
-                     logical_page = i + 1
-                     printed_page = logical_page - front_matter_pages
-                     if printed_page < 1:
-                         printed_page = 1
-                     pages.append({
-                         "filename": os.path.basename(path),
-                         "page": printed_page,
-                         "text": text.strip()
-                     })
-     return pages
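- # Worked example of the mapping above: with the default front_matter_pages = 12,
- # PDF page 13 becomes printed page 1 (13 - 12), and PDF pages 1-12 (the front
- # matter itself) all clamp to printed page 1.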
-
-
- def chunk_text(pages: List[Dict[str, Any]], size: int, overlap: int) -> List[Dict[str, Any]]:
-     chunks = []
-     for p in pages:
-         text = p["text"]
-         start = 0
-         while start < len(text):
-             end = start + size
-             chunk = text[start:end]
-             chunks.append({
-                 "filename": p["filename"],
-                 "page": p["page"],
-                 "content": chunk.strip()
-             })
-             start += size - overlap
-     return chunks
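- # Worked example: with the sidebar defaults (size=1200, overlap=150) the window
- # advances by 1200 - 150 = 1050 characters per step, so consecutive chunks share
- # 150 characters of context across the page boundary of each slice.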
-
-
- def embed_texts(texts: List[str]) -> np.ndarray:
-     """Generate embeddings using Hugging Face feature extraction."""
-     if not HF_TOKEN or not hf_client:
-         st.error("❌ Missing Hugging Face token or client.")
-         return np.zeros((len(texts), 768))
      try:
-         embeddings = hf_client.feature_extraction(texts, model=EMBED_MODEL)
-         if isinstance(embeddings[0][0], list):
-             embeddings = [np.mean(np.array(e), axis=0) for e in embeddings]
-         return np.array(embeddings)
-     except Exception as e1:
-         st.warning(f"⚠️ feature_extraction failed, using REST API fallback: {e1}")
-         headers = {"Authorization": f"Bearer {HF_TOKEN}"}
-         resp = requests.post(
-             f"https://api-inference.huggingface.co/models/{EMBED_MODEL}",
-             headers=headers,
-             json={"inputs": texts}
-         )
-         data = resp.json()
-         if isinstance(data[0][0], list):
-             data = [np.mean(np.array(e), axis=0) for e in data]
-         return np.array(data)
-
-
- def build_faiss_index(chunks: List[Dict[str, Any]]):
-     """Build FAISS index for chunks."""
-     texts = [c["content"] for c in chunks]
-     embeddings = embed_texts(texts)
-     if embeddings.size == 0:
-         st.error("❌ Embedding generation failed.")
-         return
-     dim = embeddings.shape[1]
-     index = faiss.IndexFlatL2(dim)
-     index.add(embeddings.astype("float32"))
-     faiss.write_index(index, INDEX_FILE)
-     with open(META_FILE, "w") as f:
-         json.dump(chunks, f)
-     with open(EMB_DIM_FILE, "w") as f:
-         json.dump({"dim": dim}, f)
-     st.success(f"✅ Indexed {len(chunks)} chunks.")
-
-
- def load_faiss_index():
-     if not os.path.exists(INDEX_FILE) or not os.path.exists(META_FILE):
-         return None, None
-     index = faiss.read_index(INDEX_FILE)
-     with open(META_FILE) as f:
-         meta = json.load(f)
-     return index, meta
-
-
- def search_index(query: str, index, meta, top_k: int, threshold: float):
-     query_emb = embed_texts([query])
-     distances, indices = index.search(query_emb.astype("float32"), top_k)
-     results = []
-     for i, dist in zip(indices[0], distances[0]):
-         if i < len(meta):
-             r = meta[i]
-             r["distance"] = float(dist)
-             results.append(r)
-     return results
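- # Note: IndexFlatL2 returns squared L2 distances, so smaller values mean closer
- # matches; this version accepts `threshold` without applying it. A sketch of one
- # way it could be honored (illustrative, not in the original flow):
- #     results = [r for r in results if r["distance"] <= threshold]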
-
-
- def generate_answer(context: str, query: str) -> str:
-     """Generate model-based answer using selected open-source model."""
-     prompt = f"""
- You are a precise academic assistant specialized in university policy.
- Use only the *USTP Student Handbook 2023 Edition* below.
- If the answer is not in the text, reply:
- "The handbook does not specify that."
-
- ---
- 📘 Context:
- {context}
- ---
- 🧭 Question:
- {query}
- ---
- 🎯 Instructions:
- - Be factual and concise.
- - Cite the correct printed page number.
- - Never make assumptions.
  """
-
      try:
-         response = hf_client.text_generation(
-             model=DEFAULT_MODEL,
-             prompt=prompt,
-             max_new_tokens=400,
-             temperature=0.25
-         )
-         return response if isinstance(response, str) else str(response)
-     except Exception as e1:
-         try:
-             chat_response = hf_client.chat.completions.create(
-                 model=DEFAULT_MODEL,
-                 messages=[{"role": "user", "content": prompt}],
-                 max_tokens=400
              )
-             return chat_response.choices[0].message["content"]
-         except Exception as e2:
-             return f"⚠️ Error generating answer: {e2}"
-
-
- def ensure_index():
-     """Ensure FAISS index exists or rebuild."""
-     if regenerate_index or not os.path.exists(INDEX_FILE):
-         pdfs = find_handbook()
-         if not pdfs:
-             st.stop()
-         st.info("📄 Extracting handbook text...")
-         pages = load_pdf_texts(pdfs)
-         chunks = chunk_text(pages, chunk_size_chars, chunk_overlap)
-         build_faiss_index(chunks)
-     index, meta = load_faiss_index()
-     if index is None or meta is None:
-         st.error("❌ Could not load FAISS index.")
-         st.stop()
-     return index, meta
-
- # =============================================================
- # 💬 Chat Interface
- # =============================================================
- st.divider()
- st.subheader("💬 Ask about the Handbook")
-
- if "history" not in st.session_state:
-     st.session_state.history = []
-
- user_query = st.text_input("Enter your question:")
- index, meta = ensure_index()
-
- if st.button("Ask") and user_query.strip():
-     results = search_index(user_query, index, meta, top_k, similarity_threshold)
-     if not results:
-         st.warning("No relevant section found in the handbook.")
-     else:
-         context = "\n\n".join(
-             [f"(📄 Page {r['page']})\n{r['content']}" for r in results]
-         )
-         answer = generate_answer(context, user_query)
-         st.session_state.history.append({
-             "user": user_query,
-             "assistant": answer,
-             "timestamp": time.time()
-         })
-
- # ✅ Ensure unique keys to prevent StreamlitDuplicateElementId
- for i, chat in enumerate(st.session_state.history):
-     st_message(chat["user"], is_user=True, key=f"user_{i}")
-     st_message(chat["assistant"], key=f"assistant_{i}")
-
- st.caption("⚡ Powered by FAISS + Open Source Models + Accurate Page Referencing")

  import os
+ import pandas as pd
  import numpy as np
  import streamlit as st
  from dotenv import load_dotenv
  from huggingface_hub import InferenceClient, login
+ import google.generativeai as genai
+ from io import StringIO
+ import time
+ import requests

+ # ======================================================
+ # ⚙️ APP CONFIGURATION
+ # ======================================================
+ st.set_page_config(page_title="📊 Smart Data Analyst Pro", layout="wide")
+ st.title("📊 Smart Data Analyst Pro (Chat Mode)")
+ st.caption("Chat with your dataset — AI cleans, analyzes, and visualizes data. Hugging Face + Gemini compatible.")

+ # ======================================================
+ # 🔐 Load Environment Variables
+ # ======================================================
  load_dotenv()
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

  if not HF_TOKEN:
+     st.error("❌ Missing HF_TOKEN. Please set it in your .env file.")
  else:
+     login(token=HF_TOKEN)

+ if GEMINI_API_KEY:
+     genai.configure(api_key=GEMINI_API_KEY)
+ else:
+     st.warning("⚠️ Gemini API key missing. Gemini 2.5 Flash will not work.")

+ # ======================================================
+ # 🧠 MODEL SETUP
+ # ======================================================
  with st.sidebar:
+     st.header("⚙️ Model Settings")
+
+     CLEANER_MODEL = st.selectbox(
+         "Select Cleaner Model:",
+         [
+             "Qwen/Qwen2.5-Coder-14B",
+             "mistralai/Mistral-7B-Instruct-v0.3"
+         ],
+         index=0
      )
+
+     ANALYST_MODEL = st.selectbox(
+         "Select Analysis Model:",
+         [
+             "Gemini 2.5 Flash (Google)",
+             "Qwen/Qwen2.5-14B-Instruct",
+             "mistralai/Mistral-7B-Instruct-v0.3",
+             "HuggingFaceH4/zephyr-7b-beta"
+         ],
+         index=0
+     )

+     temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
+     max_tokens = st.slider("Max Tokens", 128, 4096, 1024)

+ hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
+ hf_analyst_client = None
+ if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
+     hf_analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)

+ # ======================================================
+ # 🧩 SAFE GENERATION FUNCTION
+ # ======================================================
+ def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512, retries=2):
+     """Try text generation, with retry + fallback on service errors."""
+     for attempt in range(retries + 1):
+         try:
+             resp = client.text_generation(
+                 prompt,
+                 temperature=temperature,
+                 max_new_tokens=max_tokens,
+                 return_full_text=False,
+             )
+             return resp.strip()
+         except Exception as e:
+             err = str(e)
+             # 🩹 FIX: Handle common server overloads gracefully
+             if "503" in err or "Service Temporarily Unavailable" in err:
+                 time.sleep(2)
+                 if attempt < retries:
+                     continue  # retry
+                 else:
+                     return "⚠️ The Hugging Face model is temporarily unavailable. Please try again or switch to Gemini."
+             elif "Supported task: conversational" in err:
+                 chat_resp = client.chat_completion(
+                     messages=[{"role": "user", "content": prompt}],
+                     max_tokens=max_tokens,
+                     temperature=temperature,
+                 )
+                 return chat_resp["choices"][0]["message"]["content"].strip()
+             else:
+                 raise e
+     return "⚠️ Failed after retries."
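+ # Usage sketch (illustrative values, not part of the app flow): the wrapper is
+ # called the same way for cleaning and analysis, e.g.
+ #     reply = safe_hf_generate(hf_cleaner_client, "Say hello.", temperature=0.2, max_tokens=64)
+ # A 503 triggers up to `retries` re-attempts with a 2-second pause; a
+ # "Supported task: conversational" error reroutes the same prompt through
+ # chat_completion instead of text_generation.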
+
+ # ======================================================
+ # 🧩 DATA CLEANING
+ # ======================================================
+ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
+     df = df.copy()
+     df.dropna(axis=1, how="all", inplace=True)
+     df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
+     for col in df.columns:
+         if df[col].dtype == "O":
+             if not df[col].mode().empty:
+                 df[col].fillna(df[col].mode()[0], inplace=True)
+             else:
+                 df[col].fillna("Unknown", inplace=True)
+         else:
+             df[col].fillna(df[col].median(), inplace=True)
+     df.drop_duplicates(inplace=True)
+     return df
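+ # Worked example (hypothetical column name): fallback_clean renames a column
+ # " Total Sales " to "total_sales", fills missing object values with the column
+ # mode (or "Unknown" for all-null columns), fills missing numerics with the
+ # median, and drops duplicate rows; a deterministic alternative for when the
+ # AI cleaning path is skipped or fails.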
+
+ def ai_clean_dataset(df: pd.DataFrame) -> tuple[pd.DataFrame, str]:
+     if len(df) > 50:
+         return df, "⚠️ AI cleaning skipped: dataset has more than 50 rows."
+     csv_text = df.to_csv(index=False)
+     prompt = f"""
+ You are a professional data cleaning assistant.
+ Clean and standardize the dataset below dynamically:
+ 1. Handle missing values
+ 2. Fix column name inconsistencies
+ 3. Convert data types (dates, numbers, categories)
+ 4. Remove irrelevant or duplicate rows
+ Return ONLY a valid CSV text (no markdown, no explanations).
+
+ Dataset:
+ {csv_text}
+ """
      try:
+         cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
+         cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
+         cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
+         cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
+         return cleaned_df, "✅ AI cleaning completed successfully."
+     except Exception as e:
+         return df, f"⚠️ AI cleaning failed: {str(e)}"
+
+ # ======================================================
+ # 🧩 DATA SUMMARY (Token-efficient)
+ # ======================================================
+ def summarize_for_analysis(df: pd.DataFrame, sample_rows=10) -> str:
+     summary = [f"Rows: {len(df)}, Columns: {len(df.columns)}"]
+     for col in df.columns:
+         non_null = int(df[col].notnull().sum())
+         if pd.api.types.is_numeric_dtype(df[col]):
+             desc = df[col].describe().to_dict()
+             summary.append(f"- {col}: mean={desc.get('mean', np.nan):.2f}, median={df[col].median():.2f}, non_null={non_null}")
+         else:
+             top = df[col].value_counts().head(3).to_dict()
+             summary.append(f"- {col}: top_values={top}, non_null={non_null}")
+     sample = df.head(sample_rows).to_csv(index=False)
+     summary.append("--- Sample Data ---")
+     summary.append(sample)
+     return "\n".join(summary)
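+ # Illustrative output shape (hypothetical column names and values):
+ #     Rows: 500, Columns: 2
+ #     - revenue: mean=120.50, median=98.00, non_null=495
+ #     - region: top_values={'NA': 210, 'EU': 180, 'APAC': 110}, non_null=500
+ #     --- Sample Data ---
+ #     (first 10 rows as CSV)
+ # Only per-column statistics plus the sample rows are sent to the model, which
+ # keeps the prompt small even for large datasets.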
+
+ # ======================================================
+ # 🧠 ANALYSIS FUNCTION
+ # ======================================================
+ def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
+     prompt_summary = summarize_for_analysis(df)
+     prompt = f"""
+ You are a professional data analyst.
+ Analyze the dataset '{dataset_name}' and answer the user's question.
+
+ --- DATA SUMMARY ---
+ {prompt_summary}
+
+ --- USER QUESTION ---
+ {user_query}
+
+ Respond with:
+ 1. Key insights and patterns
+ 2. Quantitative findings
+ 3. Notable relationships or anomalies
+ 4. Data-driven recommendations
  """
      try:
+         if ANALYST_MODEL == "Gemini 2.5 Flash (Google)":
+             response = genai.GenerativeModel("gemini-2.5-flash").generate_content(
+                 prompt,
+                 generation_config={
+                     "temperature": temperature,
+                     "max_output_tokens": max_tokens
+                 }
              )
+             return response.text if hasattr(response, "text") else "No valid text response."
+         else:
+             # 🩹 FIX: wrap in retry-aware generator
+             result = safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
+             # fallback to Gemini if Hugging Face failed entirely
+             if "temporarily unavailable" in result.lower() and GEMINI_API_KEY:
+                 alt = genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt)
+                 return f"🔄 Fallback to Gemini:\n\n{alt.text}"
+             return result
+     except Exception as e:
+         # 🩹 FIX: fallback if server rejects or 5xx
+         if "503" in str(e) and GEMINI_API_KEY:
+             response = genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt)
+             return f"🔄 Fallback to Gemini due to 503 error:\n\n{response.text}"
+         return f"⚠️ Analysis failed: {str(e)}"
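+ # Resolution order, as implemented above: Gemini when selected in the sidebar;
+ # otherwise the chosen Hugging Face model via safe_hf_generate, with Gemini as
+ # the fallback on persistent 503s when a key is configured. Example call with
+ # hypothetical arguments:
+ #     query_analysis_model(cleaned_df, "Which column drives revenue?", "sales.csv")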
+
+ # ======================================================
+ # 🚀 MAIN CHATBOT LOGIC
+ # ======================================================
+ uploaded = st.file_uploader("📎 Upload CSV or Excel file", type=["csv", "xlsx"])
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ if uploaded:
+     df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
+
+     with st.spinner("🧼 Cleaning your dataset..."):
+         cleaned_df, cleaning_status = ai_clean_dataset(df)
+
+     st.subheader("✅ Cleaning Status")
+     st.info(cleaning_status)
+     st.subheader("📊 Dataset Preview")
+     st.dataframe(cleaned_df.head(), use_container_width=True)
+
+     st.subheader("💬 Chat with Your Dataset")
+     for msg in st.session_state.messages:
+         with st.chat_message(msg["role"]):
+             st.markdown(msg["content"])
+
+     if user_query := st.chat_input("Ask something about your dataset..."):
+         st.session_state.messages.append({"role": "user", "content": user_query})
+         with st.chat_message("user"):
+             st.markdown(user_query)
+
+         with st.chat_message("assistant"):
+             with st.spinner("🤖 Analyzing..."):
+                 result = query_analysis_model(cleaned_df, user_query, uploaded.name)
+                 st.markdown(result)
+         st.session_state.messages.append({"role": "assistant", "content": result})
+ else:
+     st.info("📥 Upload a dataset to begin chatting with your AI analyst.")