File size: 4,505 Bytes
793855f
 
 
 
 
44669ca
97d5e2d
ce96f36
44669ca
 
97d5e2d
793855f
 
 
 
 
 
44669ca
97d5e2d
 
 
 
 
 
44669ca
97d5e2d
 
 
44669ca
97d5e2d
 
44669ca
 
97d5e2d
44669ca
97d5e2d
44669ca
97d5e2d
 
 
 
44669ca
97d5e2d
793855f
97d5e2d
44669ca
97d5e2d
 
 
 
 
 
 
 
 
44669ca
 
97d5e2d
 
 
 
 
 
 
 
 
 
44669ca
97d5e2d
 
ce96f36
793855f
 
 
97d5e2d
 
 
 
ce96f36
97d5e2d
 
 
 
793855f
97d5e2d
 
 
 
793855f
97d5e2d
 
793855f
97d5e2d
793855f
97d5e2d
793855f
97d5e2d
 
793855f
97d5e2d
 
 
793855f
97d5e2d
793855f
97d5e2d
 
 
 
 
 
 
 
 
 
 
793855f
 
 
 
 
 
 
 
 
 
97d5e2d
 
793855f
97d5e2d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# ======================================================
# πŸ“Š Smart Data Analyst Pro (Chat Mode)
# Frontend & Orchestration β€” Uses utils.py for backend logic
# ======================================================

import os
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import InferenceClient, login
import google.generativeai as genai

# 🧠 Import backend logic
from utils import (
    ai_clean_dataset,
    query_analysis_model,
)

# ======================================================
# βš™οΈ APP CONFIGURATION
# ======================================================
st.set_page_config(page_title="πŸ“Š Smart Data Analyst Pro", layout="wide")
st.title("πŸ“Š Smart Data Analyst Pro (Chat Mode)")
st.caption("Chat with your dataset β€” AI cleans, analyzes, and visualizes data. Hugging Face + Gemini compatible.")

# ======================================================
# πŸ” Load Environment Variables
# ======================================================
load_dotenv()
# Either env var name is accepted for the Hugging Face token.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

if not HF_TOKEN:
    st.error("❌ Missing HF_TOKEN. Please set it in your .env file.")
    # Every Hugging Face client created below needs this token; without it
    # the script would only fail later with an opaque API error, so halt now.
    st.stop()
login(token=HF_TOKEN)

# Gemini is optional: warn (don't stop) so the HF-only models still work.
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
else:
    st.warning("⚠️ Gemini API key missing. Gemini 2.5 Flash will not work.")

# ======================================================
# 🧠 MODEL SETTINGS (SIDEBAR)
# ======================================================
# Hugging Face model ids offered for the dataset-cleaning pass.
_CLEANER_CHOICES = [
    "Qwen/Qwen2.5-Coder-14B",
    "mistralai/Mistral-7B-Instruct-v0.3",
]
# Models offered for answering chat questions (first entry uses the Gemini SDK).
_ANALYST_CHOICES = [
    "Gemini 2.5 Flash (Google)",
    "Qwen/Qwen2.5-14B-Instruct",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "HuggingFaceH4/zephyr-7b-beta",
]

with st.sidebar:
    st.header("βš™οΈ Model Settings")

    # Which model cleans the uploaded dataset.
    CLEANER_MODEL = st.selectbox("Select Cleaner Model:", _CLEANER_CHOICES, index=0)

    # Which model answers questions about the cleaned data.
    ANALYST_MODEL = st.selectbox("Select Analysis Model:", _ANALYST_CHOICES, index=0)

    # Generation knobs forwarded to the analysis model.
    temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
    max_tokens = st.slider("Max Tokens", 128, 4096, 1024)

# ======================================================
# 🧩 MODEL CLIENTS
# ======================================================
# The cleaner always runs on Hugging Face. The analyst client is only built
# for HF-hosted models; the Gemini option is handled via its own SDK, so it
# gets no InferenceClient (None).
hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
hf_analyst_client = (
    InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)
    if ANALYST_MODEL != "Gemini 2.5 Flash (Google)"
    else None
)

# ======================================================
# πŸš€ MAIN CHATBOT LOGIC
# ======================================================
uploaded = st.file_uploader("πŸ“Ž Upload CSV or Excel file", type=["csv", "xlsx"])

if "messages" not in st.session_state:
    st.session_state.messages = []

if uploaded:
    # Load dataset β€” the file extension selects the parser.
    df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)

    # 🧼 AI-BASED CLEANING (cached in session_state).
    # Streamlit reruns this entire script on every chat interaction; without
    # this cache the slow remote cleaning model would be called again on every
    # single message. Re-clean only when the file or cleaner model changes.
    cache_key = (uploaded.name, CLEANER_MODEL)
    if st.session_state.get("clean_cache_key") != cache_key:
        with st.spinner("🧼 Cleaning your dataset..."):
            cleaned_df, cleaning_status = ai_clean_dataset(df, hf_cleaner_client)
        st.session_state.clean_cache_key = cache_key
        st.session_state.cleaned_df = cleaned_df
        st.session_state.cleaning_status = cleaning_status
        # A new dataset makes the previous conversation's context stale.
        st.session_state.messages = []
    cleaned_df = st.session_state.cleaned_df
    cleaning_status = st.session_state.cleaning_status

    # Display cleaning info
    st.subheader("βœ… Cleaning Status")
    st.info(cleaning_status)

    st.subheader("πŸ“Š Dataset Preview")
    st.dataframe(cleaned_df.head(), use_container_width=True)

    # πŸ’¬ Chat interface β€” replay the stored history, then handle a new prompt.
    st.subheader("πŸ’¬ Chat with Your Dataset")

    for msg in st.session_state.messages:
        with st.chat_message(msg["role"]):
            st.markdown(msg["content"])

    if user_query := st.chat_input("Ask something about your dataset..."):
        st.session_state.messages.append({"role": "user", "content": user_query})
        with st.chat_message("user"):
            st.markdown(user_query)

        with st.chat_message("assistant"):
            with st.spinner("πŸ€– Analyzing..."):
                result = query_analysis_model(
                    cleaned_df,
                    user_query,
                    uploaded.name,
                    ANALYST_MODEL,
                    hf_client=hf_analyst_client,
                    temperature=temperature,
                    max_tokens=max_tokens,
                    gemini_api_key=GEMINI_API_KEY,
                )
                st.markdown(result)
                st.session_state.messages.append({"role": "assistant", "content": result})

else:
    st.info("πŸ“₯ Upload a dataset to begin chatting with your AI analyst.")