# Page status banner captured with this copy of the file: "Spaces: Runtime error".
| import json | |
| import os | |
| import streamlit as st | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import plotly.express as px | |
| from utils import load_data_pickle, load_numpy, check_password | |
| from st_pages import add_indentation | |
# Page prelude for the topic-modeling demo: configures the page, gates it behind
# the password check, then renders the introduction, the data preview and the
# model description.
#
# NOTE(review): this copy of the file lost its indentation, so the extent of the
# original `if check_password():` body cannot be seen; presumably it wrapped the
# whole page. The guard clause below gates everything that follows in the same
# way while keeping the rest of the page at module level — confirm.
# NOTE(review): the emoji in the user-facing strings look mis-encoded in this
# copy (e.g. "π"); they are reproduced as found — confirm against the original.
st.set_page_config(layout="wide")

# Halt this script run when the password check fails, so nothing below renders.
if not check_password():
    st.stop()

##### INTRODUCTION
st.title("Topic Modeling π")
st.markdown("### What is Topic Modeling ?")
st.info("""
Topic modeling is a text-mining technique used to **identify topics within a collection of documents**.
It is a useful tool for organizing and summarizing vast amounts of textual data as well as automate the discovery of hidden thematic structures in a corpus of text data, without any prior knowledge.
""")
st.markdown(" ")

# Outer columns are only spacers used to centre the illustration.
_, col, _ = st.columns([0.25,0.4,0.35])
with col:
    st.image("images/topic_modeling.gif", caption="An example of Topic Modeling", use_column_width=True)

st.markdown("""Common applications of Topic Modeling include:
- **Search Engine Optimization (SEO): π** Determine the main topics/keywords present on a website to optimize content and improve search engine rankings.
- **Customer Support** βοΈ: Analyze customer support tickets, emails, and chat transcripts to identify common questions and complaints.
- **Fraud Detection and Risk Management: π¦** : Detect fraudulent activities, compliance violations, and operational risks by analyzing textual data such as transaction descriptions, audit reports and regulatory filings.
- **Market Research π**: Gain competitive intelligence and make informed decisions regarding product development, marketing strategies, and market positioning by analyzing research reports and industry news.
""")
st.markdown(" ")
st.divider()

##### USE CASE HEADER
st.markdown("# Topic modeling on product descriptions ποΈ")
st.markdown("""In this use case, we will use a **topic model** to categorize around **20 000 e-commerce products** as well as identify
the main types of products sold.""")

_, col, _ = st.columns([0.2,0.6,0.2])
with col:
    st.image("images/e-commerce.jpg")
st.markdown(" ")

# Load data
path_data = "data/topic-modeling"

# Load the topic data (read later by the results section of the page)
topic_info = load_data_pickle(path_data, 'topic_info.pkl')

##### ABOUT THE USE CASE
st.markdown("#### About the data π")
st.markdown("""You were provided a dataset with around 20 000 products from a large e-commerce retailer. <br>
This dataset contains the products' title and description on the website.""", unsafe_allow_html=True)
st.info("""**Note**: Some of the descriptions featured below are shown in their 'raw' form, meaning they contain unprocessed html code and special characters.
These descriptions were first 'cleaned' (by removing unwanted characters) before being used in the model.""")

# NOTE(review): the widget key looks copied from another page ("credit_score");
# it is kept unchanged so existing widget state is unaffected — consider renaming.
see_data = st.checkbox('**See the data**', key="credit_score_data")
if see_data:
    st.markdown(" ")
    st.warning("This view only shows a subset of the 20 000 product descriptions used.")
    # The preview file is only loaded when the user asks to see the data.
    data = load_data_pickle(path_data,"data-tm-view.pkl")
    data_show = data[["TITLE", "DESCRIPTION"]]
    st.dataframe(data_show.reset_index(drop=True), use_container_width=True)

st.markdown(" ")
st.markdown(" ")

# RUN THE MODEL
st.markdown("#### About the model π")
st.markdown("""**Topic models** can be seen as unsupervised clustering models where text documents are grouped into topics/clusters based on their similarities.
We will use here a topic model to automatically categorize/group the retailer's products based on their description,
as well as understand what are the most common type of products being sold.""", unsafe_allow_html=True)
st.info("""**Note**: In topic modeling, the final topics are represented by the model using 'top words'.
A topic's top words are chosen based on how much they appear in the topic's documents.""")
def _render_overall_results():
    """Render the "Overall results" tab.

    Shows one row per topic from the module-level ``topic_info`` table: its
    title, the first five entries of its ``Representation`` and its
    ``Percentage`` drawn as a 0-100 progress bar.
    """
    st.header("Overall results")
    st.markdown("""This tab showcases all of the **topics identified** within the product dataset. <br>
Each topic's <b>most significant words</b> (top words), as well as the <b>proportion</b> of products that were assigned to it are given.""",
                unsafe_allow_html=True)

    # Work on a copy so the shared topic_info table is never modified.
    summary_table = topic_info[['Title', 'Representation', 'Percentage']].copy()
    summary_table['Top Words'] = summary_table['Representation'].apply(lambda words: words[:5])
    summary_table = summary_table[["Title", "Top Words", "Percentage"]].rename(
        columns={"Title": "Topic Title"}
    )

    st.data_editor(
        summary_table,
        column_config={
            "Percentage": st.column_config.ProgressColumn(
                "Proportion %",
                help="Proportion of documents within each topic",
                format="%.1f%%",
                min_value=0,
                max_value=100)},
        use_container_width=True
    )
    st.info("""**Note**: The topic 'titles' were not provided by the model but instead were generated by feeding the topic's top words to an LLM.
Traditional topic models define topics using representative/top words but weren't built to generate a specific title to each topic.""")


def _render_topic_details():
    """Render the "Specific Topic Details" tab.

    Lets the user pick a topic title, then shows a bar chart of that topic's
    top words and a bar chart of the topics listed as similar to it.
    """
    # Load top words (explicit encoding so decoding does not depend on the locale)
    with open(os.path.join(path_data, "topics_top_words.json"), "r", encoding="utf-8") as json_file:
        top_words_dict = json.load(json_file)

    # Load similarity df and scores
    similarity_df = load_data_pickle(path_data, "similarity_topic_df.pkl")
    similarity_scores = load_numpy(path_data, "similarity_topic_scores.npy")

    st.header("Learn more about each topic")
    st.markdown("""You can **select a specific topic** to get more information on its **top words**, as well as the
**other topics that are most similar to it**.""")
    st.markdown(" ")

    # Select topic
    topics = topic_info["Title"].sort_values().to_list()
    selected_topic = st.selectbox('**Select a Topic**', topics)
    # NOTE(review): the +1 offset presumably skips a leading row of the score
    # matrix that does not correspond to a titled topic — TODO confirm.
    selected_topic_id = topic_info[topic_info['Title'] == selected_topic]["Topic"].to_numpy()[0] + 1
    st.markdown(" ")

    words_col, similar_col = st.columns(2)

    # Top words
    with words_col:
        top_words_df = pd.DataFrame(top_words_dict[selected_topic], columns=["Word", "Importance"])
        top_words_df = top_words_df.sort_values(by=["Importance"], ascending=False)
        top_words_df["Importance"] = top_words_df["Importance"].round(2)

        fig = px.bar(top_words_df, x='Word', y='Importance', color="Importance", title="Top words", text_auto=True)
        fig.update_layout(yaxis=dict(range=[0, 1]), xaxis_title="", showlegend=False)
        st.plotly_chart(fig, use_container_width=True)
        st.info("""**Note:** Each score was computed based on the words importance in the particular topic using
a popular metric in NLP called TF-IDF (Term Frequency-Inverse Document Frequency). """)

    # Similar topics to the selected topic
    with similar_col:
        # .copy() makes the filtered rows an independent frame, so adding a
        # column below is not a chained assignment on a slice of similarity_df.
        similar_topics = similarity_df.loc[similarity_df["Topic"] == selected_topic].copy()
        similar_topics["scores"] = 100*similarity_scores[selected_topic_id, :]
        # Positional rename: assumes the frame now has exactly four columns.
        similar_topics.columns = ["Original Topic", "Rank", "Topic", "Similarity (%)"]

        fig = px.bar(similar_topics, y='Similarity (%)', x='Topic', color="Topic", title="Five most similar topics", text_auto=True)
        fig.update_layout(yaxis=dict(range=[0, 100]),
                          xaxis_title="",
                          showlegend=False)
        st.plotly_chart(fig, use_container_width=True)
        st.info("""**Note:** Topics with a high similarity score can be merged together as to reduce the number of topics, as
well as improve the topics' coherence.""")


def show_results():
    """Render the results section of the page as two tabs.

    The first tab summarises every topic; the second gives details for one
    topic chosen by the user. Reads the module-level ``topic_info`` and
    ``path_data``. Returns ``None``.
    """
    st.markdown("#### See the results βοΈ")
    overall_tab, details_tab = st.tabs(["Overall results", "Specific Topic Details"])
    st.markdown(" ")

    # Tab 1: Summary Table
    with overall_tab:
        _render_overall_results()

    # Tab 2: Specific Topic Details
    with details_tab:
        _render_topic_details()
# Remember across Streamlit reruns whether the results were already revealed.
if 'button_clicked' not in st.session_state:
    st.session_state['button_clicked'] = False


def run_model():
    """Show the "Run the model" button and, once it has been pressed, the results.

    The results are rendered on the run in which the button is clicked and on
    every later rerun of the session, because the click is recorded in
    ``st.session_state['button_clicked']``. Returns ``None``.
    """
    clicked = st.button("**Run the model**", type="primary")
    st.markdown(" ")
    st.markdown(" ")

    # .get() keeps this function usable even if the flag was never initialised.
    if clicked or st.session_state.get('button_clicked', False):
        show_results()
        # Set only after rendering, so a failed first render does not latch the flag.
        st.session_state['button_clicked'] = True


run_model()