"""Gradio tab for browsing, contributing and editing entries of the "models" config."""

import math
from datetime import datetime, timezone
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
from datasets import Dataset, load_dataset

from constants import format_dataframe_for_display, format_dataframe_for_html_display

# Dataset configuration
DATASET_NAME = "somosnlp/recursos-pln-es"
CONFIG_NAME = "models"
RESOURCE_TYPE = "models"
RESOURCE_TITLE = "Models"

# Columns used for the empty fallback DataFrame when the dataset cannot be loaded.
_COLUMNS = [
    "familia",
    "available_sizes",
    "hf_collection_url",
    "website_url",
    "paper_url",
    "submitted_by",
    "date_submitted",
]

# Arguments forwarded to ``format_dataframe_for_display`` for the "All" table.
_URL_COLUMNS = ["hf_collection_url", "website_url", "paper_url"]
_HIDDEN_COLUMNS = ["date_submitted"]

# Order in which the editable fields travel between the edit-tab handlers.
_EDITABLE_FIELDS = (
    "familia",
    "available_sizes",
    "hf_collection_url",
    "website_url",
    "paper_url",
)

_TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"


def _is_missing(value) -> bool:
    """Return True for the scalar "no value" markers pandas may hand back."""
    if value is None or value is pd.NA:
        return True
    return isinstance(value, float) and math.isnan(value)


def _clean(value) -> str:
    """Return *value* as a stripped string, mapping missing values to ``""``."""
    return "" if _is_missing(value) else str(value).strip()


def _utc_timestamp() -> str:
    """Return the current UTC time formatted like the ``date_submitted`` column."""
    return datetime.now(timezone.utc).strftime(_TIMESTAMP_FORMAT)


def _locate_entry(df: pd.DataFrame, identifier: str):
    """Return the index label of the first row matching *identifier*, or None.

    ``familia`` is tried first and ``hf_collection_url`` second, so that the
    row loaded into the edit form and the row later updated are the same one.
    """
    for column in ("familia", "hf_collection_url"):
        hits = df.index[df[column] == identifier]
        if len(hits) > 0:
            return hits[0]
    return None


def _format_for_table(df: pd.DataFrame):
    """Format *df* for the "All" table via ``format_dataframe_for_display``."""
    return format_dataframe_for_display(
        df,
        url_columns=_URL_COLUMNS,
        hide_columns=_HIDDEN_COLUMNS,
    )


def load_data() -> pd.DataFrame:
    """Load data from HuggingFace dataset or return empty DataFrame."""
    try:
        dataset = load_dataset(DATASET_NAME, CONFIG_NAME, split="train")
        return dataset.to_pandas()
    except Exception as e:
        # Best effort: the UI should still render when the dataset is unavailable.
        print(f"Could not load {RESOURCE_TYPE} dataset: {e}")
        return pd.DataFrame(columns=_COLUMNS)


def search_and_filter_data(df: pd.DataFrame, search_query: str) -> pd.DataFrame:
    """Return the rows of *df* where any cell contains *search_query*.

    Matching is case-insensitive and literal (the query is not a regular
    expression). A missing or whitespace-only query, or an empty frame,
    returns *df* unchanged.
    """
    needle = _clean(search_query)
    if not needle or df.empty:
        return df

    # regex=False: the query comes straight from a text box, so characters
    # such as "(" or "+" must be matched literally instead of raising re.error.
    cell_hits = df.astype(str).apply(
        lambda column: column.str.contains(needle, case=False, regex=False)
    )
    return df[cell_hits.any(axis=1)]


def validate_url(url: str) -> bool:
    """Validate if a string is an http(s) URL with a host part.

    Empty values are accepted because most URL fields are optional.
    """
    if not url:
        return True  # Empty URLs are allowed for optional fields
    if not url.startswith(("http://", "https://")):
        return False
    try:
        # Reject scheme-only strings such as "http://".
        return bool(urlparse(url).netloc)
    except ValueError:
        # urlparse raises on malformed authorities (e.g. an unclosed "[").
        return False


def validate_sizes(sizes_str: str) -> bool:
    """Validate comma-separated list of finite numbers (model sizes).

    Empty input is accepted. ``nan`` and ``inf`` parse as floats but are not
    meaningful sizes, so they are rejected.
    """
    if not sizes_str:
        return True  # Empty is allowed
    try:
        return all(math.isfinite(float(part.strip())) for part in sizes_str.split(","))
    except ValueError:
        return False


def submit_resource(
    familia: str,
    available_sizes: str,
    hf_collection_url: str,
    website_url: str,
    paper_url: str,
    profile: gr.OAuthProfile | None,
) -> str:
    """Submit a new resource to the corresponding dataset.

    Inputs are stripped before validation and storage. Returns a
    user-facing status message; errors are reported in the message rather
    than raised.
    """
    # Login required
    if profile is None:
        return "❌ Error: You need to be logged in to submit a resource."

    familia = _clean(familia)
    available_sizes = _clean(available_sizes)
    hf_collection_url = _clean(hf_collection_url)
    website_url = _clean(website_url)
    paper_url = _clean(paper_url)

    # Validate required fields
    if not hf_collection_url:
        return "❌ Error: Hugging Face Collection URL is required."

    # Validate URLs
    urls_to_check = [
        ("Hugging Face Collection URL", hf_collection_url),
        ("Website URL", website_url),
        ("Paper URL", paper_url),
    ]
    for url_name, url_value in urls_to_check:
        if url_value and not validate_url(url_value):
            return f"❌ Error: {url_name} must be a valid URL starting with http:// or https://"

    # Validate sizes format
    if available_sizes and not validate_sizes(available_sizes):
        return "❌ Error: Available sizes must be a comma-separated list of numbers (e.g., '0.1, 1.3, 7, 14')"

    try:
        username = profile.username

        # Create new row data
        new_row = pd.DataFrame(
            [
                {
                    "familia": familia,
                    "available_sizes": available_sizes,
                    "hf_collection_url": hf_collection_url,
                    "website_url": website_url,
                    "paper_url": paper_url,
                    "submitted_by": username,
                    "date_submitted": _utc_timestamp(),
                }
            ]
        )

        # Try to load existing dataset, or create new one
        try:
            existing_df = load_dataset(
                DATASET_NAME, CONFIG_NAME, split="train"
            ).to_pandas()
        except Exception as load_error:
            # NOTE(review): any load failure (including a transient one) ends
            # up pushing a dataset that contains only the new row. Consider
            # narrowing this to the "config does not exist yet" error once the
            # exact exception type of the installed `datasets` is confirmed.
            print(
                f"Could not load {RESOURCE_TYPE} dataset, creating a new one: {load_error}"
            )
            updated_df = new_row
        else:
            updated_df = pd.concat([existing_df, new_row], ignore_index=True)

        # Convert back to Dataset and push to hub
        Dataset.from_pandas(updated_df).push_to_hub(
            DATASET_NAME,
            config_name=CONFIG_NAME,
            commit_message=f"Add {familia or 'model'} by {username}",
            # NOTE(review): token=True presumably selects the locally stored
            # Hub token, not the logged-in user's OAuth token — confirm.
            token=True,
        )

        return "✅ Success: Model has been submitted successfully!"

    except Exception as e:
        return f"❌ Error: Failed to submit resource. {e}"


def create_all_tab():
    """Create the 'All' tab for this resource type and return its table."""
    with gr.TabItem("📋 All", id=f"{RESOURCE_TYPE}_all"):
        gr.Markdown(f"### All {RESOURCE_TITLE}")

        search_box = gr.Textbox(
            placeholder=f"Search {RESOURCE_TYPE}...",
            label="Filter the table",
            show_label=False,
        )

        def get_formatted_data():
            """Reload the dataset and format it for the table."""
            return _format_for_table(load_data())

        table = gr.Dataframe(
            value=get_formatted_data(),
            label=RESOURCE_TITLE,
            show_label=False,
            interactive=False,
            wrap=True,
            datatype="markdown",
        )

        def search_and_format(query):
            """Reload the dataset, filter it by *query* and format it."""
            return _format_for_table(search_and_filter_data(load_data(), query))

        # Connect search functionality
        search_box.change(
            fn=search_and_format,
            inputs=search_box,
            outputs=table,
        )

        # Refresh button to reload data
        refresh_btn = gr.Button("🔄 Refresh Data", variant="secondary")
        refresh_btn.click(fn=get_formatted_data, outputs=table)

    return table


def create_contribute_tab():
    """Create the 'Contribute' tab for this resource type.

    Returns the five input textboxes (familia, sizes, collection URL,
    website URL, paper URL), the submit button and the result message
    component, in that order.
    """
    with gr.TabItem("➕ Contribute", id=f"{RESOURCE_TYPE}_contribute"):
        # RESOURCE_TITLE is plural; dropping the last character gives the singular.
        gr.Markdown(f"### Contribute a New {RESOURCE_TITLE[:-1]}")

        # Login section
        gr.Markdown("Please log in to contribute resources:")
        gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-oauth-button")

        gr.Markdown("Please fill in the information below to add a new model:")

        with gr.Column():
            # Required fields
            hf_collection_url_input = gr.Textbox(
                label="Hugging Face Collection URL *",
                placeholder="https://huggingface.co/collections/...",
                info="Link to the Hugging Face model collection (required)",
            )

            # Optional fields
            familia_input = gr.Textbox(
                label="Familia",
                placeholder="e.g., BERT, GPT, LLaMA",
                info="Model family or architecture name",
            )

            available_sizes_input = gr.Textbox(
                label="Available Sizes (in B parameters)",
                placeholder="e.g., 0.1, 1.3, 7, 14",
                info="Comma-separated list of model sizes in billions of parameters",
            )

            website_url_input = gr.Textbox(
                label="Website URL",
                placeholder="https://...",
                info="Project or model website",
            )

            paper_url_input = gr.Textbox(
                label="Paper URL",
                placeholder="https://...",
                info="Link to associated research paper",
            )

        submit_btn = gr.Button(f"Submit {RESOURCE_TITLE[:-1]}", variant="primary")
        result_msg = gr.Markdown()

        # The OAuth profile argument of submit_resource is not listed in
        # `inputs`; only the five textboxes are passed explicitly.
        submit_btn.click(
            fn=submit_resource,
            inputs=[
                familia_input,
                available_sizes_input,
                hf_collection_url_input,
                website_url_input,
                paper_url_input,
            ],
            outputs=[result_msg],
        )

    return (
        familia_input,
        available_sizes_input,
        hf_collection_url_input,
        website_url_input,
        paper_url_input,
        submit_btn,
        result_msg,
    )


def search_entries(query: str) -> pd.DataFrame:
    """Search for entries whose familia or collection URL contains *query*.

    Matching is case-insensitive and literal. A blank query returns an empty
    DataFrame without loading the dataset.
    """
    needle = _clean(query)
    if not needle:
        return pd.DataFrame()

    df = load_data()
    if df.empty:
        return df

    def column_contains(column: str) -> pd.Series:
        # fillna/astype keep the .str accessor usable even when a column is
        # entirely missing values; regex=False treats the query literally.
        as_text = df[column].fillna("").astype(str)
        return as_text.str.contains(needle, case=False, regex=False)

    return df[column_contains("familia") | column_contains("hf_collection_url")]


def load_entry_for_edit(selected_entry: str) -> tuple:
    """Load a specific entry for editing.

    Returns the values of familia, available_sizes, hf_collection_url,
    website_url and paper_url for the first row whose familia (or, failing
    that, hf_collection_url) equals *selected_entry*. Returns five empty
    strings when nothing is selected or no row matches; missing cell values
    are returned as empty strings.
    """
    blank = ("",) * len(_EDITABLE_FIELDS)
    if not selected_entry:
        return blank

    df = load_data()
    if df.empty:
        return blank

    label = _locate_entry(df, selected_entry)
    if label is None:
        # The entry may have disappeared since the dropdown was populated.
        return blank

    entry = df.loc[label]
    return tuple(
        "" if _is_missing(entry[field]) else entry[field] for field in _EDITABLE_FIELDS
    )


def update_entry(
    original_identifier: str,
    familia: str,
    available_sizes: str,
    hf_collection_url: str,
    website_url: str,
    paper_url: str,
    profile: gr.OAuthProfile | None,
) -> str:
    """Update an existing entry.

    *original_identifier* is the familia or collection URL the entry was
    selected by. Only the first matching row is updated (familia matches take
    precedence), mirroring ``load_entry_for_edit``. Returns a user-facing
    status message; errors are reported in the message rather than raised.
    """
    # Login required
    if profile is None:
        return "❌ Error: You need to be logged in to edit entries."

    username = profile.username
    if not username:
        return "❌ Could not get username from profile."

    if not original_identifier:
        return "❌ No entry selected to edit."

    familia = _clean(familia)
    available_sizes = _clean(available_sizes)
    hf_collection_url = _clean(hf_collection_url)
    website_url = _clean(website_url)
    paper_url = _clean(paper_url)

    if not hf_collection_url:
        return "❌ Hugging Face Collection URL is required."

    # Validate URLs
    for url_field, url_value in [
        ("Hugging Face Collection URL", hf_collection_url),
        ("Website URL", website_url),
        ("Paper URL", paper_url),
    ]:
        if url_value and not validate_url(url_value):
            return f"❌ Invalid {url_field}. Please provide a valid URL."

    # Validate available_sizes format
    if available_sizes and not validate_sizes(available_sizes):
        return "❌ Invalid available sizes format. Use comma-separated numbers (e.g., '0.1, 1.3, 7, 14')."

    try:
        # Load existing dataset
        existing_df = load_dataset(
            DATASET_NAME, CONFIG_NAME, split="train"
        ).to_pandas()

        # Find the single row to update; updating every row that shares the
        # identifier would overwrite unrelated entries with identical values.
        label = _locate_entry(existing_df, original_identifier)
        if label is None:
            return f"❌ Entry '{original_identifier}' not found."

        new_values = {
            "familia": familia,
            "available_sizes": available_sizes,
            "hf_collection_url": hf_collection_url,
            "website_url": website_url,
            "paper_url": paper_url,
            # The timestamp column is refreshed on every edit.
            "date_submitted": _utc_timestamp(),
        }
        for column, value in new_values.items():
            existing_df.loc[label, column] = value

        # Convert back to Dataset and push to hub
        Dataset.from_pandas(existing_df).push_to_hub(
            DATASET_NAME,
            config_name=CONFIG_NAME,
            commit_message=f"Update model entry: {familia or hf_collection_url} (edited by {username})",
        )

        return f"✅ Successfully updated '{familia or hf_collection_url}'!"

    except Exception as e:
        return f"❌ Error updating entry: {e}"


def create_edit_tab():
    """Create the edit tab for modifying existing entries.

    Returns the search input, search button, results dropdown, edit form
    column, the five field textboxes, the update button and the result
    message component, in that order.
    """
    with gr.TabItem("✏️ Edit", id=f"{RESOURCE_TYPE}_edit"):
        gr.Markdown(f"### Edit Existing {RESOURCE_TITLE}")
        gr.Markdown("Please log in to edit entries:")
        gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-edit-oauth-button")

        gr.Markdown("Search for an entry to edit:")

        with gr.Row():
            search_input = gr.Textbox(
                label="Search by familia or collection URL",
                placeholder="Enter model familia or Hugging Face collection URL...",
                scale=3,
            )
            search_btn = gr.Button("🔍 Search", scale=1)

        search_results = gr.Dropdown(
            label="Select entry to edit", choices=[], interactive=True
        )

        gr.Markdown("---")
        gr.Markdown("**Edit the selected entry:**")

        with gr.Column(visible=False) as edit_form:
            hf_collection_url_input = gr.Textbox(
                label="Hugging Face Collection URL *",
                placeholder="https://huggingface.co/collections/...",
            )
            familia_input = gr.Textbox(
                label="Familia", placeholder="e.g., BERT, GPT, T5..."
            )
            available_sizes_input = gr.Textbox(
                label="Available Sizes (in B parameters)",
                placeholder="e.g., 0.1, 1.3, 7, 14",
                info="Comma-separated list of model sizes in billions of parameters",
            )
            website_url_input = gr.Textbox(
                label="Website URL", placeholder="https://..."
            )
            paper_url_input = gr.Textbox(
                label="Paper URL", placeholder="https://arxiv.org/..."
            )

            update_btn = gr.Button("💾 Update Entry", variant="primary")
            result_msg = gr.Markdown()

        # Store the original identifier for updating
        original_identifier_state = gr.State("")

        def search_and_update_dropdown(query):
            """Populate the dropdown with the entries matching *query*."""
            results_df = search_entries(query)
            if results_df.empty:
                return gr.Dropdown(choices=[], value=None)

            # Use familia if available, otherwise use hf_collection_url.
            labels = (
                _clean(family) or _clean(url)
                for family, url in zip(
                    results_df["familia"], results_df["hf_collection_url"]
                )
            )
            # dict.fromkeys drops duplicate labels while keeping their order.
            choices = list(dict.fromkeys(label for label in labels if label))
            return gr.Dropdown(choices=choices, value=None)

        def load_entry_and_show_form(selected_entry):
            """Return updates for the form column, identifier state and fields."""
            if not selected_entry:
                # One value per declared output: column, state and five fields.
                return (
                    gr.Column(visible=False),
                    "",
                    *([""] * len(_EDITABLE_FIELDS)),
                )

            entry_data = load_entry_for_edit(selected_entry)
            return (gr.Column(visible=True), selected_entry, *entry_data)

        # Event handlers
        search_btn.click(
            fn=search_and_update_dropdown,
            inputs=[search_input],
            outputs=[search_results],
        )

        search_results.change(
            fn=load_entry_and_show_form,
            inputs=[search_results],
            outputs=[
                edit_form,
                original_identifier_state,
                familia_input,
                available_sizes_input,
                hf_collection_url_input,
                website_url_input,
                paper_url_input,
            ],
        )

        update_btn.click(
            fn=update_entry,
            inputs=[
                original_identifier_state,
                familia_input,
                available_sizes_input,
                hf_collection_url_input,
                website_url_input,
                paper_url_input,
            ],
            outputs=[result_msg],
        )

    return (
        search_input,
        search_btn,
        search_results,
        edit_form,
        familia_input,
        available_sizes_input,
        hf_collection_url_input,
        website_url_input,
        paper_url_input,
        update_btn,
        result_msg,
    )


def create_tab():
    """Create the complete tab for this resource type.

    Returns the "All" table, the contribute-tab components and the edit-tab
    components.
    """
    with gr.TabItem(f"🤖 {RESOURCE_TITLE}", id=RESOURCE_TYPE):
        with gr.Tabs():
            table = create_all_tab()
            inputs = create_contribute_tab()
            edit_components = create_edit_tab()

    return table, inputs, edit_components