import gradio as gr
from typing import Dict, Union
import tiktoken
from transformers import AutoTokenizer
import logging
import time

# Set up logging for better error tracking
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# HUGGING FACE SPACES OPTIMIZATION: Smart caching with memory management
# Note: For production deployment, a dedicated server with more RAM would allow
# full caching of all tokenizers simultaneously for optimal performance.
_tokenizer_cache = {}
_tiktoken_cache = {}
_cache_access_times = {}
_max_cached_tokenizers = 3  # Conservative limit for HF Spaces free tier


def _cleanup_old_tokenizers():
    """Remove the least recently used tokenizer to manage memory on HF Spaces."""
    # Called before inserting a new tokenizer, so use < to keep the cache at the limit
    if len(_tokenizer_cache) < _max_cached_tokenizers:
        return

    # Sort by access time, remove oldest
    sorted_cache = sorted(_cache_access_times.items(), key=lambda x: x[1])
    oldest_key = sorted_cache[0][0]
    if oldest_key in _tokenizer_cache:
        del _tokenizer_cache[oldest_key]
        del _cache_access_times[oldest_key]
        logger.info(f"🧹 Removed cached tokenizer: {oldest_key} (memory management)")


def count_tokens_openai_gpt4(text: str) -> Dict[str, Union[int, float, str]]:
    """
    Count tokens using OpenAI GPT-4/GPT-3.5-turbo tokenizer.

    Args:
        text (str): The text to analyze for token count

    Returns:
        dict: Token count results with GPT-4 style tokenization including word count,
              character count, token count, analysis, tokenizer used, and tokens per word
    """
    return _count_tokens_internal(text, "tiktoken-cl100k")


def count_tokens_openai_davinci(text: str) -> Dict[str, Union[int, float, str]]:
    """
    Count tokens using OpenAI text-davinci tokenizer.

    Args:
        text (str): The text to analyze for token count

    Returns:
        dict: Token count results with text-davinci style tokenization including word count,
              character count, token count, analysis, tokenizer used, and tokens per word
    """
    return _count_tokens_internal(text, "tiktoken-p50k")


def count_tokens_openai_gpt3(text: str) -> Dict[str, Union[int, float, str]]:
    """
    Count tokens using OpenAI GPT-3 tokenizer.

    Args:
        text (str): The text to analyze for token count

    Returns:
        dict: Token count results with GPT-3 style tokenization including word count,
              character count, token count, analysis, tokenizer used, and tokens per word
    """
    return _count_tokens_internal(text, "tiktoken-r50k")


def count_tokens_bert_family(text: str) -> Dict[str, Union[int, float, str]]:
    """
    Count tokens using BERT tokenizer for classification and understanding tasks.

    Args:
        text (str): The text to analyze for token count

    Returns:
        dict: Token count results with BERT style tokenization including word count,
              character count, token count, analysis, tokenizer used, and tokens per word
    """
    return _count_tokens_internal(text, "hf-bert")


def count_tokens_roberta_family(text: str) -> Dict[str, Union[int, float, str]]:
    """
    Count tokens using RoBERTa tokenizer for modern classification models.

    Args:
        text (str): The text to analyze for token count

    Returns:
        dict: Token count results with RoBERTa style tokenization including word count,
              character count, token count, analysis, tokenizer used, and tokens per word
    """
    return _count_tokens_internal(text, "hf-roberta")
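
# Illustrative example of the dict the count_tokens_* wrappers above and below return
# (values are approximate; exact token counts depend on the installed tokenizer
# versions, and the timing/caching suffix in "analysis" varies per call):
#
#   count_tokens_openai_gpt4("Hello world")
#   # => {"word_count": 2, "character_count": 11, "token_count": 2,
#   #     "analysis": "short text (⏱️ 0.00s ⚡ cached)",
#   #     "tokenizer_used": "tiktoken-cl100k", "tokens_per_word": 1.0}
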

def count_tokens_gpt2_family(text: str) -> Dict[str, Union[int, float, str]]:
    """
    Count tokens using GPT-2 tokenizer for open source LLMs like Llama and Mistral.

    Args:
        text (str): The text to analyze for token count

    Returns:
        dict: Token count results with GPT-2 style tokenization including word count,
              character count, token count, analysis, tokenizer used, and tokens per word
    """
    return _count_tokens_internal(text, "hf-gpt2")


def count_tokens_t5_family(text: str) -> Dict[str, Union[int, float, str]]:
    """
    Count tokens using T5 tokenizer for sequence-to-sequence models.

    Args:
        text (str): The text to analyze for token count

    Returns:
        dict: Token count results with T5 style tokenization including word count,
              character count, token count, analysis, tokenizer used, and tokens per word
    """
    return _count_tokens_internal(text, "hf-t5")


def count_tokens_distilbert(text: str) -> Dict[str, Union[int, float, str]]:
    """
    Count tokens using DistilBERT tokenizer for lightweight BERT variants.

    Args:
        text (str): The text to analyze for token count

    Returns:
        dict: Token count results with DistilBERT style tokenization including word count,
              character count, token count, analysis, tokenizer used, and tokens per word
    """
    return _count_tokens_internal(text, "hf-distilbert")


def _count_tokens_internal(text: str, tokenizer_type: str) -> Dict[str, Union[int, float, str]]:
    """
    Internal helper function for token counting with HF Spaces optimizations.

    Note: On Hugging Face Spaces, first-time model loading may take 5-30 seconds
    due to downloading from HuggingFace Hub. For production use, consider a
    dedicated server with sufficient RAM to cache all tokenizers permanently.
    """
    # Input validation
    if not isinstance(text, str):
        return {
            "word_count": 0,
            "character_count": 0,
            "token_count": "Error: Invalid input type",
            "analysis": "error",
            "tokenizer_used": tokenizer_type,
            "tokens_per_word": 0
        }

    if not text.strip():
        return {
            "word_count": 0,
            "character_count": 0,
            "token_count": 0,
            "analysis": "empty text",
            "tokenizer_used": tokenizer_type,
            "tokens_per_word": 0
        }

    # Basic word counting
    words = text.split()
    word_count = len(words)
    char_count = len(text)

    token_count = -1
    start_time = time.time()
    is_cached = False

    try:
        if tokenizer_type.startswith("tiktoken"):
            # ⚡ tiktoken is lightweight, always cache (minimal memory impact)
            if tokenizer_type not in _tiktoken_cache:
                encoding_map = {
                    "tiktoken-cl100k": "cl100k_base",
                    "tiktoken-p50k": "p50k_base",
                    "tiktoken-r50k": "r50k_base"
                }
                encoding_name = encoding_map.get(tokenizer_type, "cl100k_base")
                _tiktoken_cache[tokenizer_type] = tiktoken.get_encoding(encoding_name)
                logger.info(f"✅ Cached tiktoken: {tokenizer_type}")
            else:
                is_cached = True
            encoding = _tiktoken_cache[tokenizer_type]
            token_count = len(encoding.encode(text))

        elif tokenizer_type.startswith("hf-"):
            # ⚡ Smart caching for transformers models (memory-aware)
            if tokenizer_type in _tokenizer_cache:
                # Cache hit
                logger.info(f"⚡ Using cached tokenizer: {tokenizer_type}")
                tokenizer = _tokenizer_cache[tokenizer_type]
                is_cached = True
            else:
                # Cache miss - load and manage memory
                _cleanup_old_tokenizers()
                model_map = {
                    "hf-bert": "bert-base-uncased",
                    "hf-roberta": "roberta-base",
                    "hf-gpt2": "gpt2",
                    "hf-distilbert": "distilbert-base-uncased",
                    "hf-t5": "t5-small"
                }
                model = model_map.get(tokenizer_type, "bert-base-uncased")
                logger.info(f"🔄 Loading tokenizer: {tokenizer_type} ({model}) - First time may take 5-30s")
                tokenizer = AutoTokenizer.from_pretrained(model)

                # Cache with memory management
                _tokenizer_cache[tokenizer_type] = tokenizer
                logger.info(f"✅ Cached tokenizer: {tokenizer_type}")

            # Update access time for LRU management
            _cache_access_times[tokenizer_type] = time.time()
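            # Note: tokenizer.tokenize() returns subword tokens without the special
            # tokens a model adds (e.g. BERT's [CLS]/[SEP]), so these counts can run
            # slightly below what the model actually consumes. An illustrative
            # alternative that does include special tokens (assuming default
            # Hugging Face tokenizer behaviour) would be:
            #   tokens = tokenizer(text, add_special_tokens=True)["input_ids"]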
            tokens = tokenizer.tokenize(text)
            token_count = len(tokens)

        else:
            raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}")

    except Exception as e:
        logger.error(f"Error in tokenization for {tokenizer_type}: {str(e)}")
        token_count = -1

    processing_time = time.time() - start_time

    # Enhanced analysis with performance and caching info
    if token_count == -1:
        analysis = "tokenizer error"
    elif token_count <= 50:
        analysis = "short text"
    elif token_count <= 200:
        analysis = "medium text"
    elif token_count <= 1000:
        analysis = "long text"
    else:
        analysis = "very long text"

    # Add performance metadata for user feedback
    performance_info = f" (⏱️ {processing_time:.2f}s"
    if is_cached:
        performance_info += " ⚡ cached)"
    else:
        performance_info += " 🔄 loaded)"

    return {
        "word_count": word_count,
        "character_count": char_count,
        "token_count": token_count if token_count != -1 else "Error during tokenization",
        "analysis": analysis + performance_info,
        "tokenizer_used": tokenizer_type,
        "tokens_per_word": round(token_count / word_count, 2) if word_count > 0 and token_count > 0 else 0
    }


# Create individual interfaces for each tokenizer
demo_gpt4 = gr.Interface(
    fn=count_tokens_openai_gpt4,
    inputs=gr.Textbox(
        placeholder="Enter text to count tokens for GPT-4/3.5...",
        label="Text",
        value="Count the tokens in this sample text."  # Default value as per guide
    ),
    outputs=gr.JSON(),
    title="GPT-4/3.5 Token Counter ⚡",
    description="Count tokens using OpenAI GPT-4/GPT-3.5-turbo tokenizer. Best for GPT-4, Claude, Gemini. (Fast - tiktoken)"
)

demo_davinci = gr.Interface(
    fn=count_tokens_openai_davinci,
    inputs=gr.Textbox(
        placeholder="Enter text to count tokens for text-davinci...",
        label="Text",
        value="Count the tokens in this sample text."
    ),
    outputs=gr.JSON(),
    title="text-davinci Token Counter ⚡",
    description="Count tokens using OpenAI text-davinci tokenizer. (Fast - tiktoken)"
)

demo_gpt3 = gr.Interface(
    fn=count_tokens_openai_gpt3,
    inputs=gr.Textbox(
        placeholder="Enter text to count tokens for GPT-3...",
        label="Text",
        value="Count the tokens in this sample text."
    ),
    outputs=gr.JSON(),
    title="GPT-3 Token Counter ⚡",
    description="Count tokens using OpenAI GPT-3 tokenizer. (Fast - tiktoken)"
)

demo_bert = gr.Interface(
    fn=count_tokens_bert_family,
    inputs=gr.Textbox(
        placeholder="Enter text to count tokens for BERT...",
        label="Text",
        value="Count the tokens in this sample text."
    ),
    outputs=gr.JSON(),
    title="BERT Token Counter 🔄",
    description="Count tokens using BERT tokenizer for classification tasks. (First use: ~10s, then cached)"
)

demo_roberta = gr.Interface(
    fn=count_tokens_roberta_family,
    inputs=gr.Textbox(
        placeholder="Enter text to count tokens for RoBERTa...",
        label="Text",
        value="Count the tokens in this sample text."
    ),
    outputs=gr.JSON(),
    title="RoBERTa Token Counter 🔄",
    description="Count tokens using RoBERTa tokenizer. (First use: ~10s, then cached)"
)

demo_gpt2 = gr.Interface(
    fn=count_tokens_gpt2_family,
    inputs=gr.Textbox(
        placeholder="Enter text to count tokens for GPT-2/Llama/Mistral...",
        label="Text",
        value="Count the tokens in this sample text."
    ),
    outputs=gr.JSON(),
    title="GPT-2/Llama/Mistral Token Counter 🔄",
    description="Count tokens using GPT-2 tokenizer. Good approximation for Llama, Mistral, and open source LLMs. (First use: ~15s, then cached)"
)

demo_t5 = gr.Interface(
    fn=count_tokens_t5_family,
    inputs=gr.Textbox(
        placeholder="Enter text to count tokens for T5...",
        label="Text",
        value="Count the tokens in this sample text."
    ),
    outputs=gr.JSON(),
    title="T5 Token Counter 🔄",
    description="Count tokens using T5 tokenizer for sequence-to-sequence models. (First use: ~10s, then cached)"
)

demo_distilbert = gr.Interface(
    fn=count_tokens_distilbert,
    inputs=gr.Textbox(
        placeholder="Enter text to count tokens for DistilBERT...",
        label="Text",
        value="Count the tokens in this sample text."
    ),
    outputs=gr.JSON(),
    title="DistilBERT Token Counter 🔄",
    description="Count tokens using DistilBERT tokenizer for lightweight models. (First use: ~20s, then cached)"
)

# Combine all interfaces into a single app with tabs
demo = gr.TabbedInterface(
    [demo_gpt4, demo_davinci, demo_gpt3, demo_bert, demo_roberta, demo_gpt2, demo_t5, demo_distilbert],
    ["GPT-4/3.5 ⚡", "text-davinci ⚡", "GPT-3 ⚡", "BERT 🔄", "RoBERTa 🔄", "GPT-2/Llama 🔄", "T5 🔄", "DistilBERT 🔄"],
    title="🔢 Multi-Model Token Counter Suite",
    theme=gr.themes.Soft()
)

# Queue requests to stay within HF Spaces resource limits
demo.queue(max_size=10)  # Limit concurrent requests for HF Spaces

# Launch the interface and MCP server
if __name__ == "__main__":
    print("🚀 Launching Token Counter with HF Spaces optimizations")
    print("ℹ️ Note: First-time model loading may take 5-30 seconds")
    print("ℹ️ For production use, consider a dedicated server with more RAM")
    demo.launch(mcp_server=True)
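
# Usage note (assumptions about the deployment, not verified here): launching with
# mcp_server=True requires Gradio's MCP extra (e.g. `pip install "gradio[mcp]"`), and
# Gradio prints the MCP endpoint on startup — typically something like
# http://<host>:7860/gradio_api/mcp/sse — which MCP clients can be pointed at.
# Each tab's counting function is then exposed as a separate MCP tool.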