Spaces:
Sleeping
Sleeping
| from typing import List,Dict | |
| import re | |
def parse_model_entries(model_entries: List[str]) -> List[Dict[str, str]]:
    """
    Parse model entry strings into dictionaries describing each model.

    An entry is a '/'-separated path whose last segment holds the model name
    (optionally followed by a version), e.g.
    ``bedrock/us-east-1/anthropic.claude-v2:1`` or ``gpt-4o-2024-08-06``.

    Args:
        model_entries: List of model entry strings as found in models.txt

    Returns:
        One dictionary per entry, in input order, with the string keys:
        - provider: Lower-cased provider name, taken from the first path
          segment when it is a known provider, otherwise inferred from the
          model name; '' when no known provider is found
        - model_name: Model name with the version suffix removed
        - version: ``vN`` style version number or trailing YYYY-MM-DD date,
          '' when neither is present
        - region: Second path segment for bedrock/azure entries with at
          least three segments, '' otherwise
        - model_type: 'image', 'audio' or 'text' (the default), decided by
          keyword matching on the whole entry
    """
    known_providers = (
        'azure', 'bedrock', 'anthropic', 'openai', 'cohere', 'google',
        'mistral', 'meta', 'amazon', 'ai21', 'anyscale', 'stability',
        'cloudflare', 'databricks', 'cerebras', 'assemblyai',
    )
    # Providers whose second path segment is treated as a deployment region.
    regional_providers = ('bedrock', 'azure')

    image_indicators = ('dall-e', 'stable-diffusion', 'image', 'canvas', 'steps')
    audio_indicators = ('whisper', 'tts', 'audio', 'voice')
    # Resolution segments such as "1024-x-1024" or "max-x-max". A bare 'x-'
    # substring test would also hit text models like "pplx-70b-online".
    resolution_pattern = re.compile(r'(?:\d+|max)-x-(?:\d+|max)')

    # "-v2", ".v1.5", ":v3-1", optionally followed by a ":N" revision.
    version_pattern = re.compile(r'[:.-]v(\d+(?:\.\d+)*(?:-\d+)?|\d+)(?::\d+)?$')
    # Date-based versions like "-2024-08-06".
    date_pattern = re.compile(r'-(\d{4}-\d{2}-\d{2})$')

    parsed_models = []
    for entry in model_entries:
        lowered = entry.lower()

        # Image keywords take precedence over audio keywords.
        if resolution_pattern.search(lowered) or any(
            keyword in lowered for keyword in image_indicators
        ):
            model_type = 'image'
        elif any(keyword in lowered for keyword in audio_indicators):
            model_type = 'audio'
        else:
            model_type = 'text'

        provider = ''
        region = ''
        parts = entry.split('/')
        if len(parts) >= 2:
            head = parts[0].lower()
            if head in known_providers:
                provider = head
            # Skip commitment segments (e.g. "1-month-commitment"): they sit
            # where the region would be but are not a region.
            if (
                head in regional_providers
                and len(parts) >= 3
                and 'commitment' not in parts[1]
            ):
                region = parts[1]

        # The last segment carries the model name and possibly the version;
        # for an entry without '/', that is the entry itself.
        model_with_version = parts[-1]

        if not provider:
            name_lowered = model_with_version.lower()
            # First known provider mentioned anywhere in the name wins.
            for candidate in known_providers:
                if candidate in name_lowered:
                    provider = candidate
                    # Drop a leading "<provider>." prefix from the name.
                    if name_lowered.startswith(candidate + '.'):
                        model_with_version = model_with_version[len(candidate) + 1:]
                    break

        # Prefer the explicit "vN" form and fall back to a trailing date.
        match = (
            version_pattern.search(model_with_version)
            or date_pattern.search(model_with_version)
        )
        if match:
            version = match.group(1)
            model_name = model_with_version[:match.start()]
        else:
            version = ''
            model_name = model_with_version

        parsed_models.append({
            'provider': provider,
            # Remove separators left dangling at either end of the name.
            'model_name': model_name.strip('.-:'),
            'version': version,
            'region': region,
            'model_type': model_type,
        })

    return parsed_models
def create_model_hierarchy(model_entries: List[str]) -> Dict[str, Dict[str, Dict[str, Dict[str, str]]]]:
    """
    Group model entries into nested dictionaries keyed by provider, model, version and region.

    Args:
        model_entries: List of model entry strings as found in models.txt

    Returns:
        Nested dictionary shaped as:
        provider -> model name -> version -> region -> full model string
        An empty provider becomes "unknown"; an empty version or region
        becomes "NA". When several entries resolve to the same four keys,
        the later entry replaces the earlier one.
    """
    tree = {}
    for index, info in enumerate(parse_model_entries(model_entries)):
        provider_key = info['provider'] or 'unknown'
        version_key = info['version'] or 'NA'

        # Azure entries are always filed under 'NA'; the parsed region is
        # ignored because these models are treated as globally available.
        if provider_key == 'azure':
            region_key = 'NA'
        else:
            region_key = info['region'] or 'NA'

        # Walk down to the region mapping, creating missing levels on the way.
        by_region = (
            tree.setdefault(provider_key, {})
            .setdefault(info['model_name'], {})
            .setdefault(version_key, {})
        )
        # The leaf keeps the original, unparsed entry string.
        by_region[region_key] = model_entries[index]
    return tree