Joseph Pollack committed · Commit 85f2fd9 · Parent(s): fa36a89

attempts to solve the api_key issue for huggingface, settings not appearing, set settings for audio, adds modal gpu, speech to text with mic input addon, adds graphs
Files changed:
- pyproject.toml (+14 -8)
- requirements.txt (+19 -2)
- src/app.py (+69 -57)
- src/services/multimodal_processing.py (+20 -9)
- src/utils/config.py (+24 -0)
- uv.lock (+36 -7)
pyproject.toml CHANGED

@@ -14,7 +14,7 @@ dependencies = [
     "beautifulsoup4>=4.12",  # HTML parsing
     "xmltodict>=0.13",  # PubMed XML -> dict
     "huggingface-hub>=0.20.0",  # Hugging Face Inference API
-    "gradio[mcp]>=6.0.0",  # Chat interface with MCP server support (6.0 required for css in launch())
+    "gradio[mcp,oauth]>=6.0.0",  # Chat interface with MCP server support (6.0 required for css in launch())
     "python-dotenv>=1.0",  # .env loading
     "tenacity>=8.2",  # Retry logic
     "structlog>=24.1",  # Structured logging
@@ -26,15 +26,21 @@ dependencies = [
     "llama-index-llms-huggingface-api>=0.6.1",
     "llama-index-vector-stores-chroma>=0.5.3",
     "llama-index>=0.14.8",
-    # Audio/Image processing
     "gradio-client>=1.0.0",  # For STT/OCR API calls
     "soundfile>=0.12.0",  # For audio file I/O
     "pillow>=10.0.0",  # For image processing
-    # TTS dependencies (for Modal GPU TTS)
     "torch>=2.0.0",  # Required by Kokoro TTS
-    "transformers>=4.
+    "transformers>=4.57.2",  # Required by Kokoro TTS
     "modal>=0.63.0",  # Required for TTS GPU execution
-
+    "tokenizers>=0.22.0,<=0.23.0",
+    "rpds-py>=0.29.0",
+    "pydantic-ai-slim[huggingface]>=0.0.18",
+    "agent-framework-core>=1.0.0b251120,<2.0.0",
+    "chromadb>=0.4.0",
+    "sentence-transformers>=2.2.0",
+    "numpy<2.0",
+    "llama-index-llms-openai>=0.6.9",
+    "llama-index-embeddings-openai>=0.5.1",
 ]

 [project.optional-dependencies]
@@ -72,11 +78,11 @@ modal = [
     # Mario's Modal code execution + LlamaIndex RAG
     # Note: modal>=0.63.0 is now in main dependencies for TTS support
     "llama-index>=0.11.0",
-    "llama-index-llms-openai",
-    "llama-index-embeddings-openai",
+    "llama-index-llms-openai>=0.6.9",
+    "llama-index-embeddings-openai>=0.5.1",
     "llama-index-vector-stores-chroma",
     "chromadb>=0.4.0",
-    "numpy<2.0",
+    "numpy<2.0",  # chromadb compatibility: uses np.float_ removed in NumPy 2.0
 ]

 [build-system]
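The "numpy<2.0" pin promoted into the main dependencies is explained by the comment in the modal extra: chromadb still references np.float_, which NumPy 2.0 removed. A minimal compatibility probe illustrating that constraint (a sketch, not code from this repo):

import numpy as np

# np.float_ was removed in NumPy 2.0; chromadb (as of this commit) still uses it.
if hasattr(np, "float_"):
    print(f"NumPy {np.__version__}: np.float_ present, chromadb-compatible")
else:
    print(f"NumPy {np.__version__}: np.float_ removed -> pin numpy<2.0")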
requirements.txt CHANGED

@@ -43,6 +43,20 @@ llama-index-llms-huggingface>=0.6.1
 llama-index-llms-huggingface-api>=0.6.1
 llama-index-vector-stores-chroma>=0.5.3
 llama-index>=0.14.8
+
+# Audio/Image processing
+gradio-client>=1.0.0  # For STT/OCR API calls
+soundfile>=0.12.0  # For audio file I/O
+pillow>=10.0.0  # For image processing
+
+# TTS dependencies (for Modal GPU TTS)
+torch>=2.0.0  # Required by Kokoro TTS
+transformers>=4.30.0  # Required by Kokoro TTS
+modal>=0.63.0  # Required for TTS GPU execution
+# Note: Kokoro is installed in Modal image from: git+https://github.com/hexgrad/kokoro.git
+
+# Multi-agent orchestration (Advanced mode) - from optional magentic
+agent-framework-core>=1.0.0b251120,<2.0.0
 llama-index-llms-openai>=0.6.9
 llama-index-embeddings-openai>=0.5.1

@@ -57,5 +71,8 @@ numpy<2.0
 # Optional: Modal for code execution
 modal>=0.63.0

-#
-
+# LlamaIndex RAG - from optional modal
+llama-index-llms-openai
+llama-index-embeddings-openai
+
+pydantic-ai-slim[huggingface]>=0.0.18
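The note above says Kokoro itself never lands in requirements.txt; it is installed into the Modal image. A hedged sketch of what such an image definition could look like (the app and image names here are illustrative assumptions, not taken from the repo):

import modal

# Build an image carrying the TTS stack; Kokoro comes from git, per the
# note in requirements.txt. Version pins mirror the ones above.
tts_image = (
    modal.Image.debian_slim()
    .pip_install("torch>=2.0.0", "transformers>=4.30.0", "soundfile>=0.12.0")
    .pip_install("git+https://github.com/hexgrad/kokoro.git")
)

app = modal.App("kokoro-tts-sketch", image=tts_image)  # hypothetical app name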
src/app.py CHANGED

@@ -501,16 +501,24 @@ async def research_agent(
     audio_input_data: tuple[int, np.ndarray] | None = None

     if isinstance(message, dict):
-        # MultimodalPostprocess format: {"text": str, "files": list[FileData]}
+        # MultimodalPostprocess format: {"text": str, "files": list[FileData], "audio": tuple | None}
         processed_text = message.get("text", "") or ""
         files = message.get("files", [])
+        # Check for audio input in message (Gradio may include it as a separate field)
+        audio_input_data = message.get("audio") or None

-        # Process multimodal input (images, audio files)
-        if files
+        # Process multimodal input (images, audio files, audio input)
+        # Always process if we have files or audio input, not just when enable_image_input is True
+        if files or (audio_input_data is not None and settings.enable_audio_input):
             try:
                 multimodal_service = get_multimodal_service()
+                # Prepend audio/image text to original text (prepend_multimodal=True)
                 processed_text = await multimodal_service.process_multimodal_input(
-                    processed_text,
+                    processed_text,
+                    files=files,
+                    audio_input=audio_input_data,
+                    hf_token=token_value,
+                    prepend_multimodal=True,  # Prepend audio/image text to text input
                 )
             except Exception as e:
                 logger.warning("multimodal_processing_failed", error=str(e))
@@ -636,8 +644,8 @@ def create_demo() -> gr.Blocks:
     )

     # Create settings components
-    #
-    #
+    # Note: ChatInterface doesn't support additional_inputs_accordion parameter in Gradio 6.0
+    # Components are created outside accordion context to ensure they're accessible for additional_inputs
     mode_radio = gr.Radio(
         choices=["simple", "advanced", "iterative", "deep", "auto"],
         value="simple",
@@ -666,56 +674,59 @@
         info="Enable graph-based workflow execution",
     )

-    # TTS Configuration
-    … (the remaining ~49 removed lines of the old TTS settings block did not survive extraction)
+    # TTS Configuration components
+    # Note: These are created outside accordion to ensure accessibility for additional_inputs
+    # The ChatInterface will display them, but grouping in accordion is not supported via additional_inputs_accordion
+    tts_voice_dropdown = gr.Dropdown(
+        choices=[
+            "af_heart",
+            "af_bella",
+            "af_nicole",
+            "af_aoede",
+            "af_kore",
+            "af_sarah",
+            "af_nova",
+            "af_sky",
+            "af_alloy",
+            "af_jessica",
+            "af_river",
+            "am_michael",
+            "am_fenrir",
+            "am_puck",
+            "am_echo",
+            "am_eric",
+            "am_liam",
+            "am_onyx",
+            "am_santa",
+            "am_adam",
+        ],
+        value=settings.tts_voice,
+        label="TTS Voice",
+        info="Select TTS voice (American English voices: af_*, am_*)",
+        visible=settings.enable_audio_output,
+    )
+    tts_speed_slider = gr.Slider(
+        minimum=0.5,
+        maximum=2.0,
+        value=settings.tts_speed,
+        step=0.1,
+        label="TTS Speech Speed",
+        info="Adjust TTS speech speed (0.5x to 2.0x)",
+        visible=settings.enable_audio_output,
+    )
+    tts_gpu_dropdown = gr.Dropdown(
+        choices=["T4", "A10", "A100", "L4", "L40S"],
+        value=settings.tts_gpu or "T4",
+        label="TTS GPU Type",
+        info="Modal GPU type for TTS (T4 is cheapest, A100 is fastest). Note: GPU changes require app restart.",
+        visible=settings.modal_available and settings.enable_audio_output,
+        interactive=False,  # GPU type set at function definition time, requires restart
+    )
+    enable_audio_output_checkbox = gr.Checkbox(
+        value=settings.enable_audio_output,
+        label="Enable Audio Output",
+        info="Generate audio responses using TTS",
+    )

     # Hidden text components for model/provider (not dropdowns to avoid value mismatch)
     # These will be empty by default and use defaults in configure_orchestrator
@@ -787,7 +798,8 @@
         ],
         cache_examples=False,  # CRITICAL: Disable example caching to prevent examples from running at startup
         # Examples will only run when user explicitly clicks them (after login)
-        additional_inputs_accordion
+        # Note: additional_inputs_accordion is not a valid parameter in Gradio 6.0 ChatInterface
+        # Components will be displayed in the order provided
         additional_inputs=[
             mode_radio,
             hf_model_dropdown,
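The recurring comments about additional_inputs_accordion describe the workaround this hunk adopts: build the components first, then hand them to ChatInterface as a flat list. A minimal standalone sketch of that wiring (component choices and the respond function are illustrative, not repo code):

import gradio as gr

mode_radio = gr.Radio(choices=["simple", "advanced"], value="simple", label="Mode")
tts_speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="TTS Speech Speed")

def respond(message, history, mode, tts_speed):
    # additional_inputs arrive positionally after (message, history)
    return f"mode={mode}, tts_speed={tts_speed}: {message}"

demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[mode_radio, tts_speed_slider],  # no accordion grouping
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()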
src/services/multimodal_processing.py CHANGED

@@ -36,6 +36,7 @@ class MultimodalService:
         files: list[FileData] | None = None,
         audio_input: tuple[int, Any] | None = None,
         hf_token: str | None = None,
+        prepend_multimodal: bool = True,
     ) -> str:
         """Process multimodal input (text + images + audio) and return combined text.

@@ -44,26 +45,24 @@
             files: List of uploaded files (images, audio, etc.)
             audio_input: Audio input tuple (sample_rate, audio_array)
             hf_token: HuggingFace token for authenticated Gradio Spaces
+            prepend_multimodal: If True, prepend audio/image text to original text; otherwise append

         Returns:
             Combined text from all inputs
         """
+        multimodal_parts: list[str] = []
         text_parts: list[str] = []

-        #
-        if text and text.strip():
-            text_parts.append(text.strip())
-
-        # Process audio input
+        # Process audio input first
         if audio_input is not None and settings.enable_audio_input:
             try:
                 transcribed = await self.audio.process_audio_input(audio_input, hf_token=hf_token)
                 if transcribed:
-
+                    multimodal_parts.append(transcribed)
             except Exception as e:
                 logger.warning("audio_processing_failed", error=str(e))

-        # Process uploaded files
+        # Process uploaded files (images and audio files)
         if files:
             for file_data in files:
                 file_path = file_data.path if isinstance(file_data, FileData) else str(file_data)
@@ -73,7 +72,7 @@
                 try:
                     extracted_text = await self.ocr.extract_text(file_path, hf_token=hf_token)
                     if extracted_text:
-
+                        multimodal_parts.append(extracted_text)
                 except Exception as e:
                     logger.warning("image_ocr_failed", file_path=file_path, error=str(e))

@@ -86,8 +85,20 @@
                 except Exception as e:
                     logger.warning("audio_file_processing_failed", file_path=file_path, error=str(e))

+        # Add original text if present
+        if text and text.strip():
+            text_parts.append(text.strip())
+
+        # Combine parts based on prepend_multimodal flag
+        if prepend_multimodal:
+            # Prepend: multimodal content first, then original text
+            combined_parts = multimodal_parts + text_parts
+        else:
+            # Append: original text first, then multimodal content
+            combined_parts = text_parts + multimodal_parts
+
         # Combine all text parts
-        combined_text = "\n\n".join(
+        combined_text = "\n\n".join(combined_parts) if combined_parts else ""

         logger.info(
             "multimodal_input_processed",
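The ordering logic this diff introduces is easiest to see in isolation. A minimal sketch of just the combine step (standalone, mirroring the hunk above rather than importing the service):

def combine(text: str, multimodal_parts: list[str], prepend_multimodal: bool = True) -> str:
    """Mirror of the combine step: transcript/OCR text either precedes or follows the typed text."""
    text_parts = [text.strip()] if text and text.strip() else []
    parts = multimodal_parts + text_parts if prepend_multimodal else text_parts + multimodal_parts
    return "\n\n".join(parts) if parts else ""

# Default (prepend): transcribed audio comes first, the typed question second.
assert combine("what does it say?", ["transcript: hello"]) == "transcript: hello\n\nwhat does it say?"
assert combine("", []) == ""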
src/utils/config.py CHANGED

@@ -140,6 +140,30 @@ class Settings(BaseSettings):
         description="Automatically ingest evidence into RAG",
     )

+    # Audio/TTS Configuration
+    enable_audio_input: bool = Field(
+        default=True,
+        description="Enable audio input (speech-to-text) in multimodal interface",
+    )
+    enable_audio_output: bool = Field(
+        default=True,
+        description="Enable audio output (text-to-speech) for responses",
+    )
+    tts_voice: str = Field(
+        default="af_heart",
+        description="TTS voice ID for Kokoro TTS (e.g., af_heart, am_michael)",
+    )
+    tts_speed: float = Field(
+        default=1.0,
+        ge=0.5,
+        le=2.0,
+        description="TTS speech speed multiplier (0.5x to 2.0x)",
+    )
+    tts_gpu: str | None = Field(
+        default=None,
+        description="Modal GPU type for TTS (T4, A10, A100, L4, L40S). None uses default T4.",
+    )
+
     @property
     def modal_available(self) -> bool:
         """Check if Modal credentials are configured."""
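Because Settings extends pydantic's BaseSettings, each new field can be overridden from the environment. A self-contained sketch with only the new audio fields (the real Settings class has more fields and may define an env prefix not visible in this diff):

from pydantic import Field
from pydantic_settings import BaseSettings

class AudioSettings(BaseSettings):
    enable_audio_input: bool = Field(default=True)
    enable_audio_output: bool = Field(default=True)
    tts_voice: str = Field(default="af_heart")
    tts_speed: float = Field(default=1.0, ge=0.5, le=2.0)  # out-of-range values fail validation
    tts_gpu: str | None = Field(default=None)

# e.g. TTS_VOICE=am_michael TTS_SPEED=1.5 python app.py overrides the defaults
print(AudioSettings().model_dump())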
uv.lock CHANGED

@@ -1108,30 +1108,39 @@ name = "deepcritical"
 version = "0.1.0"
 source = { editable = "." }
 dependencies = [
+    { name = "agent-framework-core" },
     { name = "anthropic" },
     { name = "beautifulsoup4" },
+    { name = "chromadb" },
     { name = "duckduckgo-search" },
-    { name = "gradio", extra = ["mcp"] },
+    { name = "gradio", extra = ["mcp", "oauth"] },
     { name = "gradio-client" },
     { name = "httpx" },
     { name = "huggingface-hub" },
     { name = "limits" },
     { name = "llama-index" },
+    { name = "llama-index-embeddings-openai" },
     { name = "llama-index-llms-huggingface" },
     { name = "llama-index-llms-huggingface-api" },
+    { name = "llama-index-llms-openai" },
     { name = "llama-index-vector-stores-chroma" },
     { name = "modal" },
+    { name = "numpy" },
     { name = "openai" },
     { name = "pillow" },
     { name = "pydantic" },
     { name = "pydantic-ai" },
+    { name = "pydantic-ai-slim", extra = ["huggingface"] },
     { name = "pydantic-graph" },
     { name = "pydantic-settings" },
     { name = "python-dotenv" },
     { name = "requests" },
+    { name = "rpds-py" },
+    { name = "sentence-transformers" },
     { name = "soundfile" },
     { name = "structlog" },
     { name = "tenacity" },
+    { name = "tokenizers" },
     { name = "torch" },
     { name = "transformers" },
     { name = "xmltodict" },
@@ -1169,7 +1178,6 @@ modal = [
     { name = "llama-index-embeddings-openai" },
     { name = "llama-index-llms-openai" },
     { name = "llama-index-vector-stores-chroma" },
-    { name = "modal" },
     { name = "numpy" },
 ]

@@ -1181,23 +1189,27 @@ dev = [

 [package.metadata]
 requires-dist = [
+    { name = "agent-framework-core", specifier = ">=1.0.0b251120,<2.0.0" },
     { name = "agent-framework-core", marker = "extra == 'magentic'", specifier = ">=1.0.0b251120,<2.0.0" },
     { name = "anthropic", specifier = ">=0.18.0" },
     { name = "beautifulsoup4", specifier = ">=4.12" },
+    { name = "chromadb", specifier = ">=0.4.0" },
     { name = "chromadb", marker = "extra == 'embeddings'", specifier = ">=0.4.0" },
     { name = "chromadb", marker = "extra == 'modal'", specifier = ">=0.4.0" },
     { name = "duckduckgo-search", specifier = ">=5.0" },
-    { name = "gradio", extras = ["mcp"], specifier = ">=6.0.0" },
+    { name = "gradio", extras = ["mcp", "oauth"], specifier = ">=6.0.0" },
     { name = "gradio-client", specifier = ">=1.0.0" },
     { name = "httpx", specifier = ">=0.27" },
     { name = "huggingface-hub", specifier = ">=0.20.0" },
     { name = "limits", specifier = ">=3.0" },
     { name = "llama-index", specifier = ">=0.14.8" },
     { name = "llama-index", marker = "extra == 'modal'", specifier = ">=0.11.0" },
-    { name = "llama-index-embeddings-openai",
+    { name = "llama-index-embeddings-openai", specifier = ">=0.5.1" },
+    { name = "llama-index-embeddings-openai", marker = "extra == 'modal'", specifier = ">=0.5.1" },
     { name = "llama-index-llms-huggingface", specifier = ">=0.6.1" },
     { name = "llama-index-llms-huggingface-api", specifier = ">=0.6.1" },
-    { name = "llama-index-llms-openai",
+    { name = "llama-index-llms-openai", specifier = ">=0.6.9" },
+    { name = "llama-index-llms-openai", marker = "extra == 'modal'", specifier = ">=0.6.9" },
     { name = "llama-index-vector-stores-chroma", specifier = ">=0.5.3" },
     { name = "llama-index-vector-stores-chroma", marker = "extra == 'modal'" },
     { name = "mkdocs", marker = "extra == 'dev'", specifier = ">=1.6.0" },
@@ -1206,8 +1218,8 @@ requires-dist = [
     { name = "mkdocs-mermaid2-plugin", marker = "extra == 'dev'", specifier = ">=1.1.0" },
     { name = "mkdocs-minify-plugin", marker = "extra == 'dev'", specifier = ">=0.8.0" },
     { name = "modal", specifier = ">=0.63.0" },
-    { name = "modal", marker = "extra == 'modal'", specifier = ">=0.63.0" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.10" },
+    { name = "numpy", specifier = "<2.0" },
     { name = "numpy", marker = "extra == 'embeddings'", specifier = "<2.0" },
     { name = "numpy", marker = "extra == 'modal'", specifier = "<2.0" },
     { name = "openai", specifier = ">=1.0.0" },
@@ -1215,6 +1227,7 @@ requires-dist = [
     { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=3.7" },
     { name = "pydantic", specifier = ">=2.7" },
     { name = "pydantic-ai", specifier = ">=0.0.16" },
+    { name = "pydantic-ai-slim", extras = ["huggingface"], specifier = ">=0.0.18" },
     { name = "pydantic-graph", specifier = ">=1.22.0" },
     { name = "pydantic-settings", specifier = ">=2.2" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
@@ -1225,13 +1238,16 @@ requires-dist = [
     { name = "python-dotenv", specifier = ">=1.0" },
     { name = "requests", specifier = ">=2.32.5" },
     { name = "respx", marker = "extra == 'dev'", specifier = ">=0.21" },
+    { name = "rpds-py", specifier = ">=0.29.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4.0" },
+    { name = "sentence-transformers", specifier = ">=2.2.0" },
     { name = "sentence-transformers", marker = "extra == 'embeddings'", specifier = ">=2.2.0" },
     { name = "soundfile", specifier = ">=0.12.0" },
     { name = "structlog", specifier = ">=24.1" },
     { name = "tenacity", specifier = ">=8.2" },
+    { name = "tokenizers", specifier = ">=0.22.0,<=0.23.0" },
     { name = "torch", specifier = ">=2.0.0" },
-    { name = "transformers", specifier = ">=4.
+    { name = "transformers", specifier = ">=4.57.2" },
     { name = "typer", marker = "extra == 'dev'", specifier = ">=0.9.0" },
     { name = "xmltodict", specifier = ">=0.13" },
 ]
@@ -1748,6 +1764,10 @@ mcp = [
     { name = "mcp" },
     { name = "pydantic" },
 ]
+oauth = [
+    { name = "authlib" },
+    { name = "itsdangerous" },
+]

 [[package]]
 name = "gradio-client"
@@ -2160,6 +2180,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/32/4b/b99e37f88336009971405cbb7630610322ed6fbfa31e1d7ab3fbf3049a2d/invoke-2.2.1-py3-none-any.whl", hash = "sha256:2413bc441b376e5cd3f55bb5d364f973ad8bdd7bf87e53c79de3c11bf3feecc8", size = 160287, upload-time = "2025-10-11T00:36:33.703Z" },
 ]

+[[package]]
+name = "itsdangerous"
+version = "2.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" },
+]
+
 [[package]]
 name = "jaraco-classes"
 version = "3.4.0"