Spaces:

DroolingPanda
/

teachingAssistant

Sleeping

App Files Files Community

Michael Hu committed on Jul 29

Commit

c7f7521

1 Parent(s): cf7f5a3

remove legacy impl

Browse files

Files changed (9) hide show

utils/stt.py +0 -175
utils/translation.py +0 -65
utils/tts.py +0 -126
utils/tts_README.md +0 -64
utils/tts_base.py +0 -69
utils/tts_cosyvoice2.py +0 -209
utils/tts_dia.py +0 -201
utils/tts_dummy.py +0 -65
utils/tts_kokoro.py +0 -144

utils/stt.py DELETED Viewed

@@ -1,175 +0,0 @@
-"""
-Speech Recognition Module
-Supports multiple ASR models including Whisper and Parakeet
-Handles audio preprocessing and transcription
-"""
-import logging
-import numpy as np
-import os
-from abc import ABC, abstractmethod
-logger = logging.getLogger(__name__)
-from faster_whisper import WhisperModel as FasterWhisperModel
-from pydub import AudioSegment
-class ASRModel(ABC):
-    """Base class for ASR models"""
-    @abstractmethod
-    def load_model(self):
-        """Load the ASR model"""
-        pass
-    @abstractmethod
-    def transcribe(self, audio_path):
-        """Transcribe audio to text"""
-        pass
-    def preprocess_audio(self, audio_path):
-        """Convert audio to required format"""
-        logger.info("Converting audio format")
-        audio = AudioSegment.from_file(audio_path)
-        processed_audio = audio.set_frame_rate(16000).set_channels(1)
-        wav_path = audio_path.replace(".mp3", ".wav") if audio_path.endswith(".mp3") else audio_path
-        if not wav_path.endswith(".wav"):
-            wav_path = f"{os.path.splitext(wav_path)[0]}.wav"
-        processed_audio.export(wav_path, format="wav")
-        logger.info(f"Audio converted to: {wav_path}")
-        return wav_path
-class WhisperModel(ASRModel):
-    """Faster Whisper ASR model implementation"""
-    def __init__(self):
-        self.model = None
-        # Check for CUDA availability without torch dependency
-        try:
-            import torch
-            self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        except ImportError:
-            # Fallback to CPU if torch is not available
-            self.device = "cpu"
-        self.compute_type = "float16" if self.device == "cuda" else "int8"
-    def load_model(self):
-        """Load Faster Whisper model"""
-        logger.info("Loading Faster Whisper model")
-        logger.info(f"Using device: {self.device}")
-        logger.info(f"Using compute type: {self.compute_type}")
-        # Use large-v3 model with appropriate compute type based on device
-        self.model = FasterWhisperModel(
-            "large-v3",
-            device=self.device,
-            compute_type=self.compute_type
-        )
-        logger.info("Faster Whisper model loaded successfully")
-    def transcribe(self, audio_path):
-        """Transcribe audio using Faster Whisper"""
-        if self.model is None:
-            self.load_model()
-        wav_path = self.preprocess_audio(audio_path)
-        # Transcription with Faster Whisper
-        logger.info("Generating transcription with Faster Whisper")
-        segments, info = self.model.transcribe(
-            wav_path,
-            beam_size=5,
-            language="en",
-            task="transcribe"
-        )
-        logger.info(f"Detected language '{info.language}' with probability {info.language_probability}")
-        # Collect all segments into a single text
-        result_text = ""
-        for segment in segments:
-            result_text += segment.text + " "
-            logger.info(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
-        result = result_text.strip()
-        logger.info(f"Transcription completed successfully")
-        return result
-class ParakeetModel(ASRModel):
-    """Parakeet ASR model implementation"""
-    def __init__(self):
-        self.model = None
-    def load_model(self):
-        """Load Parakeet model"""
-        try:
-            import nemo.collections.asr as nemo_asr
-            logger.info("Loading Parakeet model")
-            self.model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")
-            logger.info("Parakeet model loaded successfully")
-        except ImportError:
-            logger.error("Failed to import nemo_toolkit. Please install with: pip install -U 'nemo_toolkit[asr]'")
-            raise
-    def transcribe(self, audio_path):
-        """Transcribe audio using Parakeet"""
-        if self.model is None:
-            self.load_model()
-        wav_path = self.preprocess_audio(audio_path)
-        # Transcription
-        logger.info("Generating transcription with Parakeet")
-        output = self.model.transcribe([wav_path])
-        result = output[0].text
-        logger.info(f"Transcription completed successfully")
-        return result
-class ASRFactory:
-    """Factory for creating ASR model instances"""
-    @staticmethod
-    def get_model(model_name="parakeet"):
-        """
-        Get ASR model by name
-        Args:
-            model_name: Name of the model to use (whisper or parakeet)
-        Returns:
-            ASR model instance
-        """
-        if model_name.lower() == "whisper":
-            return WhisperModel()
-        elif model_name.lower() == "parakeet":
-            return ParakeetModel()
-        else:
-            logger.warning(f"Unknown model: {model_name}, falling back to Whisper")
-            return WhisperModel()
-def transcribe_audio(audio_path, model_name="parakeet"):
-    """
-    Convert audio file to text using specified ASR model
-    Args:
-        audio_path: Path to input audio file
-        model_name: Name of the ASR model to use (whisper or parakeet)
-    Returns:
-        Transcribed English text
-    """
-    logger.info(f"Starting transcription for: {audio_path} using {model_name} model")
-    try:
-        # Get the appropriate model
-        asr_model = ASRFactory.get_model(model_name)
-        # Transcribe audio
-        result = asr_model.transcribe(audio_path)
-        logger.info(f"transcription: %s" % result)
-        return result
-    except Exception as e:
-        logger.error(f"Transcription failed: {str(e)}", exc_info=True)
-        raise

utils/translation.py DELETED Viewed

@@ -1,65 +0,0 @@
-"""
-Text Translation Module using NLLB-3.3B model
-Handles text segmentation and batch translation
-"""
-import logging
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-logger = logging.getLogger(__name__)
-def translate_text(text):
-    """
-    Translate English text to Simplified Chinese
-    Args:
-        text: Input English text
-    Returns:
-        Translated Chinese text
-    """
-    logger.info(f"Starting translation for text length: {len(text)}")
-    try:
-        # Model initialization with explicit language codes
-        logger.info("Loading NLLB model")
-        tokenizer = AutoTokenizer.from_pretrained(
-            "facebook/nllb-200-3.3B",
-            src_lang="eng_Latn"  # Specify source language
-        )
-        model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
-        logger.info("Translation model loaded")
-        # Text processing
-        max_chunk_length = 1000
-        text_chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)]
-        logger.info(f"Split text into {len(text_chunks)} chunks")
-        translated_chunks = []
-        for i, chunk in enumerate(text_chunks):
-            logger.info(f"Processing chunk {i+1}/{len(text_chunks)}")
-            # Tokenize with source language specification
-            inputs = tokenizer(
-                chunk,
-                return_tensors="pt",
-                max_length=1024,
-                truncation=True
-            )
-            # Generate translation with target language specification
-            outputs = model.generate(
-                **inputs,
-                forced_bos_token_id=tokenizer.convert_tokens_to_ids("zho_Hans"),
-                max_new_tokens=1024
-            )
-            translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
-            translated_chunks.append(translated)
-            logger.info(f"Chunk {i+1} translated successfully")
-        result = "".join(translated_chunks)
-        logger.info(f"Translation completed. Total length: {len(result)}")
-        return result
-    except Exception as e:
-        logger.error(f"Translation failed: {str(e)}", exc_info=True)
-        raise

utils/tts.py DELETED Viewed

@@ -1,126 +0,0 @@
-import logging
-from typing import Optional, Generator, Tuple, List, Dict, Any
-import numpy as np
-# Import the base class and dummy implementation
-from utils.tts_base import TTSBase
-from utils.tts_dummy import DummyTTS
-# Import the specific TTS implementations
-from utils.tts_kokoro import KokoroTTS, KOKORO_AVAILABLE
-from utils.tts_dia import DiaTTS, DIA_AVAILABLE
-from utils.tts_cosyvoice2 import CosyVoice2TTS, COSYVOICE2_AVAILABLE
-# Configure logging
-logger = logging.getLogger(__name__)
-def get_available_engines() -> List[str]:
-    """Get a list of available TTS engines
-    Returns:
-        List[str]: List of available engine names
-    """
-    available = []
-    if KOKORO_AVAILABLE:
-        available.append('kokoro')
-    if DIA_AVAILABLE:
-        available.append('dia')
-    if COSYVOICE2_AVAILABLE:
-        available.append('cosyvoice2')
-    # Dummy is always available
-    available.append('dummy')
-    return available
-def get_tts_engine(engine_type: Optional[str] = None, lang_code: str = 'z') -> TTSBase:
-    """Get a TTS engine instance
-    Args:
-        engine_type (str, optional): Type of engine to create ('kokoro', 'dia', 'cosyvoice2', 'dummy')
-                                    If None, the best available engine will be used
-        lang_code (str): Language code for the engine
-    Returns:
-        TTSBase: An instance of a TTS engine
-    """
-    # Get available engines
-    available_engines = get_available_engines()
-    logger.info(f"Available TTS engines: {available_engines}")
-    # If engine_type is specified, try to create that specific engine
-    if engine_type is not None:
-        if engine_type == 'kokoro' and KOKORO_AVAILABLE:
-            logger.info("Creating Kokoro TTS engine")
-            return KokoroTTS(lang_code)
-        elif engine_type == 'dia' and DIA_AVAILABLE:
-            logger.info("Creating Dia TTS engine")
-            return DiaTTS(lang_code)
-        elif engine_type == 'cosyvoice2' and COSYVOICE2_AVAILABLE:
-            logger.info("Creating CosyVoice2 TTS engine")
-            return CosyVoice2TTS(lang_code)
-        elif engine_type == 'dummy':
-            logger.info("Creating Dummy TTS engine")
-            return DummyTTS(lang_code)
-        else:
-            logger.warning(f"Requested engine '{engine_type}' is not available")
-    # If no specific engine is requested or the requested engine is not available,
-    # use the best available engine based on priority
-    priority_order = ['cosyvoice2', 'kokoro', 'dia', 'dummy']
-    for engine in priority_order:
-        if engine in available_engines:
-            logger.info(f"Using best available engine: {engine}")
-            if engine == 'kokoro':
-                return KokoroTTS(lang_code)
-            elif engine == 'dia':
-                return DiaTTS(lang_code)
-            elif engine == 'cosyvoice2':
-                return CosyVoice2TTS(lang_code)
-            elif engine == 'dummy':
-                return DummyTTS(lang_code)
-    # Fallback to dummy engine if no engines are available
-    logger.warning("No TTS engines available, falling back to dummy engine")
-    return DummyTTS(lang_code)
-def generate_speech(text: str, engine_type: Optional[str] = None, lang_code: str = 'z',
-                   voice: str = 'default', speed: float = 1.0) -> Optional[str]:
-    """Generate speech using the specified or best available TTS engine
-    Args:
-        text (str): Input text to synthesize
-        engine_type (str, optional): Type of engine to use
-        lang_code (str): Language code
-        voice (str): Voice ID to use
-        speed (float): Speech speed multiplier
-    Returns:
-        Optional[str]: Path to the generated audio file or None if generation fails
-    """
-    engine = get_tts_engine(engine_type, lang_code)
-    return engine.generate_speech(text, voice, speed)
-def generate_speech_stream(text: str, engine_type: Optional[str] = None, lang_code: str = 'z',
-                          voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
-    """Generate speech stream using the specified or best available TTS engine
-    Args:
-        text (str): Input text to synthesize
-        engine_type (str, optional): Type of engine to use
-        lang_code (str): Language code
-        voice (str): Voice ID to use
-        speed (float): Speech speed multiplier
-    Yields:
-        tuple: (sample_rate, audio_data) pairs for each segment
-    """
-    engine = get_tts_engine(engine_type, lang_code)
-    yield from engine.generate_speech_stream(text, voice, speed)

utils/tts_README.md DELETED Viewed

@@ -1,64 +0,0 @@
-# TTS Structure
-This directory contains a Text-to-Speech (TTS) implementation that supports three specific models:
-1. Kokoro: https://github.com/hexgrad/kokoro
-2. Dia: https://github.com/nari-labs/dia
-3. CosyVoice2: https://github.com/FunAudioLLM/CosyVoice
-## Structure
-The TTS implementation follows a simple, clean structure:
-- `tts.py`: Main entry point for TTS functionality (engine selection and the `generate_speech` helpers)
-- `tts_base.py`: Contains the base `TTSBase` abstract class
-- `tts_dummy.py`: `DummyTTS` fallback implementation
-- `tts_kokoro.py`: Kokoro TTS implementation
-- `tts_dia.py`: Dia TTS implementation
-- `tts_cosyvoice2.py`: CosyVoice2 TTS implementation
-## Usage
-```python
-# Import the main TTS functions
-from utils.tts import generate_speech, generate_speech_stream, get_tts_engine
-# Generate speech using the best available engine
-audio_path = generate_speech("Hello, world!")
-# Generate speech using a specific engine
-audio_path = generate_speech("Hello, world!", engine_type="kokoro")
-# Generate speech with specific parameters
-audio_path = generate_speech(
-    "Hello, world!",
-    engine_type="dia",
-    lang_code="en",
-    voice="default",
-    speed=1.0
-)
-# Generate speech stream
-for sample_rate, audio_data in generate_speech_stream("Hello, world!"):
-    # Process audio data
-    pass
-# Get a specific TTS engine instance
-engine = get_tts_engine("kokoro")
-audio_path = engine.generate_speech("Hello, world!")
-```
-## Error Handling
-All TTS implementations include robust error handling:
-1. Each implementation checks for the availability of its dependencies
-2. If a requested engine is unavailable, `get_tts_engine` falls back to the best available engine, with `DummyTTS` as the last resort
-3. The main module prioritizes engines based on availability
-## Adding New Engines
-To add a new TTS engine:
-1. Create a new file `tts_<engine_name>.py`
-2. Implement a class that inherits from `TTSBase`
-3. Add the engine to the available engines list in `tts.py`

utils/tts_base.py DELETED Viewed

@@ -1,69 +0,0 @@
-import logging
-import os
-import time
-import numpy as np
-import soundfile as sf
-from typing import Optional, Generator, Tuple, List
-from abc import ABC, abstractmethod
-# Configure logging
-logger = logging.getLogger(__name__)
-class TTSBase(ABC):
-    """Base class for all TTS engines
-    This abstract class defines the interface that all TTS engines must implement.
-    """
-    def __init__(self, lang_code: str = 'z'):
-        """Initialize the TTS engine
-        Args:
-            lang_code (str): Language code for the engine
-        """
-        self.lang_code = lang_code
-    @abstractmethod
-    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
-        """Generate speech from text
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID to use
-            speed (float): Speech speed multiplier
-        Returns:
-            Optional[str]: Path to the generated audio file or None if generation fails
-        """
-        pass
-    @abstractmethod
-    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
-        """Generate speech stream from text
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID to use
-            speed (float): Speech speed multiplier
-        Yields:
-            tuple: (sample_rate, audio_data) pairs for each segment
-        """
-        pass
-    def _generate_output_path(self, prefix: str = "tts", extension: str = "wav") -> str:
-        """Generate a unique output path for the audio file
-        Args:
-            prefix (str): Prefix for the filename
-            extension (str): File extension
-        Returns:
-            str: Path to the output file
-        """
-        timestamp = int(time.time() * 1000)
-        filename = f"{prefix}_{timestamp}.{extension}"
-        output_dir = os.path.join(os.getcwd(), "output")
-        os.makedirs(output_dir, exist_ok=True)
-        return os.path.join(output_dir, filename)

utils/tts_cosyvoice2.py DELETED Viewed

@@ -1,209 +0,0 @@
-import logging
-import numpy as np
-import soundfile as sf
-from typing import Optional, Generator, Tuple
-from utils.tts_base import TTSBase
-# Configure logging
-logger = logging.getLogger(__name__)
-# Flag to track CosyVoice2 availability
-COSYVOICE2_AVAILABLE = False
-DEFAULT_SAMPLE_RATE = 24000
-# Try to import CosyVoice2 dependencies
-try:
-    import torch
-    import torchaudio
-    # Import CosyVoice2 from the correct package
-    # Based on https://github.com/FunAudioLLM/CosyVoice
-    from cosyvoice.cli.cosyvoice import CosyVoice
-    COSYVOICE2_AVAILABLE = True
-    logger.info("CosyVoice2 TTS engine is available")
-except ImportError as e:
-    logger.warning(f"CosyVoice2 TTS engine is not available - ImportError: {str(e)}")
-    COSYVOICE2_AVAILABLE = False
-except ModuleNotFoundError as e:
-    logger.warning(f"CosyVoice2 TTS engine is not available - ModuleNotFoundError: {str(e)}")
-    COSYVOICE2_AVAILABLE = False
-def _get_model():
-    """Lazy-load the CosyVoice2 model
-    Returns:
-        CosyVoice2 or None: The CosyVoice2 model or None if not available
-    """
-    if not COSYVOICE2_AVAILABLE:
-        logger.warning("CosyVoice2 TTS engine is not available")
-        return None
-    try:
-        import torch
-        import torchaudio
-        from cosyvoice.cli.cosyvoice import CosyVoice
-        # Initialize the model with correct path
-        model = CosyVoice('pretrained_models/CosyVoice-300M')
-        logger.info("CosyVoice2 model successfully loaded")
-        return model
-    except ImportError as e:
-        logger.error(f"Failed to import CosyVoice2 dependencies: {str(e)}")
-        return None
-    except FileNotFoundError as e:
-        logger.error(f"Failed to load CosyVoice2 model files: {str(e)}")
-        return None
-    except Exception as e:
-        logger.error(f"Failed to initialize CosyVoice2 model: {str(e)}")
-        return None
-class CosyVoice2TTS(TTSBase):
-    """CosyVoice2 TTS engine implementation
-    This engine uses the CosyVoice2 model for TTS generation.
-    """
-    def __init__(self, lang_code: str = 'z'):
-        """Initialize the CosyVoice2 TTS engine
-        Args:
-            lang_code (str): Language code for the engine
-        """
-        super().__init__(lang_code)
-        self.model = None
-    def _ensure_model(self):
-        """Ensure the model is loaded
-        Returns:
-            bool: True if model is available, False otherwise
-        """
-        if self.model is None:
-            self.model = _get_model()
-        return self.model is not None
-    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
-        """Generate speech using CosyVoice2 TTS engine
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID (may not be used in CosyVoice2)
-            speed (float): Speech speed multiplier (may not be used in CosyVoice2)
-        Returns:
-            Optional[str]: Path to the generated audio file or None if generation fails
-        """
-        logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")
-        # Check if CosyVoice2 is available
-        if not COSYVOICE2_AVAILABLE:
-            logger.error("CosyVoice2 TTS engine is not available")
-            return None
-        # Ensure model is loaded
-        if not self._ensure_model():
-            logger.error("Failed to load CosyVoice2 model")
-            return None
-        try:
-            import torch
-            # Generate unique output path
-            output_path = self._generate_output_path(prefix="cosyvoice2")
-            # Generate audio using CosyVoice2
-            try:
-                # Use the inference method from CosyVoice
-                output_audio_tensor = self.model.inference_sft(text, '中文女')
-                # Convert tensor to numpy array
-                if isinstance(output_audio_tensor, torch.Tensor):
-                    output_audio_np = output_audio_tensor.cpu().numpy()
-                else:
-                    output_audio_np = output_audio_tensor
-            except Exception as api_error:
-                # Try alternative API if the first one fails
-                try:
-                    output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
-                    if isinstance(output_audio_tensor, torch.Tensor):
-                        output_audio_np = output_audio_tensor.cpu().numpy()
-                    else:
-                        output_audio_np = output_audio_tensor
-                except Exception as alt_error:
-                    logger.error(f"CosyVoice2 inference failed: {str(api_error)}")
-                    return None
-            if output_audio_np is not None:
-                logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
-                sf.write(output_path, output_audio_np, DEFAULT_SAMPLE_RATE)
-                logger.info(f"CosyVoice2 audio generation complete: {output_path}")
-                return output_path
-            else:
-                logger.error("CosyVoice2 model returned None for audio output")
-                return None
-        except Exception as e:
-            logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
-            return None
-    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
-        """Generate speech stream using CosyVoice2 TTS engine
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID (may not be used in CosyVoice2)
-            speed (float): Speech speed multiplier (may not be used in CosyVoice2)
-        Yields:
-            tuple: (sample_rate, audio_data) pairs for each segment
-        """
-        logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")
-        # Check if CosyVoice2 is available
-        if not COSYVOICE2_AVAILABLE:
-            logger.error("CosyVoice2 TTS engine is not available")
-            return
-        # Ensure model is loaded
-        if not self._ensure_model():
-            logger.error("Failed to load CosyVoice2 model")
-            return
-        try:
-            import torch
-            # Generate audio using CosyVoice2
-            try:
-                # Use the inference method from CosyVoice
-                output_audio_tensor = self.model.inference_sft(text, '中文女')
-                # Convert tensor to numpy array
-                if isinstance(output_audio_tensor, torch.Tensor):
-                    output_audio_np = output_audio_tensor.cpu().numpy()
-                else:
-                    output_audio_np = output_audio_tensor
-            except Exception as api_error:
-                # Try alternative API if the first one fails
-                try:
-                    output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
-                    if isinstance(output_audio_tensor, torch.Tensor):
-                        output_audio_np = output_audio_tensor.cpu().numpy()
-                    else:
-                        output_audio_np = output_audio_tensor
-                except Exception as alt_error:
-                    logger.error(f"CosyVoice2 inference failed: {str(api_error)}")
-                    return
-            if output_audio_np is not None:
-                logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
-                yield DEFAULT_SAMPLE_RATE, output_audio_np
-            else:
-                logger.error("CosyVoice2 model returned None for audio output")
-                return
-        except Exception as e:
-            logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
-            return

utils/tts_dia.py DELETED Viewed

@@ -1,201 +0,0 @@
-import logging
-import numpy as np
-import soundfile as sf
-from typing import Optional, Generator, Tuple
-from utils.tts_base import TTSBase
-# Configure logging
-logger = logging.getLogger(__name__)
-# Flag to track Dia availability
-DIA_AVAILABLE = False
-DEFAULT_SAMPLE_RATE = 24000
-# Try to import Dia dependencies
-try:
-    import torch
-    from dia.model import Dia
-    DIA_AVAILABLE = True
-    logger.info("Dia TTS engine is available")
-except ImportError:
-    logger.warning("Dia TTS engine is not available")
-except ModuleNotFoundError as e:
-    if "dac" in str(e):
-        logger.warning("Dia TTS engine is not available due to missing 'dac' module")
-    else:
-        logger.warning(f"Dia TTS engine is not available: {str(e)}")
-    DIA_AVAILABLE = False
-def _get_model():
-    """Lazy-load the Dia model
-    Returns:
-        Dia or None: The Dia model or None if not available
-    """
-    if not DIA_AVAILABLE:
-        logger.warning("Dia TTS engine is not available")
-        return None
-    try:
-        import torch
-        from dia.model import Dia
-        # Initialize the model
-        model = Dia.from_pretrained()
-        logger.info("Dia model successfully loaded")
-        return model
-    except ImportError as e:
-        logger.error(f"Failed to import Dia dependencies: {str(e)}")
-        return None
-    except FileNotFoundError as e:
-        logger.error(f"Failed to load Dia model files: {str(e)}")
-        return None
-    except Exception as e:
-        logger.error(f"Failed to initialize Dia model: {str(e)}")
-        return None
-class DiaTTS(TTSBase):
-    """Dia TTS engine implementation
-    This engine uses the Dia model for TTS generation.
-    """
-    def __init__(self, lang_code: str = 'z'):
-        """Initialize the Dia TTS engine
-        Args:
-            lang_code (str): Language code for the engine
-        """
-        super().__init__(lang_code)
-        self.model = None
-    def _ensure_model(self):
-        """Ensure the model is loaded
-        Returns:
-            bool: True if model is available, False otherwise
-        """
-        if self.model is None:
-            self.model = _get_model()
-        return self.model is not None
-    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
-        """Generate speech using Dia TTS engine
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID (not used in Dia)
-            speed (float): Speech speed multiplier (not used in Dia)
-        Returns:
-            Optional[str]: Path to the generated audio file or None if generation fails
-        """
-        logger.info(f"Generating speech with Dia for text length: {len(text)}")
-        # Check if Dia is available
-        if not DIA_AVAILABLE:
-            logger.error("Dia TTS engine is not available")
-            return None
-        # Ensure model is loaded
-        if not self._ensure_model():
-            logger.error("Failed to load Dia model")
-            return None
-        try:
-            import torch
-            # Generate unique output path
-            output_path = self._generate_output_path(prefix="dia")
-            # Generate audio
-            with torch.inference_mode():
-                output_audio_np = self.model.generate(
-                    text,
-                    max_tokens=None,
-                    cfg_scale=3.0,
-                    temperature=1.3,
-                    top_p=0.95,
-                    cfg_filter_top_k=35,
-                    use_torch_compile=False,
-                    verbose=False
-                )
-            if output_audio_np is not None:
-                logger.info(f"Successfully generated audio with Dia (length: {len(output_audio_np)})")
-                sf.write(output_path, output_audio_np, DEFAULT_SAMPLE_RATE)
-                logger.info(f"Dia audio generation complete: {output_path}")
-                return output_path
-            else:
-                logger.error("Dia model returned None for audio output")
-                return None
-        except ModuleNotFoundError as e:
-            if "dac" in str(e):
-                logger.error("Dia TTS engine failed due to missing 'dac' module")
-            else:
-                logger.error(f"Module not found error in Dia TTS: {str(e)}")
-            return None
-        except Exception as e:
-            logger.error(f"Error generating speech with Dia: {str(e)}", exc_info=True)
-            return None
-    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
-        """Generate speech stream using Dia TTS engine
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID (not used in Dia)
-            speed (float): Speech speed multiplier (not used in Dia)
-        Yields:
-            tuple: (sample_rate, audio_data) pairs for each segment
-        """
-        logger.info(f"Generating speech stream with Dia for text length: {len(text)}")
-        # Check if Dia is available
-        if not DIA_AVAILABLE:
-            logger.error("Dia TTS engine is not available")
-            return
-        # Ensure model is loaded
-        if not self._ensure_model():
-            logger.error("Failed to load Dia model")
-            return
-        try:
-            import torch
-            # Generate audio
-            with torch.inference_mode():
-                output_audio_np = self.model.generate(
-                    text,
-                    max_tokens=None,
-                    cfg_scale=3.0,
-                    temperature=1.3,
-                    top_p=0.95,
-                    cfg_filter_top_k=35,
-                    use_torch_compile=False,
-                    verbose=False
-                )
-            if output_audio_np is not None:
-                logger.info(f"Successfully generated audio with Dia (length: {len(output_audio_np)})")
-                yield DEFAULT_SAMPLE_RATE, output_audio_np
-            else:
-                logger.error("Dia model returned None for audio output")
-                return
-        except ModuleNotFoundError as e:
-            if "dac" in str(e):
-                logger.error("Dia TTS engine failed due to missing 'dac' module")
-            else:
-                logger.error(f"Module not found error in Dia TTS: {str(e)}")
-            return
-        except Exception as e:
-            logger.error(f"Error generating speech stream with Dia: {str(e)}", exc_info=True)
-            return

utils/tts_dummy.py DELETED Viewed

@@ -1,65 +0,0 @@
-import logging
-import os
-import time
-import numpy as np
-import soundfile as sf
-from typing import Optional, Generator, Tuple, List
-from .tts_base import TTSBase
-# Configure logging
-logger = logging.getLogger(__name__)
-class DummyTTS(TTSBase):
-    """Dummy TTS engine that generates sine wave audio
-    This class is used as a fallback when no other TTS engine is available.
-    """
-    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> str:
-        """Generate a dummy sine wave audio file
-        Args:
-            text (str): Input text (not used)
-            voice (str): Voice ID (not used)
-            speed (float): Speech speed multiplier (not used)
-        Returns:
-            str: Path to the generated audio file
-        """
-        logger.info(f"Generating dummy speech for text length: {len(text)}")
-        # Generate a simple sine wave
-        sample_rate = 24000
-        duration = min(len(text) / 20, 10)  # Rough approximation of speech duration
-        t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
-        audio = 0.5 * np.sin(2 * np.pi * 440 * t)  # 440 Hz sine wave
-        # Save to file
-        output_path = self._generate_output_path(prefix="dummy")
-        sf.write(output_path, audio, sample_rate)
-        logger.info(f"Generated dummy audio: {output_path}")
-        return output_path
-    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
-        """Generate a dummy sine wave audio stream
-        Args:
-            text (str): Input text (not used)
-            voice (str): Voice ID (not used)
-            speed (float): Speech speed multiplier (not used)
-        Yields:
-            tuple: (sample_rate, audio_data) pairs
-        """
-        logger.info(f"Generating dummy speech stream for text length: {len(text)}")
-        # Generate a simple sine wave
-        sample_rate = 24000
-        duration = min(len(text) / 20, 10)  # Rough approximation of speech duration
-        t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
-        audio = 0.5 * np.sin(2 * np.pi * 440 * t)  # 440 Hz sine wave
-        # Yield the audio data
-        yield sample_rate, audio

utils/tts_kokoro.py DELETED Viewed

@@ -1,144 +0,0 @@
-import logging
-import numpy as np
-import soundfile as sf
-from typing import Optional, Generator, Tuple
-from utils.tts_base import TTSBase
-# Configure logging
-logger = logging.getLogger(__name__)
-# Flag to track Kokoro availability
-KOKORO_AVAILABLE = False
-# Try to import Kokoro
-try:
-    from kokoro import KPipeline
-    KOKORO_AVAILABLE = True
-    logger.info("Kokoro TTS engine is available")
-except ImportError:
-    logger.warning("Kokoro TTS engine is not available")
-except Exception as e:
-    logger.error(f"Kokoro import failed with unexpected error: {str(e)}")
-    KOKORO_AVAILABLE = False
-def _get_pipeline(lang_code: str = 'z'):
-    """Lazy-load the Kokoro pipeline
-    Args:
-        lang_code (str): Language code for the pipeline
-    Returns:
-        KPipeline or None: The Kokoro pipeline or None if not available
-    """
-    if not KOKORO_AVAILABLE:
-        logger.warning("Kokoro TTS engine is not available")
-        return None
-    try:
-        pipeline = KPipeline(lang_code=lang_code)
-        logger.info("Kokoro pipeline successfully loaded")
-        return pipeline
-    except Exception as e:
-        logger.error(f"Failed to initialize Kokoro pipeline: {str(e)}")
-        return None
-class KokoroTTS(TTSBase):
-    """Kokoro TTS engine implementation
-    This engine uses the Kokoro library for TTS generation.
-    """
-    def __init__(self, lang_code: str = 'z'):
-        """Initialize the Kokoro TTS engine
-        Args:
-            lang_code (str): Language code for the engine
-        """
-        super().__init__(lang_code)
-        self.pipeline = None
-    def _ensure_pipeline(self):
-        """Ensure the pipeline is loaded
-        Returns:
-            bool: True if pipeline is available, False otherwise
-        """
-        if self.pipeline is None:
-            self.pipeline = _get_pipeline(self.lang_code)
-        return self.pipeline is not None
-    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
-        """Generate speech using Kokoro TTS engine
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
-            speed (float): Speech speed multiplier (0.5 to 2.0)
-        Returns:
-            Optional[str]: Path to the generated audio file or None if generation fails
-        """
-        logger.info(f"Generating speech with Kokoro for text length: {len(text)}")
-        # Check if Kokoro is available
-        if not KOKORO_AVAILABLE:
-            logger.error("Kokoro TTS engine is not available")
-            return None
-        # Ensure pipeline is loaded
-        if not self._ensure_pipeline():
-            logger.error("Failed to load Kokoro pipeline")
-            return None
-        try:
-            # Generate unique output path
-            output_path = self._generate_output_path(prefix="kokoro")
-            # Generate speech
-            generator = self.pipeline(text, voice=voice, speed=speed)
-            for _, _, audio in generator:
-                logger.info(f"Saving Kokoro audio to {output_path}")
-                sf.write(output_path, audio, 24000)
-                break
-            logger.info(f"Kokoro audio generation complete: {output_path}")
-            return output_path
-        except Exception as e:
-            logger.error(f"Error generating speech with Kokoro: {str(e)}", exc_info=True)
-            return None
-    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
-        """Generate speech stream using Kokoro TTS engine
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID to use
-            speed (float): Speech speed multiplier
-        Yields:
-            tuple: (sample_rate, audio_data) pairs for each segment
-        """
-        logger.info(f"Generating speech stream with Kokoro for text length: {len(text)}")
-        # Check if Kokoro is available
-        if not KOKORO_AVAILABLE:
-            logger.error("Kokoro TTS engine is not available")
-            return
-        # Ensure pipeline is loaded
-        if not self._ensure_pipeline():
-            logger.error("Failed to load Kokoro pipeline")
-            return
-        try:
-            # Generate speech stream
-            generator = self.pipeline(text, voice=voice, speed=speed)
-            for _, _, audio in generator:
-                yield 24000, audio
-        except Exception as e:
-            logger.error(f"Error generating speech stream with Kokoro: {str(e)}", exc_info=True)
-            return