# voice/src/voice_tts/tts/engine.py
"""TTS engine abstraction and dummy backend."""

import asyncio
from abc import ABC, abstractmethod
from pathlib import Path

import numpy as np


class TTSEngine(ABC):
    """Base interface for a TTS backend."""

    sample_rate: int = 24_000

    @abstractmethod
    async def synthesize(
        self,
        text: str,
        ref_audio_path: Path | None,
        language: str,
        speed: float,
        emotion: str,
    ) -> np.ndarray:
        """Return audio as float32 ndarray normalized to [-1, 1]."""
        ...

    @abstractmethod
    async def warm_up(self) -> None:
        """Optional warm-up inference."""
        ...


class DummyTTSEngine(TTSEngine):
    """Model-free backend that emits a 440 Hz test tone instead of speech.

    Useful for testing without a GPU model. Only the length of the input text
    and the ``speed`` factor influence the output; the text content,
    reference audio, language and emotion are ignored.
    """

    def __init__(self, sample_rate: int = 24_000):
        """Store the output sample rate (samples per second)."""
        self.sample_rate = sample_rate

    async def synthesize(
        self,
        text: str,
        ref_audio_path: Path | None,
        language: str,
        speed: float,
        emotion: str,
    ) -> np.ndarray:
        """Return a Hann-windowed 440 Hz tone whose length tracks ``len(text)``.

        The tone lasts ``max(0.5, 0.08 * len(text)) / speed`` seconds.

        Args:
            text: Input text; only its length is used.
            ref_audio_path: Ignored by this backend.
            language: Ignored by this backend.
            speed: Positive rate factor; larger values shorten the tone.
            emotion: Ignored by this backend.

        Returns:
            1-D float32 ndarray with peak amplitude at most 0.3.

        Raises:
            ValueError: If ``speed`` is not a positive number.
        """
        # `not speed > 0` (rather than `speed <= 0`) also rejects NaN, which
        # would otherwise surface later as an opaque int() conversion error.
        if not speed > 0:
            raise ValueError(f"speed must be a positive number, got {speed!r}")

        duration_sec = max(0.5, len(text) * 0.08) / speed
        num_samples = int(self.sample_rate * duration_sec)
        # Place samples on the exact 1/sample_rate grid. Deriving the step from
        # duration_sec / num_samples would drift off 440 Hz whenever int()
        # truncates a fractional sample count.
        t = np.arange(num_samples) / self.sample_rate
        audio = 0.3 * np.sin(2 * np.pi * 440 * t)
        # Hann window over the whole segment: the tone starts and ends at zero
        # amplitude, so there are no clicks at the boundaries.
        audio *= np.hanning(num_samples)
        return audio.astype(np.float32)

    async def warm_up(self) -> None:
        """No-op: there is no model to warm up."""