voice/src/voice_tts/config.py at fcb0f02753674e6a7c3ee82bf4cf42663baf451e

Fork: 0
root / voice
Find file
Newer
Older
voice / src / voice_tts / config.py
Eugene Sukhodolskiy 5 hours ago 2 KB feat: backend registry, S2-Pro INT4, progressive segmentation, text cleaning
Raw Blame History
from pathlib import Path

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application configuration loaded from the environment.

    Every attribute below is a setting with a default value. pydantic-settings
    populates the fields from environment variables (matched by field name,
    e.g. ``TTS_SPEED`` for ``tts_speed``) and from the ``.env`` file named in
    ``model_config``; any setting not supplied keeps the default declared here.
    """

    # pydantic v2 configuration. This replaces the nested ``class Config``,
    # which is deprecated in pydantic v2 and triggers a deprecation warning.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
    )

    # Network address, port and log level
    host: str = "0.0.0.0"
    port: int = 8765
    log_level: str = "INFO"

    # TTS model configuration
    # Backend selector: "dummy" / "f5_tts" / "xtts_v2" / "fish_speech"
    tts_backend: str = "fish_speech"
    # XTTS-v2 model name (Coqui model manager path); used when backend is xtts_v2.
    tts_model_name: str = "tts_models/multilingual/multi-dataset/xtts_v2"
    # Local checkpoint path. For Fish Speech this is the folder containing model.pth,
    # firefly-gan-vq-fsq-8x1024-21hz-generator.pth, tokenizer.tiktoken, config.json, etc.
    tts_model_path: Path | None = None
    # Source tree path for Fish Speech modules (e.g. models/fish-speech-v1.5.1).
    # NOTE(review): the field name says "vocab" but the description above is a
    # source-tree path -- confirm against the backends that read this setting.
    tts_vocab_path: Path | None = None
    tts_sample_rate: int = 44_100
    tts_speed: float = 1.2  # env: TTS_SPEED

    # Reference voices directory
    voices_dir: Path = Path("voices")

    # Segmentation thresholds
    min_segment_length: int = 30
    max_segment_length: int = 200
    max_buffer_wait_ms: int = 500
    fast_start_initial: int = 12  # first segment threshold for lower latency
    fast_start_count: int = 3  # how many segments use progressive sizing

    # GPU / inference
    device: str = "cuda"  # or "cpu"
    dtype: str = "bfloat16"

    # Voice reference
    default_voice_ref: Path | None = None  # env: DEFAULT_VOICE_REF
    default_ref_text: str | None = None  # env: DEFAULT_REF_TEXT

    # S2-Pro backend settings
    s2_api_url: str = "http://127.0.0.1:8081"

    # Fish Speech-specific settings
    fish_compile: bool = False  # torch.compile the LLaMA model (slow first run)
    fish_chunk_length: int = 200  # 100-300; higher = longer coherent chunks
    fish_use_memory_cache: str = "on"  # "on" / "off" reference VQ cache
    fish_top_p: float = 0.7  # nucleus sampling (0-1); lower = more deterministic
    fish_temperature: float = 0.7  # sampling temperature; lower = more stable
    fish_repetition_penalty: float = 1.2  # >1 reduces repeated tokens
    fish_seed: int | None = None  # None = random; set for reproducible output
    fish_tail_silence_threshold: float = 0.02  # trim trailing silence below this RMS
    fish_lowpass_cutoff: int = 0  # Hz; low-pass filter output to reduce VQ noise (0 = off)

    # Warm-up
    warmup: bool = False  # run a dummy inference at startup
    warmup_text: str = "Привет. Это тестовая фраза."


# Module-level settings instance, constructed once when this module is first
# imported; constructing it is what reads the environment and the .env file.
settings = Settings()