diff --git a/.env.example b/.env.example index beec715..b875020 100644 --- a/.env.example +++ b/.env.example @@ -6,6 +6,9 @@ LOG_LEVEL=INFO TTS_BACKEND=f5_tts +# F5-TTS model name. Built-in options: F5TTS_v1_Base, F5TTS_v1_Small. +# Downloaded automatically on first use via HuggingFace. +TTS_MODEL_NAME=F5TTS_v1_Base # TTS_MODEL_PATH=models/f5-tts/model.pt # TTS_VOCAB_PATH=models/f5-tts/vocab.txt TTS_SAMPLE_RATE=24000 @@ -13,9 +16,9 @@ VOICES_DIR=voices # Path to default reference audio (relative to project root or absolute). # Providing DEFAULT_VOICE_REF enables instant warm-up and voice cloning. -# DEFAULT_VOICE_REF=voices/rick_ref_clean.wav +DEFAULT_VOICE_REF=voices/rick_ref_clean.wav # Exact transcript of the reference audio. When set, Whisper transcription is skipped. -# DEFAULT_REF_TEXT=Ва-ба-ла-ба-дап-дап! Рикки-тики-тави, сученька! И вот такие у нас новости! Иди. +DEFAULT_REF_TEXT="Ва-ба-ла-ба-дап-дап! Рикки-тики-тави, сученька! И вот такие у нас новости! Иди." MIN_SEGMENT_LENGTH=30 MAX_SEGMENT_LENGTH=200 diff --git a/src/voice_tts/api/server.py b/src/voice_tts/api/server.py index b0b62cc..1c27a71 100644 --- a/src/voice_tts/api/server.py +++ b/src/voice_tts/api/server.py @@ -336,7 +336,13 @@ f"Unknown TTS backend: {backend}. " f"Available backends: {list(_BACKEND_MAP.keys())}" ) - engine = engine_cls(sample_rate=settings.tts_sample_rate) + if engine_cls is F5TTSEngine: + engine = engine_cls( + model=settings.tts_model_name, + sample_rate=settings.tts_sample_rate, + ) + else: + engine = engine_cls(sample_rate=settings.tts_sample_rate) if hasattr(engine, "load"): engine.load() return engine diff --git a/src/voice_tts/config.py b/src/voice_tts/config.py index 57dedcf..00d3788 100644 --- a/src/voice_tts/config.py +++ b/src/voice_tts/config.py @@ -11,6 +11,7 @@ # TTS model configuration tts_backend: str = "f5_tts" # or "dummy" for tests + tts_model_name: str = "F5TTS_v1_Base" # env: TTS_MODEL_NAME tts_model_path: Path | None = None tts_vocab_path: Path | None = None tts_sample_rate: int = 24_000