voice/src/voice_tts/tts/utils.py at fcb0f02753674e6a7c3ee82bf4cf42663baf451e

Fork: 0
root / voice
Find file
Newer
Older
voice / src / voice_tts / tts / utils.py
Eugene Sukhodolskiy 5 hours ago 4 KB feat: backend registry, S2-Pro INT4, progressive segmentation, text cleaning
Raw Blame History
"""Helpers for text preprocessing and reference voice management."""

import html
import re
from pathlib import Path


# Punctuation treated as closing a sentence: the ASCII marks, followed by the
# CJK ideographic full stop and the full-width forms of ! ? ; and :.
SENTENCE_ENDINGS = set(".!?;:") | set("。！？；：")

# Emoji range: all Unicode emoji blocks
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001f600-\U0001f64f"  # emoticons
    "\U0001f300-\U0001f5ff"  # symbols & pictographs
    "\U0001f680-\U0001f6ff"  # transport & map symbols
    "\U0001f1e0-\U0001f1ff"  # flags (iOS)
    "\U00002600-\U000027BF"  # misc symbols, dingbats
    "\U0001f900-\U0001f9ff"  # supplemental symbols
    "\U0001fa00-\U0001fa6f"  # chess symbols
    "\U0001fa70-\U0001faff"  # symbols extended-A
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U00002500"  # enclosed / geometric shapes
    "\U00002B05-\U00002B55"  # arrows
    "\U0001d300-\U0001d7ff"  # musical symbols, etc.
    "\U0001f000-\U0001f02f"  # mahjong tiles
    "\U0001f030-\U0001f09f"  # domino tiles
    "\U00002100-\U0000214f"  # letterlike symbols
    "\U0001f0a0-\U0001f0ff"  # playing cards
    "\U0001f600-\U0001f64f"  # emoticons (duplicate range for safety)
    "\U0000FE00-\U0000FE0F"  # variation selectors
    "\U0000FE20-\U0000FE23"  # combining half marks
    "\U0000200D"             # zero-width joiner
    "\U0000200C"             # zero-width non-joiner
    "]+"
)

# Markdown syntax matched for removal.  The pattern is one alternation; at each
# position the alternatives are tried in the order listed.  Exactly three of
# them contain a capture group (link text, image alt text, struck-through
# text); every other alternative captures nothing.  re.MULTILINE makes each
# "^" / "$" below anchor at line boundaries rather than only at string edges.
_MARKDOWN_PATTERN = re.compile(
    r"```[\s\S]*?```"         # fenced code blocks: fences and everything between them
    r"|`[^`\n]+`"              # inline code: backticks and the code itself (single line)
    r"|\[([^\]]+)\]\([^)]+\)"  # markdown links; group 1 captures the link text
    r"|!\[([^\]]*)\]\([^)]+\)" # markdown images; group 2 captures the (possibly empty) alt text
    r"|^#{1,6}\s+"             # heading markers: 1-6 "#" at line start plus following whitespace
    r"|^>+\s+"                 # blockquote markers at line start
    r"|^\s*[-*+]\s+(?![-*+])" # unordered list bullets, unless another bullet char follows
    r"|^\s*\d+[.)]\s+"        # ordered list markers such as "1." or "1)"
    r"|^\s*\|.*\|"            # table rows: from the leading "|" to the last "|" on the line
    r"|^[-=]{3,}\s*$"         # a line consisting only of 3+ "-" or "=" characters
    r"|(?:^|(?<=\s))\*{1,3}(?=\S)"  # opening */**/***: at line start or after whitespace
    r"|(?<=\S)\*{1,3}(?=\s|$)"      # closing */**/***: before whitespace or line end
    r"|(?:^|(?<=\s))_{1,3}(?=\S)"   # opening _/__/___: same positions as the asterisk form
    r"|(?<=\S)_{1,3}(?=\s|$)"       # closing _/__/___: same positions as the asterisk form
    r"|~~(.*?)~~"              # strikethrough pair; group 3 captures the enclosed text
    r"|(?:^|(?<=\s))~{1,3}(?=\S)"   # unpaired opening tilde marker
    r"|(?<=\S)~{1,3}(?=\s|$)"       # unpaired closing tilde marker
, re.MULTILINE)

_URL_PATTERN = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+")

_HTML_PATTERN = re.compile(r"<[^>]+>")

_SPECIAL_SYMBOLS = re.compile(r"[^\w\s.,!?;:\-—\"'()«»„“”‘’…\n]")


def normalize_whitespace(text: str) -> str:
    """Return *text* with every whitespace run reduced to one space.

    Tabs and line breaks count as whitespace, so the result is a single line
    with no leading or trailing space.
    """
    words = text.split()
    return " ".join(words)


# ASCII control characters other than tab, line feed and carriage return.
_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]")


def clean_text_for_tts(text: str) -> str:
    """Strip everything from *text* that a TTS engine should not pronounce.

    The steps run in a fixed order, each on the output of the previous one:

    1. drop ASCII control characters (tab, LF and CR are kept);
    2. drop emoji and related code points;
    3. drop HTML tags, then decode HTML entities such as ``&amp;`` or
       ``&#39;`` so no entity residue is left to be read aloud;
    4. drop markdown syntax, keeping link text, image alt text and
       struck-through text;
    5. drop bare URLs;
    6. drop any remaining symbol outside the allowed character set;
    7. collapse whitespace to single spaces and trim the ends.

    Args:
        text: Raw input, possibly containing markup.

    Returns:
        The cleaned, single-line text.
    """
    text = _CONTROL_CHARS.sub("", text)
    text = _EMOJI_PATTERN.sub("", text)
    text = _HTML_PATTERN.sub("", text)
    # Entities are decoded only after tag removal so that escaped markup such
    # as "&lt;b&gt;" is not turned into a real tag and swallowed whole.
    text = html.unescape(text)
    # Matches that captured text (links, images, strikethrough) are replaced
    # by that text; every other markdown match is replaced by nothing.
    text = _MARKDOWN_PATTERN.sub(
        lambda m: next((g for g in m.groups() if g is not None), ""), text
    )
    # URLs go after markdown so a link whose label is itself a URL is removed.
    text = _URL_PATTERN.sub("", text)
    text = _SPECIAL_SYMBOLS.sub("", text)
    return normalize_whitespace(text)


def preprocess_text_for_tts(text: str) -> str:
    """Prepare raw text for TTS synthesis.

    Entry point that passes *text* to ``clean_text_for_tts`` and returns its
    result as is: control characters, emojis, HTML, markdown, URLs and special
    symbols are removed and whitespace is collapsed.
    """
    cleaned = clean_text_for_tts(text)
    return cleaned


def has_sentence_ending(text: str) -> bool:
    """Tell whether *text* ends with one of the marks in SENTENCE_ENDINGS.

    Trailing whitespace is ignored; empty or all-whitespace text yields False.
    """
    # str.endswith accepts a tuple of candidate suffixes and tests them all.
    endings = tuple(SENTENCE_ENDINGS)
    return text.rstrip().endswith(endings)


def validate_reference_audio(path: Path) -> None:
    """Check that *path* points at an existing file.

    Only existence and file-ness are checked; the audio format is not
    inspected.

    Raises:
        FileNotFoundError: Nothing exists at *path*.
        ValueError: *path* exists but is not a file (for example a directory).
    """
    if path.is_file():
        return
    if path.exists():
        raise ValueError(f"Reference audio path is not a file: {path}")
    raise FileNotFoundError(
        f"Reference audio not found: {path}. "
        f"Place a WAV/MP3 file under {path.parent}/ and retry."
    )