"""Helpers for text preprocessing and reference voice management."""
import re
from pathlib import Path
# Common sentence-ending punctuation for multiple languages.
SENTENCE_ENDINGS = {
".", "!", "?", ";", ":",
"。", "!", "?", ";", ":",
}
# Matches one or more consecutive characters from the emoji / pictograph
# ranges listed below, plus the invisible joiners and selectors that glue
# emoji sequences together. The ranges are concatenated into one character
# class; a few of them overlap, which is harmless inside a class.
_EMOJI_PATTERN = re.compile(
    "["
    + "".join(
        (
            "\U0001f600-\U0001f64f",  # Emoticons
            "\U0001f300-\U0001f5ff",  # Miscellaneous Symbols and Pictographs
            "\U0001f680-\U0001f6ff",  # Transport and Map Symbols
            "\U0001f1e0-\U0001f1ff",  # regional indicators (flag sequences)
            "\U00002600-\U000027BF",  # Miscellaneous Symbols + Dingbats
            "\U0001f900-\U0001f9ff",  # Supplemental Symbols and Pictographs
            "\U0001fa00-\U0001fa6f",  # Chess Symbols
            "\U0001fa70-\U0001faff",  # Symbols and Pictographs Extended-A
            "\U00002702-\U000027B0",  # dingbats (already inside U+2600-27BF)
            "\U000024C2-\U00002500",  # tail of Enclosed Alphanumerics up to U+2500
            "\U00002B05-\U00002B55",  # part of Miscellaneous Symbols and Arrows
            # NOTE: U+1D300-1D7FF covers Tai Xuan Jing symbols, counting rod
            # numerals and Mathematical Alphanumeric Symbols (styled letters
            # and digits); the Musical Symbols block is not in this range.
            "\U0001d300-\U0001d7ff",
            "\U0001f000-\U0001f02f",  # Mahjong Tiles
            "\U0001f030-\U0001f09f",  # Domino Tiles
            "\U00002100-\U0000214f",  # Letterlike Symbols
            "\U0001f0a0-\U0001f0ff",  # Playing Cards
            "\U0001f600-\U0001f64f",  # Emoticons again (repeat of the first range)
            "\U0000FE00-\U0000FE0F",  # Variation Selectors
            "\U0000FE20-\U0000FE23",  # first four Combining Half Marks
            "\U0000200D",  # zero-width joiner
            "\U0000200C",  # zero-width non-joiner
        )
    )
    + "]+"
)
# One alternation covering the markdown constructs that should not be spoken.
# Alternatives are tried in the order listed. Only three of them capture:
#   group 1 = link text, group 2 = image alt text, group 3 = struck-through text
# so a substitution callback can keep that text and drop everything else.
# re.MULTILINE makes "^" / "$" anchor at every line, not just the whole string.
_MARKDOWN_PATTERN = re.compile(
    "|".join(
        (
            r"```[\s\S]*?```",  # fenced code block, contents included
            r"`[^`\n]+`",  # inline code span on a single line
            r"\[([^\]]+)\]\([^)]+\)",  # [text](target) -> group 1
            r"!\[([^\]]*)\]\([^)]+\)",  # ![alt](target) -> group 2
            r"^#{1,6}\s+",  # heading marker at line start
            r"^>+\s+",  # blockquote marker(s)
            r"^\s*[-*+]\s+(?![-*+])",  # bullet of an unordered list
            r"^\s*\d+[.)]\s+",  # "1." / "1)" of an ordered list
            r"^\s*\|.*\|",  # table row, from first to last pipe
            r"^[-=]{3,}\s*$",  # horizontal rule / setext underline
            r"(?:^|(?<=\s))\*{1,3}(?=\S)",  # opening * / ** / ***
            r"(?<=\S)\*{1,3}(?=\s|$)",  # closing * / ** / ***
            r"(?:^|(?<=\s))_{1,3}(?=\S)",  # opening _ / __ / ___
            r"(?<=\S)_{1,3}(?=\s|$)",  # closing _ / __ / ___
            r"~~(.*?)~~",  # ~~text~~ -> group 3
            r"(?:^|(?<=\s))~{1,3}(?=\S)",  # stray opening tilde run
            r"(?<=\S)~{1,3}(?=\s|$)",  # stray closing tilde run
        )
    ),
    re.MULTILINE,
)
_URL_PATTERN = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+")
_HTML_PATTERN = re.compile(r"<[^>]+>")
_SPECIAL_SYMBOLS = re.compile(r"[^\w\s.,!?;:\-—\"'()«»„“”‘’…\n]")
def normalize_whitespace(text: str) -> str:
    """Return *text* with every whitespace run squeezed to one space.

    Leading and trailing whitespace is dropped entirely, so an empty or
    whitespace-only input yields an empty string.
    """
    # Splitting on whitespace discards the runs (and the edges); joining the
    # remaining pieces with a single space rebuilds the normalized string.
    pieces = text.split()
    return " ".join(pieces)
def clean_text_for_tts(text: str) -> str:
    """Strip everything from *text* that should not be read aloud.

    The stages run in a fixed order: control characters, emoji, HTML tags,
    markdown syntax, bare URLs and finally any character outside the
    ``_SPECIAL_SYMBOLS`` keep-list. The result is whitespace-normalized.
    """

    def _keep_captured(match):
        # Link text, image alt text and struck-through text are the only
        # captured groups; keep whichever one took part in the match and
        # drop the surrounding syntax. Non-capturing matches vanish.
        for captured in match.groups():
            if captured is not None:
                return captured
        return ""

    # C0 control characters except tab, line feed and carriage return, plus DEL.
    cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)

    # Markdown is handled before URLs so that a link target disappears
    # together with its brackets while the link text survives.
    stages = (
        (_EMOJI_PATTERN, ""),
        (_HTML_PATTERN, ""),
        (_MARKDOWN_PATTERN, _keep_captured),
        (_URL_PATTERN, ""),
        (_SPECIAL_SYMBOLS, ""),
    )
    for pattern, replacement in stages:
        cleaned = pattern.sub(replacement, cleaned)

    return normalize_whitespace(cleaned)
def preprocess_text_for_tts(text: str) -> str:
    """Prepare *text* for speech synthesis.

    This is a thin entry point that delegates to ``clean_text_for_tts``,
    which strips control characters, emoji, HTML, markdown, URLs and special
    symbols and then collapses whitespace.
    """
    cleaned = clean_text_for_tts(text)
    return cleaned
def has_sentence_ending(text: str) -> bool:
    """Return True when *text*, ignoring trailing whitespace, ends in one of
    the marks listed in ``SENTENCE_ENDINGS``."""
    # str.endswith accepts a tuple of candidates and tests them in one call.
    candidates = tuple(SENTENCE_ENDINGS)
    return text.rstrip().endswith(candidates)
def validate_reference_audio(path: Path) -> None:
    """Check that *path* points at an existing regular file.

    Only existence and file-ness are verified; the audio format itself is
    not inspected.

    Raises:
        FileNotFoundError: nothing exists at *path*.
        ValueError: *path* exists but is not a file (e.g. a directory).
    """
    missing = not path.exists()
    if missing:
        hint = (
            f"Reference audio not found: {path}. "
            f"Place a WAV/MP3 file under {path.parent}/ and retry."
        )
        raise FileNotFoundError(hint)

    if path.is_file():
        return

    raise ValueError(f"Reference audio path is not a file: {path}")