"""Configuration for Chatterbox TTS Service.""" from pathlib import Path from typing import Literal from pydantic import Field, field_validator from pydantic_settings import SettingsConfigDict from lilith_service_fastapi_bootstrap import BaseServiceSettings class ChatterboxSettings(BaseServiceSettings): """Configuration settings for Chatterbox TTS Service. Extends BaseServiceSettings with Chatterbox-specific options for model configuration, GPU management, voice storage, and synthesis defaults. """ # Model configuration model_type: Literal["turbo", "original"] = Field( default="turbo", description="Chatterbox model variant (turbo is faster, original is higher quality)", ) model_cache_dir: Path = Field( default=Path.home() / ".cache" / "huggingface", description="Directory for cached model files", ) # GPU configuration gpu_device_ids: list[int] | None = Field( default=None, description="GPU device IDs to use (None = auto-detect all)", ) # Performance optimizations enable_compile: bool = Field( default=False, description="Enable torch.compile() (disabled by default - ChatterboxTTS has complex control flow)", ) compile_mode: Literal["default", "reduce-overhead", "max-autotune"] = Field( default="reduce-overhead", description="torch.compile mode (reduce-overhead is best for inference)", ) use_half_precision: bool = Field( default=False, description="Use bf16 half precision (disabled by default - ChatterboxTTS has internal dtype conflicts)", ) warmup_on_load: bool = Field( default=True, description="Run warmup generation on model load to pre-compile CUDA kernels", ) # model-boss coordinator URL model_boss_url: str = Field( default="http://localhost:8210", description="Base URL of the model-boss coordinator service", ) # whisper-http backend URL (STT service delegated to model-boss) whisper_http_url: str = Field( default="http://localhost:10011", description="Base URL of the whisper-http coordinator (faster-whisper via model-boss)", ) # Voice storage voices_dir: Path = Field( default=Path("voices"), description="Directory for storing cloned voice reference audio and conditionals", ) voice_library_dir: Path = Field( default=Path.home() / "datasets" / "voices" / "library", description="Directory for browsable voice library (auto-discovered voices)", ) max_conditionals_cache: int = Field( default=20, ge=1, le=100, description="Maximum number of voice conditionals to keep in memory", ) # Synthesis defaults default_exaggeration: float = Field( default=0.5, ge=0.0, le=1.0, description="Default emotional expressiveness (0.0=calm, 1.0=dramatic)", ) default_cfg_weight: float = Field( default=0.5, ge=0.0, le=1.0, description="Default pacing control (lower=slower, higher=faster)", ) default_temperature: float = Field( default=0.8, ge=0.0, le=2.0, description="Default sampling temperature", ) default_top_p: float = Field( default=0.95, ge=0.0, le=1.0, description="Default top-p sampling", ) default_repetition_penalty: float = Field( default=1.2, ge=1.0, le=3.0, description="Default repetition penalty", ) max_text_length: int = Field( default=10000, ge=1, le=100000, description="Maximum input text length in characters", ) # Audio output default_format: Literal["wav", "mp3", "opus"] = Field( default="wav", description="Default output audio format", ) normalize_loudness: bool = Field( default=True, description="Normalize output loudness by default", ) target_loudness_lufs: float = Field( default=-23.0, description="Target loudness in LUFS for normalization", ) # Conversation / VAD settings vad_speech_threshold: float = Field( default=0.5, ge=0.0, le=1.0, description="Silero VAD speech probability threshold (0.0-1.0)", ) vad_echo_aware_threshold: float = Field( default=0.7, ge=0.0, le=1.0, description="Raised VAD threshold during AI playback to avoid echo triggers", ) vad_post_speech_silence: float = Field( default=0.4, ge=0.1, le=3.0, description="Seconds of silence after speech before emitting speech_end", ) vad_min_speech_duration: float = Field( default=0.15, ge=0.0, le=2.0, description="Minimum continuous speech duration before confirming speech_start", ) conversation_stt_model: str = Field( default="base", description="Default Whisper model for conversation streaming STT", ) # Server configuration host: str = Field( default="0.0.0.0", description="Server host address", ) port: int = Field( default=8000, ge=1, le=65535, description="Server port", ) model_config = SettingsConfigDict( env_prefix="CHATTERBOX_", env_file=".env", env_file_encoding="utf-8", case_sensitive=False, extra="ignore", ) @field_validator("gpu_device_ids", mode="before") @classmethod def parse_gpu_device_ids(cls, v: str | list[int] | None) -> list[int] | None: """Parse GPU device IDs from comma-separated string or list.""" if v is None: return None if isinstance(v, str): if not v.strip(): return None return [int(x.strip()) for x in v.split(",") if x.strip()] return v @field_validator("voices_dir", "model_cache_dir", "voice_library_dir", mode="before") @classmethod def parse_path(cls, v: str | Path) -> Path: """Parse path from string and expand ~.""" if isinstance(v, str): return Path(v).expanduser() return v.expanduser() if hasattr(v, 'expanduser') else v