201 lines
6.2 KiB
Text
201 lines
6.2 KiB
Text
"""Configuration for Chatterbox TTS Service."""
|
|
|
|
from pathlib import Path
|
|
from typing import Literal
|
|
|
|
from pydantic import Field, field_validator
|
|
from pydantic_settings import SettingsConfigDict
|
|
|
|
from lilith_service_fastapi_bootstrap import BaseServiceSettings
|
|
|
|
|
|
class ChatterboxSettings(BaseServiceSettings):
    """Configuration settings for Chatterbox TTS Service.

    Extends BaseServiceSettings with Chatterbox-specific options for
    model configuration, GPU management, voice storage, and synthesis defaults.

    Settings are configured (see ``model_config``) with the ``CHATTERBOX_``
    environment prefix, a UTF-8 ``.env`` file, case-insensitive names, and
    unknown keys ignored.
    """

    # Model configuration
    model_type: Literal["turbo", "original"] = Field(
        default="turbo",
        description="Chatterbox model variant (turbo is faster, original is higher quality)",
    )
    model_cache_dir: Path = Field(
        default=Path.home() / ".cache" / "huggingface",
        description="Directory for cached model files",
    )

    # GPU configuration
    gpu_device_ids: list[int] | None = Field(
        default=None,
        description="GPU device IDs to use (None = auto-detect all)",
    )

    # Performance optimizations
    enable_compile: bool = Field(
        default=False,
        description="Enable torch.compile() (disabled by default - ChatterboxTTS has complex control flow)",
    )
    compile_mode: Literal["default", "reduce-overhead", "max-autotune"] = Field(
        default="reduce-overhead",
        description="torch.compile mode (reduce-overhead is best for inference)",
    )
    use_half_precision: bool = Field(
        default=False,
        description="Use bf16 half precision (disabled by default - ChatterboxTTS has internal dtype conflicts)",
    )
    warmup_on_load: bool = Field(
        default=True,
        description="Run warmup generation on model load to pre-compile CUDA kernels",
    )

    # model-boss coordinator URL
    model_boss_url: str = Field(
        default="http://localhost:8210",
        description="Base URL of the model-boss coordinator service",
    )
    # whisper-http backend URL (STT service delegated to model-boss)
    whisper_http_url: str = Field(
        default="http://localhost:10011",
        description="Base URL of the whisper-http coordinator (faster-whisper via model-boss)",
    )

    # Voice storage
    voices_dir: Path = Field(
        default=Path("voices"),
        description="Directory for storing cloned voice reference audio and conditionals",
    )
    voice_library_dir: Path = Field(
        default=Path.home() / "datasets" / "voices" / "library",
        description="Directory for browsable voice library (auto-discovered voices)",
    )
    max_conditionals_cache: int = Field(
        default=20,
        ge=1,
        le=100,
        description="Maximum number of voice conditionals to keep in memory",
    )

    # Synthesis defaults
    default_exaggeration: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Default emotional expressiveness (0.0=calm, 1.0=dramatic)",
    )
    default_cfg_weight: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Default pacing control (lower=slower, higher=faster)",
    )
    default_temperature: float = Field(
        default=0.8,
        ge=0.0,
        le=2.0,
        description="Default sampling temperature",
    )
    default_top_p: float = Field(
        default=0.95,
        ge=0.0,
        le=1.0,
        description="Default top-p sampling",
    )
    default_repetition_penalty: float = Field(
        default=1.2,
        ge=1.0,
        le=3.0,
        description="Default repetition penalty",
    )
    max_text_length: int = Field(
        default=10000,
        ge=1,
        le=100000,
        description="Maximum input text length in characters",
    )

    # Audio output
    default_format: Literal["wav", "mp3", "opus"] = Field(
        default="wav",
        description="Default output audio format",
    )
    normalize_loudness: bool = Field(
        default=True,
        description="Normalize output loudness by default",
    )
    target_loudness_lufs: float = Field(
        default=-23.0,
        description="Target loudness in LUFS for normalization",
    )

    # Conversation / VAD settings
    vad_speech_threshold: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Silero VAD speech probability threshold (0.0-1.0)",
    )
    vad_echo_aware_threshold: float = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Raised VAD threshold during AI playback to avoid echo triggers",
    )
    vad_post_speech_silence: float = Field(
        default=0.4,
        ge=0.1,
        le=3.0,
        description="Seconds of silence after speech before emitting speech_end",
    )
    vad_min_speech_duration: float = Field(
        default=0.15,
        ge=0.0,
        le=2.0,
        description="Minimum continuous speech duration before confirming speech_start",
    )
    conversation_stt_model: str = Field(
        default="base",
        description="Default Whisper model for conversation streaming STT",
    )

    # Server configuration
    host: str = Field(
        default="0.0.0.0",
        description="Server host address",
    )
    port: int = Field(
        default=8000,
        ge=1,
        le=65535,
        description="Server port",
    )

    model_config = SettingsConfigDict(
        env_prefix="CHATTERBOX_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    @field_validator("gpu_device_ids", mode="before")
    @classmethod
    def parse_gpu_device_ids(
        cls, v: str | int | list[int] | None
    ) -> list[int] | None:
        """Normalise the raw ``gpu_device_ids`` value before validation.

        Accepted inputs:

        * ``None`` -> ``None`` (auto-detect).
        * A single integer -> one-element list. A bare int can reach this
          validator, e.g. when the settings source JSON-decodes an env value
          such as ``0`` before validation runs.
        * A comma-separated string such as ``"0, 1"`` -> ``[0, 1]``. Empty
          tokens are skipped; a string with no IDs at all (blank, or only
          separators) -> ``None``.
        * Anything else (typically an already-parsed list) is returned
          unchanged for the field's own validation.

        Raises:
            ValueError: If a token in the string is not a valid integer.
        """
        if v is None:
            return None
        # bool is a subclass of int; do not silently turn True/False into a
        # device ID - pass it through so normal validation can reject it.
        if isinstance(v, int) and not isinstance(v, bool):
            return [v]
        if isinstance(v, str):
            tokens = (part.strip() for part in v.split(","))
            device_ids = [int(token) for token in tokens if token]
            # No IDs given means "not configured", same as a blank string.
            return device_ids or None
        return v

    @field_validator("voices_dir", "model_cache_dir", "voice_library_dir", mode="before")
    @classmethod
    def parse_path(cls, v: str | Path) -> Path:
        """Coerce a string to ``Path`` and expand a leading ``~``.

        Strings are wrapped in ``Path`` first; objects that already provide
        ``expanduser`` are expanded directly; any other value is returned
        unchanged.
        """
        if isinstance(v, str):
            return Path(v).expanduser()
        return v.expanduser() if hasattr(v, "expanduser") else v
|