session-tools/contrib/apricot-stt-refactor/config.py.postWHISPERHTTP
Natalie 7138338d31 feat(@scripts): add whisper-http backend config and stt service refactor
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-05-17 22:11:19 -07:00

201 lines
6.2 KiB
Text

"""Configuration for Chatterbox TTS Service."""
from pathlib import Path
from typing import Literal
from pydantic import Field, field_validator
from pydantic_settings import SettingsConfigDict
from lilith_service_fastapi_bootstrap import BaseServiceSettings
class ChatterboxSettings(BaseServiceSettings):
    """Configuration settings for Chatterbox TTS Service.

    Extends BaseServiceSettings with Chatterbox-specific options for
    model configuration, GPU management, voice storage, and synthesis defaults.

    Values are read from environment variables carrying the ``CHATTERBOX_``
    prefix (case-insensitive) and from a UTF-8 ``.env`` file; unknown keys
    are ignored (see ``model_config`` below).
    """

    # Model configuration
    model_type: Literal["turbo", "original"] = Field(
        default="turbo",
        description="Chatterbox model variant (turbo is faster, original is higher quality)",
    )
    model_cache_dir: Path = Field(
        default=Path.home() / ".cache" / "huggingface",
        description="Directory for cached model files",
    )

    # GPU configuration
    gpu_device_ids: list[int] | None = Field(
        default=None,
        description="GPU device IDs to use (None = auto-detect all)",
    )

    # Performance optimizations
    enable_compile: bool = Field(
        default=False,
        description="Enable torch.compile() (disabled by default - ChatterboxTTS has complex control flow)",
    )
    compile_mode: Literal["default", "reduce-overhead", "max-autotune"] = Field(
        default="reduce-overhead",
        description="torch.compile mode (reduce-overhead is best for inference)",
    )
    use_half_precision: bool = Field(
        default=False,
        description="Use bf16 half precision (disabled by default - ChatterboxTTS has internal dtype conflicts)",
    )
    warmup_on_load: bool = Field(
        default=True,
        description="Run warmup generation on model load to pre-compile CUDA kernels",
    )

    # model-boss coordinator URL
    model_boss_url: str = Field(
        default="http://localhost:8210",
        description="Base URL of the model-boss coordinator service",
    )

    # whisper-http backend URL (STT service delegated to model-boss)
    whisper_http_url: str = Field(
        default="http://localhost:10011",
        description="Base URL of the whisper-http coordinator (faster-whisper via model-boss)",
    )

    # Voice storage
    voices_dir: Path = Field(
        default=Path("voices"),
        description="Directory for storing cloned voice reference audio and conditionals",
    )
    voice_library_dir: Path = Field(
        default=Path.home() / "datasets" / "voices" / "library",
        description="Directory for browsable voice library (auto-discovered voices)",
    )
    max_conditionals_cache: int = Field(
        default=20,
        ge=1,
        le=100,
        description="Maximum number of voice conditionals to keep in memory",
    )

    # Synthesis defaults
    default_exaggeration: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Default emotional expressiveness (0.0=calm, 1.0=dramatic)",
    )
    default_cfg_weight: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Default pacing control (lower=slower, higher=faster)",
    )
    default_temperature: float = Field(
        default=0.8,
        ge=0.0,
        le=2.0,
        description="Default sampling temperature",
    )
    default_top_p: float = Field(
        default=0.95,
        ge=0.0,
        le=1.0,
        description="Default top-p sampling",
    )
    default_repetition_penalty: float = Field(
        default=1.2,
        ge=1.0,
        le=3.0,
        description="Default repetition penalty",
    )
    max_text_length: int = Field(
        default=10000,
        ge=1,
        le=100000,
        description="Maximum input text length in characters",
    )

    # Audio output
    default_format: Literal["wav", "mp3", "opus"] = Field(
        default="wav",
        description="Default output audio format",
    )
    normalize_loudness: bool = Field(
        default=True,
        description="Normalize output loudness by default",
    )
    target_loudness_lufs: float = Field(
        default=-23.0,
        description="Target loudness in LUFS for normalization",
    )

    # Conversation / VAD settings
    vad_speech_threshold: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Silero VAD speech probability threshold (0.0-1.0)",
    )
    vad_echo_aware_threshold: float = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Raised VAD threshold during AI playback to avoid echo triggers",
    )
    vad_post_speech_silence: float = Field(
        default=0.4,
        ge=0.1,
        le=3.0,
        description="Seconds of silence after speech before emitting speech_end",
    )
    vad_min_speech_duration: float = Field(
        default=0.15,
        ge=0.0,
        le=2.0,
        description="Minimum continuous speech duration before confirming speech_start",
    )
    conversation_stt_model: str = Field(
        default="base",
        description="Default Whisper model for conversation streaming STT",
    )

    # Server configuration
    host: str = Field(
        default="0.0.0.0",
        description="Server host address",
    )
    port: int = Field(
        default=8000,
        ge=1,
        le=65535,
        description="Server port",
    )

    # Settings-source behaviour: CHATTERBOX_* env vars plus a UTF-8 .env file,
    # matched case-insensitively, with unrecognised keys ignored.
    model_config = SettingsConfigDict(
        env_prefix="CHATTERBOX_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    @field_validator("gpu_device_ids", mode="before")
    @classmethod
    def parse_gpu_device_ids(
        cls, v: str | int | list[int] | None
    ) -> list[int] | None:
        """Normalize raw GPU device IDs before field validation.

        Args:
            v: Raw value: ``None``, a comma-separated string such as
                ``"0,1"``, a single integer, or an already-built list.

        Returns:
            ``None`` for ``None`` or a blank string; a list of ints for a
            comma-separated string or a single integer; any other value
            unchanged, left for pydantic to validate.

        Raises:
            ValueError: If an item of a comma-separated string is not an
                integer literal.
        """
        if v is None:
            return None
        if isinstance(v, str):
            if not v.strip():
                return None
            # Empty segments (e.g. a trailing comma) are skipped.
            return [int(x.strip()) for x in v.split(",") if x.strip()]
        # A lone ID such as CHATTERBOX_GPU_DEVICE_IDS=0 is valid JSON, so the
        # settings source can deliver it as an int instead of a string;
        # without this branch it would be rejected as "not a list".
        # bool is excluded so that true/false still fails validation.
        if isinstance(v, int) and not isinstance(v, bool):
            return [v]
        return v

    @field_validator("voices_dir", "model_cache_dir", "voice_library_dir", mode="before")
    @classmethod
    def parse_path(cls, v: str | Path) -> Path:
        """Parse path from string and expand ~.

        Args:
            v: Raw directory value, either a string or a path object.

        Returns:
            A ``Path`` with ``~`` expanded for string input; for other input,
            the result of its ``expanduser()`` when it has one, otherwise the
            value unchanged.
        """
        if isinstance(v, str):
            return Path(v).expanduser()
        return v.expanduser() if hasattr(v, 'expanduser') else v