feat(conversation): Implement multimodal conversation clients for LLM, STT, and TTS integration

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Claude Code 2026-03-29 10:05:35 -07:00
parent eb22858ea2
commit 76cffc50e7
5 changed files with 64 additions and 14 deletions

View file

@ -346,6 +346,8 @@ func _interrupt() -> void:
_transition("interrupted")
await get_tree().create_timer(0.3).timeout
if not is_inside_tree():
return
_transition("listening")
@ -436,10 +438,14 @@ func _sanitize_for_speech(text: String) -> String:
## Returns the exaggeration value to use for [param emotion].
## When CompanionConfig.tts_use_emotion_params is enabled the value is looked up in
## ConversationDefs.EXAGGERATION_MAP, falling back to 0.5 for emotions missing from
## the map; otherwise CompanionConfig.tts_exaggeration is returned for every emotion.
func _emotion_to_exaggeration(emotion: String) -> float:
	if CompanionConfig.tts_use_emotion_params:
		return ConversationDefs.EXAGGERATION_MAP.get(emotion, 0.5)
	return CompanionConfig.tts_exaggeration
## Returns the cfg-weight value to use for [param emotion].
## When CompanionConfig.tts_use_emotion_params is enabled the value is looked up in
## ConversationDefs.CFG_WEIGHT_MAP, falling back to 0.5 for emotions missing from
## the map; otherwise CompanionConfig.tts_cfg_weight is returned for every emotion.
func _emotion_to_cfg_weight(emotion: String) -> float:
	if CompanionConfig.tts_use_emotion_params:
		return ConversationDefs.CFG_WEIGHT_MAP.get(emotion, 0.5)
	return CompanionConfig.tts_cfg_weight

View file

@ -3,7 +3,7 @@ extends Node
## Receives PCM chunks from the duplex client, buffers them via a jitter window,
## then pushes them into an AudioStreamGenerator for seamless playback.
const JITTER_BUFFER_BYTES: int = 8820 # ~200ms at 22050Hz 16-bit mono
const JITTER_BUFFER_BYTES: int = 9600 # ~200ms at 24000Hz 16-bit mono (24000 * 0.2 * 2)
var _audio_player: AudioStreamPlayer
var _duplex_client: Node
@ -24,7 +24,7 @@ func _setup_stream_generator() -> void:
if _audio_player == null:
return
_stream_generator = AudioStreamGenerator.new()
_stream_generator.mix_rate = 22050.0
_stream_generator.mix_rate = 24000.0
_stream_generator.buffer_length = 0.5

View file

@ -50,7 +50,14 @@ func chat(messages: Array[Dictionary]) -> void:
_is_streaming = true
_http_client = HTTPClient.new()
FlightRecorder.record("llm.chat_start", "Chat request", {"messages": messages.size()})
(
FlightRecorder
. record(
"llm.chat_start",
"Chat request",
{"url": _base_url + _api_path, "model": _model, "messages": messages.size()},
)
)
var body := (
JSON
@ -118,7 +125,8 @@ func _start_request(body: String) -> void:
return
if _http_client.get_response_code() != 200:
_emit_error("LLM: HTTP %d" % _http_client.get_response_code())
var error_body: String = await _read_error_body()
_emit_error("LLM: HTTP %d" % _http_client.get_response_code(), error_body)
return
await _read_stream()
@ -186,8 +194,22 @@ func _parse_data(data: String) -> void:
token_received.emit(content)
func _emit_error(message: String) -> void:
## Reads the response body of the current request and returns at most its first
## 512 characters, for attaching to error reports.
## Polls _http_client once per frame for as long as it reports STATUS_BODY. If this
## node leaves the scene tree before the body is complete, whatever was read so far
## is returned.
func _read_error_body() -> String:
	var raw: PackedByteArray = PackedByteArray()
	while _http_client.get_status() == HTTPClient.STATUS_BODY:
		_http_client.poll()
		# Collect raw bytes and decode once after the loop: a multi-byte UTF-8
		# character can be split across two chunks, and decoding each chunk on its
		# own would garble it.
		raw.append_array(_http_client.read_response_body_chunk())
		# get_tree() returns null once the node is outside the tree, so stop
		# rather than awaiting a signal on null.
		if not is_inside_tree():
			break
		await get_tree().process_frame
	return raw.get_string_from_utf8().substr(0, 512)
## Marks streaming as finished and reports a failure.
## [param message] is the error text written to the FlightRecorder and emitted on
## EventBus.backend_error and response_error.
## [param body] is an optional response-body excerpt; when non-empty it is attached
## to the FlightRecorder entry under the "body" key.
func _emit_error(message: String, body: String = "") -> void:
	_is_streaming = false
	var meta: Dictionary = {"url": _base_url + _api_path}
	if not body.is_empty():
		meta["body"] = body
	# Record the error exactly once, with the request URL (and body excerpt) attached.
	FlightRecorder.record("llm.error", message, meta)
	EventBus.backend_error.emit(message)
	response_error.emit(message)

View file

@ -36,7 +36,14 @@ func transcribe(wav_bytes: PackedByteArray) -> void:
]
)
FlightRecorder.record("stt.transcribe", "Transcription request", {"bytes": wav_bytes.size()})
(
FlightRecorder
. record(
"stt.transcribe",
"Transcription request",
{"url": _base_url + "/stt/transcribe", "bytes": wav_bytes.size()},
)
)
var url := _base_url + "/stt/transcribe"
var err := _http.request_raw(url, headers, HTTPClient.METHOD_POST, body)
@ -90,16 +97,22 @@ func _on_request_completed(
body: PackedByteArray,
) -> void:
if result != HTTPRequest.RESULT_SUCCESS:
FlightRecorder.record("stt.error", "Request failed", {"result": result})
EventBus.backend_error.emit("STT request failed: result=%d" % result)
return
if response_code != 200:
var body_str: String = body.get_string_from_utf8().substr(0, 512)
FlightRecorder.record("stt.error", "HTTP %d" % response_code, {"body": body_str})
EventBus.backend_error.emit("STT error: HTTP %d" % response_code)
return
var json := JSON.new()
var parse_err := json.parse(body.get_string_from_utf8())
var json: JSON = JSON.new()
var parse_err: int = json.parse(body.get_string_from_utf8())
if parse_err != OK:
FlightRecorder.record(
"stt.error", "Invalid JSON", {"body": body.get_string_from_utf8().substr(0, 512)}
)
EventBus.backend_error.emit("STT: Invalid JSON response")
return

View file

@ -96,8 +96,9 @@ func _send_request(text: String, exaggeration: float, cfg_weight: float) -> void
]
)
var url := _base_url + "/synthesize"
var err := _http.request(url, headers, HTTPClient.METHOD_POST, body)
var url: String = _base_url + "/synthesize"
FlightRecorder.record("tts.request", text.substr(0, 80), {"url": url})
var err: int = _http.request(url, headers, HTTPClient.METHOD_POST, body)
if err != OK:
EventBus.backend_error.emit("TTS request failed: %s" % error_string(err))
_process_next()
@ -110,6 +111,7 @@ func _on_request_completed(
body: PackedByteArray,
) -> void:
if result != HTTPRequest.RESULT_SUCCESS:
FlightRecorder.record("tts.error", "Request failed", {"result": result})
if _tts_available:
_tts_available = false
EventBus.backend_error.emit("TTS unavailable")
@ -118,6 +120,8 @@ func _on_request_completed(
return
if response_code != 200:
var body_str: String = body.get_string_from_utf8().substr(0, 512)
FlightRecorder.record("tts.error", "HTTP %d" % response_code, {"body": body_str})
if _tts_available:
_tts_available = false
EventBus.backend_error.emit("TTS error: HTTP %d" % response_code)
@ -127,9 +131,12 @@ func _on_request_completed(
_tts_available = true
var json := JSON.new()
var parse_err := json.parse(body.get_string_from_utf8())
var json: JSON = JSON.new()
var parse_err: int = json.parse(body.get_string_from_utf8())
if parse_err != OK:
FlightRecorder.record(
"tts.error", "Invalid JSON", {"body": body.get_string_from_utf8().substr(0, 512)}
)
EventBus.backend_error.emit("TTS: Invalid JSON")
_process_next()
return
@ -137,11 +144,13 @@ func _on_request_completed(
var data: Dictionary = json.data
var audio_b64: String = data.get("audio_base64", "")
if audio_b64.is_empty():
FlightRecorder.record("tts.error", "No audio in response", {})
EventBus.backend_error.emit("TTS: No audio in response")
_process_next()
return
var audio_bytes := Marshalls.base64_to_raw(audio_b64)
var audio_bytes: PackedByteArray = Marshalls.base64_to_raw(audio_b64)
FlightRecorder.record("tts.audio_ready", "Audio received", {"bytes": audio_bytes.size()})
_play_wav(audio_bytes)