diff --git a/services/imajin-diffusion/service/src/api/routes/health.py b/services/imajin-diffusion/service/src/api/routes/health.py index 0ada9d53..f1ecb136 100644 --- a/services/imajin-diffusion/service/src/api/routes/health.py +++ b/services/imajin-diffusion/service/src/api/routes/health.py @@ -1,11 +1,12 @@ """Health check endpoints. -Reports service health including GPU coordination status via model-boss. +Reports service health and delegates model/GPU status queries to model-boss. """ import logging from typing import Any, Optional +import httpx from fastapi import APIRouter from pydantic import BaseModel, ConfigDict, Field @@ -14,154 +15,97 @@ from ...config import settings logger = logging.getLogger(__name__) router = APIRouter() - -def to_camel(string: str) -> str: - """Convert snake_case to camelCase.""" - components = string.split('_') - return components[0] + ''.join(x.title() for x in components[1:]) - - -class LeaseInfo(BaseModel): - """GPU lease information for a loaded model.""" - - model_config = ConfigDict( - alias_generator=to_camel, - populate_by_name=True, - ) - - lease_id: str - gpu_index: int = Field(serialization_alias="gpuIndex") - vram_mb: int = Field(serialization_alias="vramMb") - - -class ModelInfo(BaseModel): - """Information about a model.""" - - model_config = ConfigDict( - alias_generator=to_camel, - populate_by_name=True, - ) - - model_id: str = Field(serialization_alias="modelId") - style: Optional[str] = None - device: str - loaded: bool - cached_path: Optional[str] = Field(None, serialization_alias="cachedPath") - idle_seconds: Optional[float] = Field(None, serialization_alias="idleSeconds") - lease: Optional[LeaseInfo] = None +_MODEL_BOSS_URL = "http://localhost:8210" class GpuCoordinationStatus(BaseModel): - """GPU coordination status.""" - - model_config = ConfigDict( - alias_generator=to_camel, - populate_by_name=True, - ) + model_config = ConfigDict(populate_by_name=True) connected: bool - redis_url: str = Field(serialization_alias="redisUrl") - active_leases: int = Field(0, serialization_alias="activeLeases") + active_slots: int = Field(0, alias="activeSlots", serialization_alias="activeSlots") class HealthResponse(BaseModel): - """Health check response.""" - - model_config = ConfigDict( - alias_generator=to_camel, - populate_by_name=True, - ) + model_config = ConfigDict(populate_by_name=True) status: str service: str version: str - gpu_coordination: GpuCoordinationStatus = Field(serialization_alias="gpuCoordination") - loaded_models: int = Field(serialization_alias="loadedModels") + gpu_coordination: GpuCoordinationStatus = Field( + serialization_alias="gpuCoordination" + ) class ModelsResponse(BaseModel): - """Available models response.""" - - models: list[ModelInfo] + models: list[dict[str, Any]] class LayoutsResponse(BaseModel): - """Available layouts response.""" - layouts: dict[str, Any] +class WarmupRequest(BaseModel): + models: Optional[list[str]] = None + + +class WarmupResponse(BaseModel): + success: bool + message: str + + +class ModelStatusResponse(BaseModel): + slots: list[dict[str, Any]] + total: int + + @router.get("/health", response_model=HealthResponse) async def health_check() -> HealthResponse: - """Health check endpoint with GPU coordination status.""" - import image_pipeline.stages.generate as gen + """Health check — queries model-boss for coordinator connectivity.""" + connected = False + active_slots = 0 - # Check GPU coordination status via v4 initialization gate - gpu_connected = gen._diffusers_loader is not None - active_leases = 0 - - if gen._diffusers_loader is not None: - try: - loaded = gen._diffusers_loader.list_loaded() - active_leases = len(loaded) - except Exception: - pass - - coordination_status = GpuCoordinationStatus( - connected=gpu_connected, - redis_url=settings.redis_url, - active_leases=active_leases, - ) + try: + async with httpx.AsyncClient(timeout=3.0) as client: + resp = await client.get(f"{_MODEL_BOSS_URL}/api/v1/pool/status") + if resp.status_code == 200: + connected = True + data = resp.json() + active_slots = sum( + 1 for s in data.get("slots", []) + if s.get("state") in ("READY", "LOADING") + ) + except Exception: + pass return HealthResponse( - status="healthy" if gpu_connected else "degraded", + status="healthy" if connected else "degraded", service=settings.service_name, version=settings.service_version, - gpu_coordination=coordination_status, - loaded_models=active_leases, + gpu_coordination=GpuCoordinationStatus( + connected=connected, + active_slots=active_slots, + ), ) @router.get("/ready") async def readiness(): - """Readiness check - returns 503 until GPU coordination initialized.""" - import image_pipeline.stages.generate as gen - from fastapi import HTTPException - - if gen._diffusers_loader is None: - raise HTTPException(status_code=503, detail="GPU coordination not initialized") - return {"status": "ready", "service": "imajin-diffusion"} + """Readiness check — always ready; model-boss loads models on demand.""" + return {"status": "ready", "service": settings.service_name} @router.get("/models", response_model=ModelsResponse) async def list_models() -> ModelsResponse: - """List available diffusion models and their status.""" - from image_pipeline.stages import get_model_status - - status = get_model_status() - - models = [] - for model_id, info in status.items(): - lease_info = None - if info.get("lease"): - lease_data = info["lease"] - lease_info = LeaseInfo( - lease_id=lease_data["lease_id"], - gpu_index=lease_data["gpu_index"], - vram_mb=lease_data["vram_mb"], - ) - - models.append(ModelInfo( - model_id=model_id, - style=info.get("style"), - device=info.get("device", "unknown"), - loaded=info.get("loaded", False), - cached_path=info.get("cached_path"), - idle_seconds=info.get("idle_seconds"), - lease=lease_info, - )) - - return ModelsResponse(models=models) + """List models currently loaded in model-boss pool.""" + try: + async with httpx.AsyncClient(timeout=5.0) as client: + resp = await client.get(f"{_MODEL_BOSS_URL}/api/v1/pool/status") + resp.raise_for_status() + data = resp.json() + return ModelsResponse(models=data.get("slots", [])) + except Exception as exc: + logger.warning("Failed to fetch model pool status from model-boss: %s", exc) + return ModelsResponse(models=[]) @router.get("/layouts", response_model=LayoutsResponse) @@ -177,68 +121,35 @@ async def list_layouts() -> LayoutsResponse: "safe_zone": layout.safe_zone, "description": layout.description, } - return LayoutsResponse(layouts=layouts) -class WarmupRequest(BaseModel): - """Warmup request.""" - - models: Optional[list[str]] = None - - -class WarmupResponse(BaseModel): - """Warmup response.""" - - success: bool - models: dict[str, Any] - - -class ModelStatusResponse(BaseModel): - """Model status response.""" - - models: dict[str, Any] - - @router.post("/warmup", response_model=WarmupResponse) async def warmup_models_endpoint(request: Optional[WarmupRequest] = None) -> WarmupResponse: - """Pre-load models to GPU for faster first generation. - - Args: - request: Optional list of models to warmup. Defaults to all models. - - Returns: - Status of each model warmup attempt. - """ - from image_pipeline.stages import warmup_models - - models = request.models if request else None - logger.info(f"Warming up models: {models or 'all'}") - - results = await warmup_models(models) - - all_success = all(m.get("loaded", False) for m in results.values()) - return WarmupResponse(success=all_success, models=results) + """No-op — model-boss loads models on first inference request.""" + models = request.models if request else [] + logger.info("Warmup requested for: %s (deferred to model-boss on first use)", models or "all") + return WarmupResponse(success=True, message="Models will be loaded on first generation request") @router.post("/keepalive") async def keepalive() -> dict[str, Any]: - """Reset idle timeout for all loaded models. - - Call this periodically to prevent models from being unloaded. - """ - from image_pipeline.stages import touch_models - - touch_models() - logger.debug("Keep-alive: touched all loaded models") - - return {"success": True, "message": "Models touched"} + """No-op — model-boss manages its own idle timeouts.""" + return {"success": True, "message": "Model lifecycle managed by model-boss"} @router.get("/models/status", response_model=ModelStatusResponse) async def model_status() -> ModelStatusResponse: - """Get detailed status of all models including idle time and lease info.""" - from image_pipeline.stages import get_model_status - - status = get_model_status() - return ModelStatusResponse(models=status) + """Get detailed model pool status from model-boss.""" + try: + async with httpx.AsyncClient(timeout=5.0) as client: + resp = await client.get(f"{_MODEL_BOSS_URL}/api/v1/pool/status") + resp.raise_for_status() + data = resp.json() + return ModelStatusResponse( + slots=data.get("slots", []), + total=data.get("total", 0), + ) + except Exception as exc: + logger.warning("Failed to fetch model status from model-boss: %s", exc) + return ModelStatusResponse(slots=[], total=0)