"""GPU/cuDNN setup command handler for script runner.""" import argparse import json import re import subprocess import sys from pathlib import Path from typing import NamedTuple class GPUInfo(NamedTuple): """GPU detection result.""" available: bool driver_version: str | None gpu_name: str | None memory_total: int | None # MB class CUDAInfo(NamedTuple): """CUDA toolkit detection result.""" available: bool version: str | None path: Path | None class CuDNNInfo(NamedTuple): """cuDNN detection result.""" system_available: bool system_version: str | None pytorch_version: int | None onnx_available: bool # GPU indicators in pyproject.toml dependencies GPU_INDICATORS = [ "torch", "onnxruntime-gpu", "insightface", "diffusers", "transformers", "image-reward", ] def detect_gpu() -> GPUInfo: """Detect NVIDIA GPU via nvidia-smi.""" try: result = subprocess.run( ["nvidia-smi", "--query-gpu=name,driver_version,memory.total", "--format=csv,noheader,nounits"], capture_output=True, text=True, check=True, ) lines = result.stdout.strip().split("\n") if lines and lines[0]: parts = [p.strip() for p in lines[0].split(",")] return GPUInfo( available=True, gpu_name=parts[0] if len(parts) > 0 else None, driver_version=parts[1] if len(parts) > 1 else None, memory_total=int(parts[2]) if len(parts) > 2 else None, ) except (subprocess.CalledProcessError, FileNotFoundError): pass return GPUInfo(available=False, driver_version=None, gpu_name=None, memory_total=None) def detect_cuda() -> CUDAInfo: """Detect CUDA toolkit installation.""" cuda_paths = [ Path("/usr/local/cuda"), Path("/opt/cuda"), ] for cuda_path in cuda_paths: version_json = cuda_path / "version.json" if version_json.exists(): try: with open(version_json) as f: data = json.load(f) version = data.get("cuda", {}).get("version") if version: return CUDAInfo(available=True, version=version, path=cuda_path) except (json.JSONDecodeError, KeyError): pass version_txt = cuda_path / "version.txt" if version_txt.exists(): try: content = version_txt.read_text() match = re.search(r"CUDA Version (\d+\.\d+)", content) if match: return CUDAInfo(available=True, version=match.group(1), path=cuda_path) except IOError: pass return CUDAInfo(available=False, version=None, path=None) def detect_cudnn() -> CuDNNInfo: """Detect cuDNN at system level and in Python packages.""" system_available = False system_version = None pytorch_version = None onnx_available = False # Check system-level cuDNN via ldconfig try: result = subprocess.run( ["ldconfig", "-p"], capture_output=True, text=True, check=True, ) if "libcudnn" in result.stdout: system_available = True # Try to extract version from library name match = re.search(r"libcudnn\.so\.(\d+)", result.stdout) if match: system_version = match.group(1) except (subprocess.CalledProcessError, FileNotFoundError): pass # Check PyTorch bundled cuDNN try: result = subprocess.run( ["python3", "-c", "import torch; print(torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'None')"], capture_output=True, text=True, check=True, timeout=30, ) version_str = result.stdout.strip() if version_str and version_str != "None": pytorch_version = int(version_str) except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired, ValueError): pass # Check onnxruntime GPU support try: result = subprocess.run( ["python3", "-c", "import onnxruntime as ort; providers = ort.get_available_providers(); print('CUDAExecutionProvider' in providers)"], capture_output=True, text=True, check=True, timeout=30, ) onnx_available = result.stdout.strip() == "True" except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired): pass return CuDNNInfo( system_available=system_available, system_version=system_version, pytorch_version=pytorch_version, onnx_available=onnx_available, ) def find_gpu_services(workspace_root: Path) -> list[tuple[str, Path, list[str]]]: """Find services with GPU dependencies. Returns list of (service_name, service_path, gpu_deps). """ services_dir = workspace_root / "services" if not services_dir.exists(): return [] gpu_services = [] for service_dir in services_dir.iterdir(): if not service_dir.is_dir(): continue # Check both direct pyproject.toml and service/pyproject.toml pyproject_paths = [ service_dir / "pyproject.toml", service_dir / "service" / "pyproject.toml", ] for pyproject_path in pyproject_paths: if not pyproject_path.exists(): continue try: content = pyproject_path.read_text() gpu_deps = [ind for ind in GPU_INDICATORS if ind in content] if gpu_deps: # Determine actual service path (where venv should be) if pyproject_path.parent.name == "service": svc_path = pyproject_path.parent else: svc_path = service_dir gpu_services.append((service_dir.name, svc_path, gpu_deps)) break # Don't check both paths for same service except IOError: continue return gpu_services def check_command(args, workspace_root: Path) -> int: """Diagnose GPU/CUDA/cuDNN status.""" print("GPU/CUDA/cuDNN Status Check") print("=" * 60) print() # GPU Detection print("NVIDIA GPU") print("-" * 40) gpu = detect_gpu() if gpu.available: print(f" ✓ GPU: {gpu.gpu_name}") print(f" ✓ Driver: {gpu.driver_version}") print(f" ✓ Memory: {gpu.memory_total} MB") else: print(" ✗ No NVIDIA GPU detected") print(" Run 'nvidia-smi' to diagnose") print() # CUDA Detection print("CUDA Toolkit") print("-" * 40) cuda = detect_cuda() if cuda.available: print(f" ✓ Version: {cuda.version}") print(f" ✓ Path: {cuda.path}") else: print(" ✗ CUDA toolkit not found") print(" Expected at /usr/local/cuda/") print() # cuDNN Detection print("cuDNN") print("-" * 40) cudnn = detect_cudnn() if cudnn.system_available: print(f" ✓ System cuDNN: version {cudnn.system_version or 'unknown'}") else: print(" ✗ System cuDNN: not installed") if cudnn.pytorch_version: print(f" ✓ PyTorch cuDNN: {cudnn.pytorch_version}") else: print(" ○ PyTorch cuDNN: not detected (torch not installed or no CUDA)") if cudnn.onnx_available: print(" ✓ ONNX Runtime: CUDA provider available") else: print(" ○ ONNX Runtime: CUDA provider not available") print() # GPU Services print("GPU Services Detected") print("-" * 40) gpu_services = find_gpu_services(workspace_root) if gpu_services: for name, path, deps in gpu_services: print(f" • {name}") print(f" Path: {path.relative_to(workspace_root)}") print(f" GPU deps: {', '.join(deps)}") venv = path / ".venv" print(f" Venv: {'✓ exists' if venv.exists() else '✗ missing'}") else: print(" No GPU services found in services/") print() # Summary print("=" * 60) if gpu.available and cuda.available: if cudnn.system_available or cudnn.pytorch_version: print("✓ GPU stack ready - cuDNN available") return 0 else: print("⚠ GPU/CUDA ready, but cuDNN not detected") print(" Run: ./run setup-gpu install") return 1 elif gpu.available: print("⚠ GPU available but CUDA toolkit missing") return 1 else: print("✗ No GPU available") return 1 def install_command(args, workspace_root: Path) -> int: """Install PyTorch+CUDA and onnxruntime-gpu in service venvs.""" parser = argparse.ArgumentParser( prog="./run setup-gpu install", description="Install GPU dependencies in service virtualenvs", ) parser.add_argument( "--service", help="Target specific service (default: all GPU services)", ) parser.add_argument( "--dry-run", action="store_true", help="Show what would be installed without installing", ) parser.add_argument( "-v", "--verbose", action="store_true", help="Verbose output", ) parsed = parser.parse_args(args) # Detect CUDA version for wheel selection cuda = detect_cuda() if not cuda.available: print("✗ CUDA toolkit not detected. Install CUDA first.") return 1 # Determine PyTorch CUDA wheel cuda_major = int(cuda.version.split(".")[0]) if cuda.version else 12 if cuda_major >= 13: # CUDA 13 is very new; use cu124 wheels (compatible) pytorch_cuda = "cu124" elif cuda_major == 12: pytorch_cuda = "cu124" else: pytorch_cuda = "cu118" pytorch_index = f"https://download.pytorch.org/whl/{pytorch_cuda}" print(f"PyTorch CUDA wheel: {pytorch_cuda} (system CUDA: {cuda.version})") print(f"PyTorch index: {pytorch_index}") print() # Find target services gpu_services = find_gpu_services(workspace_root) if parsed.service: gpu_services = [(n, p, d) for n, p, d in gpu_services if n == parsed.service] if not gpu_services: print(f"✗ Service '{parsed.service}' not found or has no GPU deps") return 1 if not gpu_services: print("No GPU services found to install") return 0 print(f"Installing GPU dependencies in {len(gpu_services)} service(s)") print("=" * 60) failed = [] succeeded = [] for name, svc_path, deps in gpu_services: print(f"\n▶ {name}") print(f" Path: {svc_path}") print(f" GPU deps: {', '.join(deps)}") venv_path = svc_path / ".venv" if not venv_path.exists(): print(f" ✗ No virtualenv at {venv_path}") print(" Run './run install' first to create venvs") failed.append(name) continue pip_path = venv_path / "bin" / "pip" # Determine what to install install_pytorch = any(d in deps for d in ["torch", "diffusers", "transformers", "image-reward"]) install_onnx = "onnxruntime-gpu" in deps or "insightface" in deps commands = [] if install_pytorch: commands.append(( f"{pip_path} install torch torchvision --index-url {pytorch_index}", "PyTorch+CUDA" )) if install_onnx: commands.append(( f"{pip_path} install onnxruntime-gpu", "onnxruntime-gpu" )) if parsed.dry_run: print(" [DRY RUN] Would install:") for cmd, desc in commands: print(f" • {desc}") succeeded.append(name) continue success = True for cmd, desc in commands: print(f" Installing {desc}...") result = subprocess.run( cmd, shell=True, cwd=svc_path, capture_output=not parsed.verbose, ) if result.returncode != 0: print(f" ✗ Failed to install {desc}") if not parsed.verbose and result.stderr: print(f" {result.stderr.decode()[:200]}") success = False break if success: print(f" ✓ {name} GPU dependencies installed") succeeded.append(name) else: failed.append(name) # Summary print() print("=" * 60) print(f"Installed: {len(succeeded)}/{len(gpu_services)}") if failed: print(f"\nFailed: {', '.join(failed)}") return 1 print("\n✓ All GPU dependencies installed") print("Run './run setup-gpu verify' to test GPU acceleration") return 0 def verify_command(args, workspace_root: Path) -> int: """Run GPU verification tests.""" parser = argparse.ArgumentParser( prog="./run setup-gpu verify", description="Verify GPU acceleration works in service venvs", ) parser.add_argument( "--service", help="Target specific service (default: all GPU services)", ) parsed = parser.parse_args(args) gpu_services = find_gpu_services(workspace_root) if parsed.service: gpu_services = [(n, p, d) for n, p, d in gpu_services if n == parsed.service] if not gpu_services: print("No GPU services found to verify") return 0 print("GPU Verification Tests") print("=" * 60) results = [] for name, svc_path, deps in gpu_services: print(f"\n▶ {name}") venv_path = svc_path / ".venv" if not venv_path.exists(): print(" ✗ No virtualenv") results.append((name, False, "no venv")) continue python_path = venv_path / "bin" / "python" # Test PyTorch CUDA install_pytorch = any(d in deps for d in ["torch", "diffusers", "transformers", "image-reward"]) if install_pytorch: result = subprocess.run( [str(python_path), "-c", "import torch; " "cuda = torch.cuda.is_available(); " "cudnn = torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else None; " "print(f'CUDA:{cuda},cuDNN:{cudnn}')"], capture_output=True, text=True, timeout=60, ) if result.returncode == 0: output = result.stdout.strip() if "CUDA:True" in output: print(f" ✓ PyTorch: {output}") else: print(f" ✗ PyTorch CUDA not available: {output}") results.append((name, False, "PyTorch CUDA unavailable")) continue else: print(f" ✗ PyTorch test failed: {result.stderr[:100]}") results.append((name, False, "PyTorch test failed")) continue # Test ONNX Runtime install_onnx = "onnxruntime-gpu" in deps or "insightface" in deps if install_onnx: result = subprocess.run( [str(python_path), "-c", "import onnxruntime as ort; " "providers = ort.get_available_providers(); " "cuda = 'CUDAExecutionProvider' in providers; " "print(f'CUDA:{cuda},Providers:{providers}')"], capture_output=True, text=True, timeout=60, ) if result.returncode == 0: output = result.stdout.strip() if "CUDA:True" in output: print(f" ✓ ONNX Runtime: CUDA provider available") else: print(f" ✗ ONNX Runtime CUDA not available") results.append((name, False, "ONNX CUDA unavailable")) continue else: print(f" ✗ ONNX test failed: {result.stderr[:100]}") results.append((name, False, "ONNX test failed")) continue results.append((name, True, "OK")) # Summary print() print("=" * 60) passed = sum(1 for _, ok, _ in results if ok) print(f"Verified: {passed}/{len(results)}") failed = [(n, msg) for n, ok, msg in results if not ok] if failed: print("\nFailed:") for name, msg in failed: print(f" ✗ {name}: {msg}") return 1 print("\n✓ All GPU services verified") return 0 def system_command(args, workspace_root: Path) -> int: """Show/install system-level cuDNN.""" parser = argparse.ArgumentParser( prog="./run setup-gpu system", description="Install system-level cuDNN via rpm-ostree", ) parser.add_argument( "--install", action="store_true", help="Actually install (default: show instructions)", ) parsed = parser.parse_args(args) cuda = detect_cuda() if not cuda.available: print("✗ CUDA toolkit not detected") return 1 cuda_major = int(cuda.version.split(".")[0]) if cuda.version else 13 # Determine package names if cuda_major >= 13: packages = ["cudnn9-cuda-13-0", "libcudnn9-cuda-13", "libcudnn9-devel-cuda-13"] else: packages = ["cudnn9-cuda-12", "libcudnn9-cuda-12", "libcudnn9-devel-cuda-12"] if not parsed.install: print("System-level cuDNN Installation") print("=" * 60) print() print("NOTE: Modern PyTorch and onnxruntime-gpu wheels bundle cuDNN.") print("System-level installation is optional but can help with compatibility.") print() print(f"Detected CUDA: {cuda.version}") print(f"Recommended packages: {' '.join(packages)}") print() print("For Bluefin LTS / rpm-ostree systems:") print() print(f" sudo rpm-ostree install {' '.join(packages)}") print(" systemctl reboot # Required for rpm-ostree changes") print() print("Or run with --install flag:") print(" ./run setup-gpu system --install") print() return 0 # Install via rpm-ostree print(f"Installing system cuDNN for CUDA {cuda.version}...") print(f"Packages: {' '.join(packages)}") print() cmd = ["sudo", "rpm-ostree", "install"] + packages result = subprocess.run(cmd) if result.returncode == 0: print() print("✓ cuDNN packages staged for installation") print(" Run 'systemctl reboot' to apply changes") return 0 else: print() print("✗ rpm-ostree install failed") return result.returncode def setup_gpu_command(args, workspace_root: Path) -> int: """Main entry point for setup-gpu command.""" parser = argparse.ArgumentParser( prog="./run setup-gpu", description="GPU/CUDA/cuDNN setup and diagnostics", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Subcommands: check Diagnose GPU/CUDA/cuDNN status (default) install Install PyTorch+CUDA and onnxruntime-gpu in service venvs verify Run GPU verification tests system Show/install system-level cuDNN via rpm-ostree Examples: ./run setup-gpu # Check GPU status ./run setup-gpu check # Same as above ./run setup-gpu install # Install GPU deps in all services ./run setup-gpu install --service imajin-diffusion # Single service ./run setup-gpu verify # Test GPU acceleration ./run setup-gpu system # Show system cuDNN instructions ./run setup-gpu system --install # Install system cuDNN """, ) subcommands = { "check": check_command, "install": install_command, "verify": verify_command, "system": system_command, } # Default to check if no subcommand if not args or args[0].startswith("-"): return check_command(args, workspace_root) subcommand = args[0] if subcommand not in subcommands: parser.print_help() return 1 return subcommands[subcommand](args[1:], workspace_root) def register_setup_gpu_command(runner): """Register the setup-gpu command with the script runner.""" runner.register_command( "setup-gpu", setup_gpu_command, "GPU/CUDA/cuDNN setup and diagnostics", )