set fallback := true

export NVIDIA_DEVICE_PLUGIN_NAMESPACE := env("NVIDIA_DEVICE_PLUGIN_NAMESPACE", "nvidia-device-plugin")
export NVIDIA_DEVICE_PLUGIN_VERSION := env("NVIDIA_DEVICE_PLUGIN_VERSION", "0.18.0")
export GPU_TIME_SLICING_REPLICAS := env("GPU_TIME_SLICING_REPLICAS", "4")

[private]
default:
    @just --list --unsorted --list-submodules

# Add Helm repository
add-helm-repo:
    #!/bin/bash
    set -euo pipefail
    if ! helm repo list | grep -q "^nvdp"; then
        helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
    fi
    helm repo update nvdp

# Remove Helm repository
remove-helm-repo:
    helm repo remove nvdp

# Install NVIDIA device plugin for Kubernetes
install:
    #!/bin/bash
    set -euo pipefail
    just add-helm-repo
    gomplate -f values.gomplate.yaml -o values.generated.yaml
    helm upgrade --install nvidia-device-plugin nvdp/nvidia-device-plugin \
        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
        --create-namespace \
        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
        --values values.generated.yaml \
        --wait
    echo ""
    echo "NVIDIA device plugin installed successfully"
    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"
    echo ""
    echo "Verify GPU availability with:"
    echo " just nvidia-device-plugin::verify"

# Upgrade NVIDIA device plugin
upgrade:
    #!/bin/bash
    set -euo pipefail
    just add-helm-repo
    gomplate -f values.gomplate.yaml -o values.generated.yaml
    helm upgrade nvidia-device-plugin nvdp/nvidia-device-plugin \
        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
        --values values.generated.yaml \
        --wait
    echo ""
    echo "NVIDIA device plugin upgraded successfully"
    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"

# Verify GPU resources are available
verify:
    #!/bin/bash
    set -euo pipefail
    echo "=== GPU Resources per Node ==="
    kubectl get nodes -o json | jq -r '.items[] | "\(.metadata.name): \(.status.capacity["nvidia.com/gpu"] // "0") GPUs (allocatable: \(.status.allocatable["nvidia.com/gpu"] // "0"))"'
    echo ""
    echo "=== Device Plugin Pods ==="
    kubectl get pods -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} -l app.kubernetes.io/name=nvidia-device-plugin
    echo ""
    echo "Test GPU access with:"
    echo " just nvidia-device-plugin::test"

# Show detailed GPU information
gpu-info:
    kubectl get nodes -o json | jq -r '.items[] | select(.status.capacity["nvidia.com/gpu"] != null) | {name: .metadata.name, gpus: .status.capacity["nvidia.com/gpu"], allocatable: .status.allocatable["nvidia.com/gpu"]}'

# Test GPU access by running nvidia-smi in a pod
test:
    #!/bin/bash
    set -euo pipefail
    kubectl delete pod gpu-test --ignore-not-found=true
    kubectl apply -f gpu-test-pod.yaml
    echo "Waiting for pod to complete..."
    kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-test --timeout=60s || true
    echo ""
    echo "=== GPU Test Output ==="
    kubectl logs gpu-test
    kubectl delete pod gpu-test

# Restart device plugin pods to apply configuration changes
restart:
    #!/bin/bash
    set -euo pipefail
    echo "Restarting NVIDIA device plugin pods..."
    kubectl rollout restart daemonset/nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE}
    kubectl rollout status daemonset/nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} --timeout=120s
    echo "NVIDIA device plugin restarted"

# Uninstall NVIDIA device plugin
uninstall:
    #!/bin/bash
    set -euo pipefail
    helm uninstall nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} || true
    echo "NVIDIA device plugin uninstalled"
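
# ---------------------------------------------------------------------------
# The install and upgrade recipes render values.gomplate.yaml with gomplate
# before handing it to Helm. That template is not part of this file; a minimal
# sketch, assuming the chart's `config.map` option carries the time-slicing
# config (exact chart keys depend on the plugin version), might look like:
#
#   config:
#     map:
#       default: |-
#         version: v1
#         sharing:
#           timeSlicing:
#             resources:
#               - name: nvidia.com/gpu
#                 replicas: {{ getenv "GPU_TIME_SLICING_REPLICAS" "4" }}
#
# gomplate would substitute GPU_TIME_SLICING_REPLICAS from the environment
# that this justfile exports when writing values.generated.yaml.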
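
# The test recipe applies gpu-test-pod.yaml, which is also kept alongside this
# justfile. A minimal sketch of a pod named gpu-test that requests one GPU and
# runs nvidia-smi once (the CUDA image tag is an assumption):
#
#   apiVersion: v1
#   kind: Pod
#   metadata:
#     name: gpu-test
#   spec:
#     restartPolicy: Never
#     containers:
#       - name: gpu-test
#         image: nvidia/cuda:12.4.1-base-ubuntu22.04
#         command: ["nvidia-smi"]
#         resources:
#           limits:
#             nvidia.com/gpu: 1
#
# restartPolicy: Never lets the pod reach phase Succeeded, which is what the
# kubectl wait call in the test recipe checks for.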