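# Justfile for installing and managing the NVIDIA Kubernetes device plugin via Helm,
# with GPU time-slicing enabled so multiple pods can share a single GPU.
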
set fallback := true

export NVIDIA_DEVICE_PLUGIN_NAMESPACE := env("NVIDIA_DEVICE_PLUGIN_NAMESPACE", "nvidia-device-plugin")
export NVIDIA_DEVICE_PLUGIN_VERSION := env("NVIDIA_DEVICE_PLUGIN_VERSION", "0.18.0")
export GPU_TIME_SLICING_REPLICAS := env("GPU_TIME_SLICING_REPLICAS", "4")

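# Note: GPU_TIME_SLICING_REPLICAS sets how many workloads can share each physical
# GPU. With time-slicing enabled, the device plugin advertises every GPU as that
# many nvidia.com/gpu resources, so node capacity reports replicas x physical GPUs.
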
[private]
default:
    @just --list --unsorted --list-submodules

# Add Helm repository
add-helm-repo:
    #!/bin/bash
    set -euo pipefail
    if ! helm repo list | grep -q "^nvdp"; then
        helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
    fi
    helm repo update nvdp

# Remove Helm repository
remove-helm-repo:
    helm repo remove nvdp

# Install NVIDIA device plugin for Kubernetes
install:
    #!/bin/bash
    set -euo pipefail

    just add-helm-repo

    gomplate -f values.gomplate.yaml -o values.generated.yaml

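    # The rendered values are assumed to enable time-slicing roughly as sketched
    # below (illustrative only; the actual template is values.gomplate.yaml):
    #
    #   config:
    #     map:
    #       default: |-
    #         version: v1
    #         sharing:
    #           timeSlicing:
    #             resources:
    #               - name: nvidia.com/gpu
    #                 replicas: ${GPU_TIME_SLICING_REPLICAS}
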
    helm upgrade --install nvidia-device-plugin nvdp/nvidia-device-plugin \
        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
        --create-namespace \
        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
        --values values.generated.yaml \
        --wait

    echo ""
    echo "NVIDIA device plugin installed successfully"
    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"
    echo ""
    echo "Verify GPU availability with:"
    echo " just nvidia-device-plugin::verify"

# Upgrade NVIDIA device plugin
upgrade:
    #!/bin/bash
    set -euo pipefail

    just add-helm-repo

    gomplate -f values.gomplate.yaml -o values.generated.yaml

    helm upgrade nvidia-device-plugin nvdp/nvidia-device-plugin \
        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
        --values values.generated.yaml \
        --wait

    echo ""
    echo "NVIDIA device plugin upgraded successfully"
    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"

# Verify GPU resources are available
verify:
    #!/bin/bash
    set -euo pipefail

    echo "=== GPU Resources per Node ==="
    kubectl get nodes -o json | jq -r '.items[] | "\(.metadata.name): \(.status.capacity["nvidia.com/gpu"] // "0") GPUs (allocatable: \(.status.allocatable["nvidia.com/gpu"] // "0"))"'

    echo ""
    echo "=== Device Plugin Pods ==="
    kubectl get pods -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} -l app.kubernetes.io/name=nvidia-device-plugin

    echo ""
    echo "Test GPU access with:"
    echo " just nvidia-device-plugin::test"

# Show detailed GPU information
gpu-info:
    kubectl get nodes -o json | jq -r '.items[] | select(.status.capacity["nvidia.com/gpu"] != null) | {name: .metadata.name, gpus: .status.capacity["nvidia.com/gpu"], allocatable: .status.allocatable["nvidia.com/gpu"]}'

# Test GPU access by running nvidia-smi in a pod
test:
    #!/bin/bash
    set -euo pipefail

    kubectl delete pod gpu-test --ignore-not-found=true
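    # gpu-test-pod.yaml is assumed to define roughly the following pod
    # (illustrative sketch; the real manifest lives next to this Justfile):
    #
    #   apiVersion: v1
    #   kind: Pod
    #   metadata:
    #     name: gpu-test
    #   spec:
    #     restartPolicy: Never
    #     containers:
    #       - name: nvidia-smi
    #         image: nvidia/cuda:12.4.1-base-ubuntu22.04
    #         command: ["nvidia-smi"]
    #         resources:
    #           limits:
    #             nvidia.com/gpu: 1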
    kubectl apply -f gpu-test-pod.yaml

    echo "Waiting for pod to complete..."
    kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-test --timeout=60s || true

    echo ""
    echo "=== GPU Test Output ==="
    kubectl logs gpu-test

    kubectl delete pod gpu-test

# Restart device plugin pods to apply configuration changes
restart:
    #!/bin/bash
    set -euo pipefail
    echo "Restarting NVIDIA device plugin pods..."
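    # The DaemonSet name is assumed to match the Helm release name
    # (nvidia-device-plugin); adjust if the release is installed under another name.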
    kubectl rollout restart daemonset/nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE}
    kubectl rollout status daemonset/nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} --timeout=120s
    echo "NVIDIA device plugin restarted"

# Uninstall NVIDIA device plugin
uninstall:
    #!/bin/bash
    set -euo pipefail

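    # Removes the Helm release only; the namespace created during install and the
    # locally generated values.generated.yaml are left in place.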
    helm uninstall nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} || true
    echo "NVIDIA device plugin uninstalled"