nvidia-device-plugin: configurable GPU time-slicing via gomplate-rendered Helm values

  * Helm values are now rendered from values.gomplate.yaml into
    values.generated.yaml (git-ignored) with gomplate before every
    install/upgrade; the static values.yaml is removed.
  * GPU_TIME_SLICING_REPLICAS (default "4") is exported by the justfile and
    substituted into the time-slicing `replicas` field for nvidia.com/gpu.
  * New recipes: add-helm-repo, remove-helm-repo, upgrade, restart.
  * verify additionally prints the allocatable nvidia.com/gpu count per node.
  * add-helm-repo matches the repo name exactly (name followed by whitespace)
    so that a repo such as "nvdp-mirror" is not mistaken for "nvdp".
  * Namespace/version expansions passed to helm and kubectl are double-quoted.

NOTE(review): indentation and blank-line placement in the hunks below were
reconstructed from a whitespace-collapsed copy of this patch; hunk line counts
were re-checked, but confirm the context against the real justfile before
applying. The post-image blob id on the justfile "index" line predates the
quoting/grep changes.

diff --git a/nvidia-device-plugin/.gitignore b/nvidia-device-plugin/.gitignore
new file mode 100644
index 0000000..a2329b3
--- /dev/null
+++ b/nvidia-device-plugin/.gitignore
@@ -0,0 +1 @@
+values.generated.yaml
diff --git a/nvidia-device-plugin/justfile b/nvidia-device-plugin/justfile
index 9a77ca1..f94058b 100644
--- a/nvidia-device-plugin/justfile
+++ b/nvidia-device-plugin/justfile
@@ -2,40 +2,74 @@ set fallback := true
 
 export NVIDIA_DEVICE_PLUGIN_NAMESPACE := env("NVIDIA_DEVICE_PLUGIN_NAMESPACE", "nvidia-device-plugin")
 export NVIDIA_DEVICE_PLUGIN_VERSION := env("NVIDIA_DEVICE_PLUGIN_VERSION", "0.18.0")
+export GPU_TIME_SLICING_REPLICAS := env("GPU_TIME_SLICING_REPLICAS", "4")
 
 [private]
 default:
     @just --list --unsorted --list-submodules
 
+# Add Helm repository
+add-helm-repo:
+    #!/bin/bash
+    set -euo pipefail
+    if ! helm repo list | grep -qE '^nvdp[[:space:]]'; then
+        helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
+    fi
+    helm repo update nvdp
+
+# Remove Helm repository
+remove-helm-repo:
+    helm repo remove nvdp
+
 # Install NVIDIA device plugin for Kubernetes
 install:
     #!/bin/bash
     set -euo pipefail
 
-    if ! helm repo list | grep -q "^nvdp"; then
-        helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
-    fi
-    helm repo update nvdp
+    just add-helm-repo
+
+    gomplate -f values.gomplate.yaml -o values.generated.yaml
 
     helm upgrade --install nvidia-device-plugin nvdp/nvidia-device-plugin \
-        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
+        --namespace "${NVIDIA_DEVICE_PLUGIN_NAMESPACE}" \
         --create-namespace \
-        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
-        --values values.yaml \
+        --version "${NVIDIA_DEVICE_PLUGIN_VERSION}" \
+        --values values.generated.yaml \
         --wait
 
-    echo "✓ NVIDIA device plugin installed successfully"
+    echo ""
+    echo "NVIDIA device plugin installed successfully"
+    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"
     echo ""
     echo "Verify GPU availability with:"
     echo " just nvidia-device-plugin::verify"
 
+# Upgrade NVIDIA device plugin
+upgrade:
+    #!/bin/bash
+    set -euo pipefail
+
+    just add-helm-repo
+
+    gomplate -f values.gomplate.yaml -o values.generated.yaml
+
+    helm upgrade nvidia-device-plugin nvdp/nvidia-device-plugin \
+        --namespace "${NVIDIA_DEVICE_PLUGIN_NAMESPACE}" \
+        --version "${NVIDIA_DEVICE_PLUGIN_VERSION}" \
+        --values values.generated.yaml \
+        --wait
+
+    echo ""
+    echo "NVIDIA device plugin upgraded successfully"
+    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"
+
 # Verify GPU resources are available
 verify:
     #!/bin/bash
     set -euo pipefail
 
     echo "=== GPU Resources per Node ==="
-    kubectl get nodes -o json | jq -r '.items[] | "\(.metadata.name): \(.status.capacity["nvidia.com/gpu"] // "0") GPUs"'
+    kubectl get nodes -o json | jq -r '.items[] | "\(.metadata.name): \(.status.capacity["nvidia.com/gpu"] // "0") GPUs (allocatable: \(.status.allocatable["nvidia.com/gpu"] // "0"))"'
 
     echo ""
     echo "=== Device Plugin Pods ==="
@@ -66,10 +100,19 @@ test:
 
     kubectl delete pod gpu-test
 
+# Restart device plugin pods to apply configuration changes
+restart:
+    #!/bin/bash
+    set -euo pipefail
+    echo "Restarting NVIDIA device plugin pods..."
+    kubectl rollout restart daemonset/nvidia-device-plugin -n "${NVIDIA_DEVICE_PLUGIN_NAMESPACE}"
+    kubectl rollout status daemonset/nvidia-device-plugin -n "${NVIDIA_DEVICE_PLUGIN_NAMESPACE}" --timeout=120s
+    echo "NVIDIA device plugin restarted"
+
 # Uninstall NVIDIA device plugin
 uninstall:
     #!/bin/bash
     set -euo pipefail
 
-    helm uninstall nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} || true
-    echo "✓ NVIDIA device plugin uninstalled"
+    helm uninstall nvidia-device-plugin -n "${NVIDIA_DEVICE_PLUGIN_NAMESPACE}" || true
+    echo "NVIDIA device plugin uninstalled"
diff --git a/nvidia-device-plugin/values.gomplate.yaml b/nvidia-device-plugin/values.gomplate.yaml
new file mode 100644
index 0000000..daa5068
--- /dev/null
+++ b/nvidia-device-plugin/values.gomplate.yaml
@@ -0,0 +1,25 @@
+# Enable GPU Feature Discovery
+gfd:
+  enabled: true
+
+# Enable Node Feature Discovery (dependency)
+nfd:
+  enabled: true
+
+# Configure runtime for k3s
+runtimeClassName: "nvidia"
+
+# Enable GPU Time-Slicing for sharing GPUs between workloads
+# This allows multiple pods to use the same physical GPU
+config:
+  map:
+    default: |-
+      version: v1
+      sharing:
+        timeSlicing:
+          renameByDefault: false
+          failRequestsGreaterThanOne: false
+          resources:
+            - name: nvidia.com/gpu
+              replicas: {{ .Env.GPU_TIME_SLICING_REPLICAS }}
+  default: "default"
diff --git a/nvidia-device-plugin/values.yaml b/nvidia-device-plugin/values.yaml
deleted file mode 100644
index 3514ee7..0000000
--- a/nvidia-device-plugin/values.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-# Enable GPU Feature Discovery
-gfd:
-  enabled: true
-
-# Enable Node Feature Discovery (dependency)
-nfd:
-  enabled: true
-
-# Configure runtime for k3s
-runtimeClassName: "nvidia"