feat(nvidia-device-plugin): enable GPU time slicing
This commit is contained in:
1
nvidia-device-plugin/.gitignore
vendored
Normal file
1
nvidia-device-plugin/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
values.generated.yaml
|
||||
# Tunable knobs — all overridable from the environment.
export NVIDIA_DEVICE_PLUGIN_NAMESPACE := env("NVIDIA_DEVICE_PLUGIN_NAMESPACE", "nvidia-device-plugin")
export NVIDIA_DEVICE_PLUGIN_VERSION := env("NVIDIA_DEVICE_PLUGIN_VERSION", "0.18.0")
# How many schedulable replicas each physical GPU is split into (time slicing).
export GPU_TIME_SLICING_REPLICAS := env("GPU_TIME_SLICING_REPLICAS", "4")
# Default goal: show the available recipes (hidden from the list itself).
[private]
default:
    @just --list --unsorted --list-submodules
# Add Helm repository
add-helm-repo:
    #!/bin/bash
    set -euo pipefail
    # Register the NVIDIA chart repo only once, then refresh its index.
    helm repo list | grep -q "^nvdp" || helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
    helm repo update nvdp
# Remove Helm repository
remove-helm-repo:
    helm repo remove nvdp
# Install NVIDIA device plugin for Kubernetes
install:
    #!/bin/bash
    set -euo pipefail

    # Repo setup lives in one place — do not duplicate the add/update logic inline.
    just add-helm-repo

    # Render the chart values from the gomplate template so env vars
    # (e.g. GPU_TIME_SLICING_REPLICAS) are baked into values.generated.yaml.
    gomplate -f values.gomplate.yaml -o values.generated.yaml

    # values.yaml was removed in favor of the generated file; pass only the
    # generated values so install and upgrade stay consistent.
    helm upgrade --install nvidia-device-plugin nvdp/nvidia-device-plugin \
        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
        --create-namespace \
        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
        --values values.generated.yaml \
        --wait

    echo ""
    echo "NVIDIA device plugin installed successfully"
    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"
    echo ""
    echo "Verify GPU availability with:"
    echo "  just nvidia-device-plugin::verify"
# Upgrade NVIDIA device plugin
upgrade:
    #!/bin/bash
    set -euo pipefail

    # Make sure the chart index is current before resolving the version.
    just add-helm-repo

    # Re-render values so any changed env vars take effect on upgrade.
    gomplate -f values.gomplate.yaml -o values.generated.yaml

    helm upgrade nvidia-device-plugin nvdp/nvidia-device-plugin \
        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
        --values values.generated.yaml \
        --wait

    echo ""
    echo "NVIDIA device plugin upgraded successfully"
    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"
# Verify GPU resources are available
|
||||
verify:
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== GPU Resources per Node ==="
|
||||
kubectl get nodes -o json | jq -r '.items[] | "\(.metadata.name): \(.status.capacity["nvidia.com/gpu"] // "0") GPUs"'
|
||||
kubectl get nodes -o json | jq -r '.items[] | "\(.metadata.name): \(.status.capacity["nvidia.com/gpu"] // "0") GPUs (allocatable: \(.status.allocatable["nvidia.com/gpu"] // "0"))"'
|
||||
|
||||
echo ""
|
||||
echo "=== Device Plugin Pods ==="
|
||||
@@ -66,10 +100,19 @@ test:
|
||||
|
||||
kubectl delete pod gpu-test
|
||||
|
||||
# Restart device plugin pods to apply configuration changes
restart:
    #!/bin/bash
    set -euo pipefail
    echo "Restarting NVIDIA device plugin pods..."
    # A rollout restart re-reads the time-slicing ConfigMap without a reinstall.
    kubectl rollout restart daemonset/nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE}
    kubectl rollout status daemonset/nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} --timeout=120s
    echo "NVIDIA device plugin restarted"
# Uninstall NVIDIA device plugin
uninstall:
    #!/bin/bash
    set -euo pipefail

    # `|| true` keeps the recipe idempotent when the release is already gone.
    helm uninstall nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} || true
    echo "NVIDIA device plugin uninstalled"
25
nvidia-device-plugin/values.gomplate.yaml
Normal file
25
nvidia-device-plugin/values.gomplate.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
# Enable GPU Feature Discovery
gfd:
  enabled: true

# Enable Node Feature Discovery (dependency)
nfd:
  enabled: true

# Configure runtime for k3s
runtimeClassName: "nvidia"

# Enable GPU Time-Slicing for sharing GPUs between workloads
# This allows multiple pods to use the same physical GPU
config:
  map:
    # Inline time-slicing config; replicas is injected by gomplate from the
    # GPU_TIME_SLICING_REPLICAS environment variable at render time.
    default: |-
      version: v1
      sharing:
        timeSlicing:
          renameByDefault: false
          failRequestsGreaterThanOne: false
          resources:
            - name: nvidia.com/gpu
              replicas: {{ .Env.GPU_TIME_SLICING_REPLICAS }}
  # Select the "default" entry from the map above as the active config.
  default: "default"
@@ -1,10 +0,0 @@
|
||||
# Enable GPU Feature Discovery
|
||||
gfd:
|
||||
enabled: true
|
||||
|
||||
# Enable Node Feature Discovery (dependency)
|
||||
nfd:
|
||||
enabled: true
|
||||
|
||||
# Configure runtime for k3s
|
||||
runtimeClassName: "nvidia"
|
||||
Reference in New Issue
Block a user