feat(nvidia-device-plugin): enable GPU time slicing

This commit is contained in:
Masaki Yatsu
2025-12-03 14:08:22 +09:00
parent 9d6501adae
commit ad04d5774a
4 changed files with 77 additions and 18 deletions

1
nvidia-device-plugin/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
values.generated.yaml

View File

@@ -2,40 +2,74 @@ set fallback := true
# Kubernetes namespace the device plugin is installed into (override via env).
export NVIDIA_DEVICE_PLUGIN_NAMESPACE := env("NVIDIA_DEVICE_PLUGIN_NAMESPACE", "nvidia-device-plugin")
# Helm chart version of nvdp/nvidia-device-plugin to deploy.
export NVIDIA_DEVICE_PLUGIN_VERSION := env("NVIDIA_DEVICE_PLUGIN_VERSION", "0.18.0")
# Number of time-sliced replicas advertised per physical GPU;
# consumed by values.gomplate.yaml via {{ .Env.GPU_TIME_SLICING_REPLICAS }}.
export GPU_TIME_SLICING_REPLICAS := env("GPU_TIME_SLICING_REPLICAS", "4")
# Hidden default recipe: running bare `just` prints the recipe list
# (unsorted, including submodules) instead of executing anything.
[private]
default:
@just --list --unsorted --list-submodules
# Add Helm repository
add-helm-repo:
    #!/bin/bash
    set -euo pipefail
    # Register the nvdp repo only when it is not already present, then refresh it.
    helm repo list | grep -q "^nvdp" || helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
    helm repo update nvdp
# Remove Helm repository
# Note: fails if the nvdp repo is not currently registered.
remove-helm-repo:
helm repo remove nvdp
# Install NVIDIA device plugin for Kubernetes
install:
    #!/bin/bash
    set -euo pipefail
    # Ensure the nvdp chart repo is registered and refreshed
    # (replaces the previously inlined repo-add logic).
    just add-helm-repo
    # Render the time-slicing values from the environment
    # (GPU_TIME_SLICING_REPLICAS) into values.generated.yaml.
    gomplate -f values.gomplate.yaml -o values.generated.yaml
    # values.yaml was removed in favor of the generated file, so only
    # values.generated.yaml is passed to Helm.
    helm upgrade --install nvidia-device-plugin nvdp/nvidia-device-plugin \
        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
        --create-namespace \
        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
        --values values.generated.yaml \
        --wait
    echo ""
    echo "NVIDIA device plugin installed successfully"
    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"
    echo ""
    echo "Verify GPU availability with:"
    echo " just nvidia-device-plugin::verify"
# Upgrade NVIDIA device plugin
upgrade:
#!/bin/bash
set -euo pipefail
# Make sure the nvdp chart repo exists and is refreshed before upgrading.
just add-helm-repo
# Re-render the generated values so env changes (e.g. replica count) take effect.
gomplate -f values.gomplate.yaml -o values.generated.yaml
# Plain `helm upgrade` (no --install): the release must already exist.
helm upgrade nvidia-device-plugin nvdp/nvidia-device-plugin \
--namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
--version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
--values values.generated.yaml \
--wait
echo ""
echo "NVIDIA device plugin upgraded successfully"
echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"
# Verify GPU resources are available
# (recipe continues beyond this diff hunk — remainder not shown here)
verify:
#!/bin/bash
set -euo pipefail
echo "=== GPU Resources per Node ==="
# NOTE(review): the two jq commands below appear to be diff residue (the old
# line and its replacement rendered adjacently); the second variant also
# reports allocatable GPUs alongside capacity — confirm only one is kept.
kubectl get nodes -o json | jq -r '.items[] | "\(.metadata.name): \(.status.capacity["nvidia.com/gpu"] // "0") GPUs"'
kubectl get nodes -o json | jq -r '.items[] | "\(.metadata.name): \(.status.capacity["nvidia.com/gpu"] // "0") GPUs (allocatable: \(.status.allocatable["nvidia.com/gpu"] // "0"))"'
echo ""
echo "=== Device Plugin Pods ==="
@@ -66,10 +100,19 @@ test:
kubectl delete pod gpu-test
# Restart device plugin pods to apply configuration changes
restart:
    #!/bin/bash
    set -euo pipefail
    # Name the target once so both rollout commands stay in sync.
    ds="daemonset/nvidia-device-plugin"
    ns="${NVIDIA_DEVICE_PLUGIN_NAMESPACE}"
    echo "Restarting NVIDIA device plugin pods..."
    kubectl rollout restart "${ds}" -n "${ns}"
    kubectl rollout status "${ds}" -n "${ns}" --timeout=120s
    echo "NVIDIA device plugin restarted"
# Uninstall NVIDIA device plugin
uninstall:
    #!/bin/bash
    set -euo pipefail
    # `|| true` keeps the recipe best-effort when the release is already gone.
    helm uninstall nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} || true
    # Single confirmation line (the diff showed this echo duplicated).
    echo "NVIDIA device plugin uninstalled"

View File

@@ -0,0 +1,25 @@
# Enable GPU Feature Discovery
gfd:
enabled: true
# Enable Node Feature Discovery (dependency)
nfd:
enabled: true
# Configure runtime for k3s
runtimeClassName: "nvidia"
# Enable GPU Time-Slicing for sharing GPUs between workloads
# This allows multiple pods to use the same physical GPU
config:
map:
# Literal (|-) embedded device-plugin config; gomplate substitutes the
# replica count from the GPU_TIME_SLICING_REPLICAS environment variable
# when this template is rendered to values.generated.yaml.
default: |-
version: v1
sharing:
timeSlicing:
renameByDefault: false
failRequestsGreaterThanOne: false
resources:
- name: nvidia.com/gpu
replicas: {{ .Env.GPU_TIME_SLICING_REPLICAS }}
# Select the "default" map entry above as the active configuration.
default: "default"

View File

@@ -1,10 +0,0 @@
# NOTE(review): this is the old static values.yaml, deleted in this commit
# and superseded by the templated values.gomplate.yaml above.
# Enable GPU Feature Discovery
gfd:
enabled: true
# Enable Node Feature Discovery (dependency)
nfd:
enabled: true
# Configure runtime for k3s
runtimeClassName: "nvidia"