feat(nvidia-device-plugin): enable GPU time slicing
This commit is contained in:
1
nvidia-device-plugin/.gitignore
vendored
Normal file
1
nvidia-device-plugin/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
values.generated.yaml
|
||||
# Tunable knobs — all overridable from the environment.
export NVIDIA_DEVICE_PLUGIN_NAMESPACE := env("NVIDIA_DEVICE_PLUGIN_NAMESPACE", "nvidia-device-plugin")
export NVIDIA_DEVICE_PLUGIN_VERSION := env("NVIDIA_DEVICE_PLUGIN_VERSION", "0.18.0")
# How many schedulable replicas each physical GPU is split into (time slicing).
export GPU_TIME_SLICING_REPLICAS := env("GPU_TIME_SLICING_REPLICAS", "4")
# Default goal: show the available recipes (hidden from the list itself).
[private]
default:
    @just --list --unsorted --list-submodules
# Add Helm repository
add-helm-repo:
    #!/bin/bash
    set -euo pipefail
    # Register the NVIDIA chart repo only once, then refresh its index.
    helm repo list | grep -q "^nvdp" || helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
    helm repo update nvdp
# Remove Helm repository
remove-helm-repo:
    helm repo remove nvdp
# Install NVIDIA device plugin for Kubernetes
install:
    #!/bin/bash
    set -euo pipefail

    # Repo setup lives in one place — do not duplicate the add/update logic inline.
    just add-helm-repo

    # Render the chart values from the gomplate template so env vars
    # (e.g. GPU_TIME_SLICING_REPLICAS) are baked into values.generated.yaml.
    gomplate -f values.gomplate.yaml -o values.generated.yaml

    # values.yaml was removed in favor of the generated file; pass only the
    # generated values so install and upgrade stay consistent.
    helm upgrade --install nvidia-device-plugin nvdp/nvidia-device-plugin \
        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
        --create-namespace \
        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
        --values values.generated.yaml \
        --wait

    echo ""
    echo "NVIDIA device plugin installed successfully"
    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"
    echo ""
    echo "Verify GPU availability with:"
    echo "  just nvidia-device-plugin::verify"
# Upgrade NVIDIA device plugin
upgrade:
    #!/bin/bash
    set -euo pipefail

    # Make sure the chart index is current before resolving the version.
    just add-helm-repo

    # Re-render values so any changed env vars take effect on upgrade.
    gomplate -f values.gomplate.yaml -o values.generated.yaml

    helm upgrade nvidia-device-plugin nvdp/nvidia-device-plugin \
        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
        --values values.generated.yaml \
        --wait

    echo ""
    echo "NVIDIA device plugin upgraded successfully"
    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"
# Verify GPU resources are available
|
||||
verify:
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== GPU Resources per Node ==="
|
||||
kubectl get nodes -o json | jq -r '.items[] | "\(.metadata.name): \(.status.capacity["nvidia.com/gpu"] // "0") GPUs"'
|
||||
kubectl get nodes -o json | jq -r '.items[] | "\(.metadata.name): \(.status.capacity["nvidia.com/gpu"] // "0") GPUs (allocatable: \(.status.allocatable["nvidia.com/gpu"] // "0"))"'
|
||||
|
||||
echo ""
|
||||
echo "=== Device Plugin Pods ==="
|
||||
@@ -66,10 +100,19 @@ test:
|
||||
|
||||
kubectl delete pod gpu-test
|
||||
|
||||
# Restart device plugin pods to apply configuration changes
restart:
    #!/bin/bash
    set -euo pipefail
    echo "Restarting NVIDIA device plugin pods..."
    # A rollout restart re-reads the time-slicing ConfigMap without a reinstall.
    kubectl rollout restart daemonset/nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE}
    kubectl rollout status daemonset/nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} --timeout=120s
    echo "NVIDIA device plugin restarted"
# Uninstall NVIDIA device plugin
uninstall:
    #!/bin/bash
    set -euo pipefail

    # `|| true` keeps the recipe idempotent when the release is already gone.
    helm uninstall nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} || true
    echo "NVIDIA device plugin uninstalled"
25
nvidia-device-plugin/values.gomplate.yaml
Normal file
25
nvidia-device-plugin/values.gomplate.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
# Enable GPU Feature Discovery
gfd:
  enabled: true

# Enable Node Feature Discovery (dependency)
nfd:
  enabled: true

# Configure runtime for k3s
runtimeClassName: "nvidia"

# Enable GPU Time-Slicing for sharing GPUs between workloads
# This allows multiple pods to use the same physical GPU
config:
  map:
    # Inline time-slicing config; replicas is injected by gomplate from the
    # GPU_TIME_SLICING_REPLICAS environment variable at render time.
    default: |-
      version: v1
      sharing:
        timeSlicing:
          renameByDefault: false
          failRequestsGreaterThanOne: false
          resources:
            - name: nvidia.com/gpu
              replicas: {{ .Env.GPU_TIME_SLICING_REPLICAS }}
  # Select the "default" entry from the map above as the active config.
  default: "default"
@@ -1,10 +0,0 @@
|
||||
# Enable GPU Feature Discovery
|
||||
gfd:
|
||||
enabled: true
|
||||
|
||||
# Enable Node Feature Discovery (dependency)
|
||||
nfd:
|
||||
enabled: true
|
||||
|
||||
# Configure runtime for k3s
|
||||
runtimeClassName: "nvidia"
|
||||
Reference in New Issue
Block a user