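# Justfile for installing and managing the NVIDIA Kubernetes device plugin via Helm,
# with GPU time-slicing enabled so multiple pods can share a single GPU.
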
set fallback := true

export NVIDIA_DEVICE_PLUGIN_NAMESPACE := env("NVIDIA_DEVICE_PLUGIN_NAMESPACE", "nvidia-device-plugin")
export NVIDIA_DEVICE_PLUGIN_VERSION := env("NVIDIA_DEVICE_PLUGIN_VERSION", "0.18.0")
export GPU_TIME_SLICING_REPLICAS := env("GPU_TIME_SLICING_REPLICAS", "4")

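# Note: GPU_TIME_SLICING_REPLICAS sets how many workloads can share each physical
# GPU. With time-slicing enabled, the device plugin advertises every GPU as that
# many nvidia.com/gpu resources, so node capacity reports replicas x physical GPUs.
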
[private]
default:
    @just --list --unsorted --list-submodules

# Add Helm repository
add-helm-repo:
    #!/bin/bash
    set -euo pipefail
    if ! helm repo list | grep -q "^nvdp"; then
        helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
    fi
    helm repo update nvdp

# Remove Helm repository
remove-helm-repo:
    helm repo remove nvdp

# Install NVIDIA device plugin for Kubernetes
install:
    #!/bin/bash
    set -euo pipefail

    just add-helm-repo

    gomplate -f values.gomplate.yaml -o values.generated.yaml

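    # The rendered values are assumed to enable time-slicing roughly as sketched
    # below (illustrative only; the actual template is values.gomplate.yaml):
    #
    #   config:
    #     map:
    #       default: |-
    #         version: v1
    #         sharing:
    #           timeSlicing:
    #             resources:
    #               - name: nvidia.com/gpu
    #                 replicas: ${GPU_TIME_SLICING_REPLICAS}
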
    helm upgrade --install nvidia-device-plugin nvdp/nvidia-device-plugin \
        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
        --create-namespace \
        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
        --values values.generated.yaml \
        --wait

    echo ""
    echo "NVIDIA device plugin installed successfully"
    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"
    echo ""
    echo "Verify GPU availability with:"
    echo " just nvidia-device-plugin::verify"

# Upgrade NVIDIA device plugin
upgrade:
    #!/bin/bash
    set -euo pipefail

    just add-helm-repo

    gomplate -f values.gomplate.yaml -o values.generated.yaml

    helm upgrade nvidia-device-plugin nvdp/nvidia-device-plugin \
        --namespace ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} \
        --version ${NVIDIA_DEVICE_PLUGIN_VERSION} \
        --values values.generated.yaml \
        --wait

    echo ""
    echo "NVIDIA device plugin upgraded successfully"
    echo "GPU Time-Slicing: ${GPU_TIME_SLICING_REPLICAS} replicas per GPU"

# Verify GPU resources are available
verify:
    #!/bin/bash
    set -euo pipefail

    echo "=== GPU Resources per Node ==="
    kubectl get nodes -o json | jq -r '.items[] | "\(.metadata.name): \(.status.capacity["nvidia.com/gpu"] // "0") GPUs (allocatable: \(.status.allocatable["nvidia.com/gpu"] // "0"))"'

    echo ""
    echo "=== Device Plugin Pods ==="
    kubectl get pods -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} -l app.kubernetes.io/name=nvidia-device-plugin

    echo ""
    echo "Test GPU access with:"
    echo " just nvidia-device-plugin::test"

# Show detailed GPU information
gpu-info:
    kubectl get nodes -o json | jq -r '.items[] | select(.status.capacity["nvidia.com/gpu"] != null) | {name: .metadata.name, gpus: .status.capacity["nvidia.com/gpu"], allocatable: .status.allocatable["nvidia.com/gpu"]}'

# Test GPU access by running nvidia-smi in a pod
test:
    #!/bin/bash
    set -euo pipefail

    kubectl delete pod gpu-test --ignore-not-found=true
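    # gpu-test-pod.yaml is assumed to define roughly the following pod
    # (illustrative sketch; the real manifest lives next to this Justfile):
    #
    #   apiVersion: v1
    #   kind: Pod
    #   metadata:
    #     name: gpu-test
    #   spec:
    #     restartPolicy: Never
    #     containers:
    #       - name: nvidia-smi
    #         image: nvidia/cuda:12.4.1-base-ubuntu22.04
    #         command: ["nvidia-smi"]
    #         resources:
    #           limits:
    #             nvidia.com/gpu: 1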
    kubectl apply -f gpu-test-pod.yaml

    echo "Waiting for pod to complete..."
    kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-test --timeout=60s || true

    echo ""
    echo "=== GPU Test Output ==="
    kubectl logs gpu-test

    kubectl delete pod gpu-test

# Restart device plugin pods to apply configuration changes
restart:
    #!/bin/bash
    set -euo pipefail
    echo "Restarting NVIDIA device plugin pods..."
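    # The DaemonSet name is assumed to match the Helm release name
    # (nvidia-device-plugin); adjust if the release is installed under another name.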
    kubectl rollout restart daemonset/nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE}
    kubectl rollout status daemonset/nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} --timeout=120s
    echo "NVIDIA device plugin restarted"

# Uninstall NVIDIA device plugin
uninstall:
    #!/bin/bash
    set -euo pipefail

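    # Removes the Helm release only; the namespace created during install and the
    # locally generated values.generated.yaml are left in place.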
    helm uninstall nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE} || true
    echo "NVIDIA device plugin uninstalled"