feat(jupyterhub): GPU support
jupyterhub/.gitignore
@@ -1,4 +1,5 @@
jupyterhub-values.yaml
jupyterhub-crypt-key-external-secret.yaml
pre_spawn_hook.py
vault-agent-config.hcl
/notebooks/
@@ -9,6 +9,7 @@ JupyterHub provides a multi-user Jupyter notebook environment with Keycloak OIDC
- [Access](#access)
- [Kernel Images](#kernel-images)
- [Profile Configuration](#profile-configuration)
- [GPU Support](#gpu-support)
- [Buun-Stack Images](#buun-stack-images)
- [buunstack Package & SecretStore](#buunstack-package--secretstore)
- [Vault Integration](#vault-integration)
@@ -98,6 +99,176 @@ Available profile variables:

Only `JUPYTER_PROFILE_DATASCIENCE_ENABLED` is true by default.

## GPU Support

JupyterHub supports GPU-accelerated notebooks using NVIDIA GPUs. GPU support is enabled automatically during installation when the nvidia-device-plugin is detected.

### GPU Prerequisites

GPU support requires the following components to be installed:

#### NVIDIA Device Plugin

Install the NVIDIA device plugin for Kubernetes:
```bash
just nvidia-device-plugin::install
```

This plugin:

- Exposes NVIDIA GPUs to Kubernetes as schedulable resources
- Manages GPU allocation to pods
- Ensures proper GPU driver access within containers
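For reference, a workload consumes the schedulable resource the plugin advertises by requesting `nvidia.com/gpu` in its pod spec. A minimal sketch (the pod name and image tag are illustrative, not part of buun-stack):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test            # illustrative name
spec:
  runtimeClassName: nvidia        # use the NVIDIA container runtime
  restartPolicy: Never
  containers:
    - name: cuda
      image: nvidia/cuda:12.4.1-base-ubuntu22.04   # assumed public CUDA base image
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: "1"     # resource exposed by the device plugin
```

If the plugin is healthy, this pod schedules onto a GPU node and prints the `nvidia-smi` table in its logs.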
#### RuntimeClass Configuration

The nvidia-device-plugin installation automatically creates the `nvidia` RuntimeClass, which:

- Configures containerd to use the NVIDIA container runtime
- Enables GPU access for containers using `runtimeClassName: nvidia`
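The created RuntimeClass is equivalent to the following manifest (shown for reference only; the plugin install manages it for you):

```yaml
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia   # containerd runtime handler set up by the NVIDIA container toolkit
```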
### Enabling GPU Support

During JupyterHub installation, you will be prompted:

```bash
just jupyterhub::install
# When nvidia-device-plugin is installed, you'll see:
# "Enable GPU support for JupyterHub notebooks? (y/N)"
```

Alternatively, set the environment variables before installation:

```bash
export JUPYTERHUB_GPU_ENABLED=true
export JUPYTERHUB_GPU_LIMIT=1  # Number of GPUs per user (default: 1)
```
### GPU-Enabled Profiles

When GPU support is enabled:

1. **All notebook profiles** get GPU access via `runtimeClassName: nvidia`
2. **CUDA-specific profile** (buun-stack-cuda) additionally includes:
   - CUDA 12.x toolkit
   - PyTorch with CUDA support
   - GPU-optimized libraries
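Under the hood, enabling GPU support renders the following into the singleuser section of the Helm values (the `"1"` reflects the default `JUPYTERHUB_GPU_LIMIT`):

```yaml
singleuser:
  extraPodConfig:
    runtimeClassName: nvidia      # applied to every spawned notebook pod
  extraResource:
    limits:
      nvidia.com/gpu: "1"         # per-user GPU limit
```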
### Usage

#### Selecting a GPU Profile

When spawning a notebook, select a profile with GPU capabilities:

- **Buun-stack with CUDA**: Recommended for GPU workloads (requires custom image)
- **PyTorch**: Standard PyTorch notebook
- **TensorFlow**: Standard TensorFlow notebook

#### Verifying GPU Access

In your notebook, verify GPU availability:
```python
import torch

# Check if CUDA is available
print(f"CUDA available: {torch.cuda.is_available()}")

# Get GPU device count
print(f"GPU count: {torch.cuda.device_count()}")

if torch.cuda.is_available():
    # Get GPU device name
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

    # Test GPU operation (synchronize raises if CUDA is unavailable)
    torch.cuda.synchronize()
    print("GPU is working correctly!")
```
#### GPU Configuration

Default GPU configuration:

- **GPU limit per user**: 1 GPU (configurable via `JUPYTERHUB_GPU_LIMIT`)
- **Memory requests**: 1Gi (defined in singleuser settings)
- **RuntimeClass**: `nvidia` (automatically applied when GPU support is enabled)
### Building GPU-Enabled Custom Images

If using the buun-stack-cuda profile, build and push the CUDA-enabled image:

```bash
# Enable CUDA profile
export JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED=true

# Build CUDA-enabled image (includes PyTorch with CUDA 12.x)
just jupyterhub::build-kernel-images

# Push to registry
just jupyterhub::push-kernel-images
```

The CUDA image:

- Is based on `quay.io/jupyter/pytorch-notebook:x86_64-cuda12-python-3.12.10`
- Includes PyTorch with CUDA 12.4 support (`cu124`)
- Contains all standard buun-stack packages
- Supports GPU-accelerated deep learning
### Troubleshooting GPU Issues

#### Pod Not Scheduling

If GPU-enabled pods fail to schedule:

```bash
# Check if nvidia-device-plugin is running
kubectl get pods -n nvidia-device-plugin

# Verify GPU resources are advertised
kubectl describe nodes | grep nvidia.com/gpu

# Check that the RuntimeClass exists
kubectl get runtimeclass nvidia
```

#### CUDA Not Available

If `torch.cuda.is_available()` returns `False`:

1. Verify the image has CUDA support:

   ```bash
   # In notebook
   !nvcc --version  # Should show the CUDA compiler version
   ```

2. Check that the pod uses the nvidia RuntimeClass:

   ```bash
   kubectl get pod <pod-name> -n datastack -o yaml | grep runtimeClassName
   ```

3. Rebuild the image if you are using a custom buun-stack-cuda image

#### GPU Memory Issues

Monitor GPU usage:

```python
import torch

if torch.cuda.is_available():
    # Check GPU memory
    print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"Reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

    # Clear cache if needed
    torch.cuda.empty_cache()
```
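The `/ 1024**3` divisions above report binary gigabytes (GiB). As a tiny illustrative helper (not part of buun-stack), the conversion can be factored out:

```python
def bytes_to_gib(n: int) -> float:
    """Convert a raw byte count to GiB (same 1024**3 divisor as above)."""
    return n / 1024**3

# 8 GiB worth of bytes round-trips to 8.00
print(f"{bytes_to_gib(8 * 1024**3):.2f} GiB")
```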
## Buun-Stack Images

Buun-stack images provide comprehensive data science environments with:
@@ -168,11 +168,11 @@ RUN --mount=type=cache,target=/home/${NB_USER}/.cache/pip pip install -i "${pip_
 # && pip uninstall -y pycrdt datalayer_pycrdt \
 # && pip install -i "${pip_repository_url}" 'datalayer_pycrdt==0.12.17'

-# Install PyTorch with pip (https://pytorch.org/get-started/locally/)
+# Install PyTorch with CUDA 12.x support (https://pytorch.org/get-started/locally/)
 # langchain-openai must be updated to avoid pydantic v2 error
 # https://github.com/run-llama/llama_index/issues/16540
 # hadolint ignore=DL3013
-RUN pip install --no-cache-dir --index-url 'https://download.pytorch.org/whl/cpu' --upgrade \
+RUN pip install --no-cache-dir --index-url 'https://download.pytorch.org/whl/cu124' --upgrade \
     langchain-openai \
     torch \
     torchaudio \
@@ -0,0 +1,18 @@
+apiVersion: external-secrets.io/v1
+kind: ExternalSecret
+metadata:
+  name: jupyterhub-crypt-key
+  namespace: {{ .Env.JUPYTERHUB_NAMESPACE }}
+spec:
+  refreshInterval: 1h
+  secretStoreRef:
+    name: vault-secret-store
+    kind: ClusterSecretStore
+  target:
+    name: jupyterhub-crypt-key
+    creationPolicy: Owner
+  data:
+    - secretKey: crypt-key
+      remoteRef:
+        key: jupyterhub/config
+        property: crypt-key
@@ -1,6 +1,10 @@
 hub:
   extraEnv:
-    JUPYTERHUB_CRYPT_KEY: {{ .Env.JUPYTERHUB_CRYPT_KEY | quote }}
+    JUPYTERHUB_CRYPT_KEY:
+      valueFrom:
+        secretKeyRef:
+          name: jupyterhub-crypt-key
+          key: crypt-key
     VAULT_ADDR: {{ .Env.VAULT_ADDR | quote }}
     NOTEBOOK_VAULT_TOKEN_TTL: {{ .Env.NOTEBOOK_VAULT_TOKEN_TTL | quote }}
     NOTEBOOK_VAULT_TOKEN_MAX_TTL: {{ .Env.NOTEBOOK_VAULT_TOKEN_MAX_TTL | quote }}
@@ -173,6 +177,14 @@ singleuser:
     NOTEBOOK_VAULT_TOKEN_MAX_TTL: "{{ .Env.NOTEBOOK_VAULT_TOKEN_MAX_TTL }}"
     # JUPYTERHUB_SINGLEUSER_EXTENSION: "0"

+  {{- if eq .Env.JUPYTERHUB_GPU_ENABLED "true" }}
+  extraPodConfig:
+    runtimeClassName: nvidia
+  extraResource:
+    limits:
+      nvidia.com/gpu: "{{ .Env.JUPYTERHUB_GPU_LIMIT }}"
+  {{- end }}
+
   storage:
   {{ if env.Getenv "PVC_NAME" -}}
     type: static
@@ -9,7 +9,7 @@ export JUPYTERHUB_NFS_PV_ENABLED := env("JUPYTERHUB_NFS_PV_ENABLED", "")
 export JUPYTERHUB_STORAGE_CLASS := env("JUPYTERHUB_STORAGE_CLASS", "")
 export JUPYTERHUB_VAULT_INTEGRATION_ENABLED := env("JUPYTERHUB_VAULT_INTEGRATION_ENABLED", "")
 export JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED := env("JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED", "")
-export JUPYTER_PYTHON_KERNEL_TAG := env("JUPYTER_PYTHON_KERNEL_TAG", "python-3.12-50")
+export JUPYTER_PYTHON_KERNEL_TAG := env("JUPYTER_PYTHON_KERNEL_TAG", "python-3.12-51")
 export KERNEL_IMAGE_BUUN_STACK_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_REPOSITORY", "buun-stack-notebook")
 export KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY", "buun-stack-cuda-notebook")
 export JUPYTER_PROFILE_MINIMAL_ENABLED := env("JUPYTER_PROFILE_MINIMAL_ENABLED", "false")
@@ -20,6 +20,8 @@ export JUPYTER_PROFILE_PYTORCH_ENABLED := env("JUPYTER_PROFILE_PYTORCH_ENABLED",
 export JUPYTER_PROFILE_TENSORFLOW_ENABLED := env("JUPYTER_PROFILE_TENSORFLOW_ENABLED", "false")
 export JUPYTER_PROFILE_BUUN_STACK_ENABLED := env("JUPYTER_PROFILE_BUUN_STACK_ENABLED", "false")
 export JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED := env("JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED", "false")
+export JUPYTERHUB_GPU_ENABLED := env("JUPYTERHUB_GPU_ENABLED", "")
+export JUPYTERHUB_GPU_LIMIT := env("JUPYTERHUB_GPU_LIMIT", "1")
 export JUPYTERHUB_VAULT_TOKEN_TTL := env("JUPYTERHUB_VAULT_TOKEN_TTL", "24h")
 export NOTEBOOK_VAULT_TOKEN_TTL := env("NOTEBOOK_VAULT_TOKEN_TTL", "24h")
 export NOTEBOOK_VAULT_TOKEN_MAX_TTL := env("NOTEBOOK_VAULT_TOKEN_MAX_TTL", "168h")
@@ -38,6 +40,8 @@ export VAULT_ADDR := "https://" + VAULT_HOST
 export MONITORING_ENABLED := env("MONITORING_ENABLED", "")
 export PROMETHEUS_NAMESPACE := env("PROMETHEUS_NAMESPACE", "monitoring")
 export DOCKER_CMD := env("DOCKER_CMD", "docker")
+export EXTERNAL_SECRETS_NAMESPACE := env("EXTERNAL_SECRETS_NAMESPACE", "external-secrets")
+export K8S_VAULT_NAMESPACE := env("K8S_VAULT_NAMESPACE", "vault")

 [private]
 default:
@@ -61,6 +65,37 @@ create-namespace:
 delete-namespace:
     kubectl delete namespace ${JUPYTERHUB_NAMESPACE} --ignore-not-found

+# Create JupyterHub crypt key secret
+create-crypt-key-secret:
+    #!/bin/bash
+    set -euo pipefail
+    crypt_key=$(just utils::random-password)
+
+    if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
+        echo "External Secrets Operator detected. Storing crypt key in Vault..."
+        just vault::put jupyterhub/config crypt-key="${crypt_key}"
+
+        kubectl delete secret jupyterhub-crypt-key -n ${JUPYTERHUB_NAMESPACE} --ignore-not-found
+        kubectl delete externalsecret jupyterhub-crypt-key -n ${JUPYTERHUB_NAMESPACE} --ignore-not-found
+
+        gomplate -f jupyterhub-crypt-key-external-secret.gomplate.yaml \
+            -o jupyterhub-crypt-key-external-secret.yaml
+        kubectl apply -f jupyterhub-crypt-key-external-secret.yaml
+
+        echo "Waiting for ExternalSecret to sync..."
+        kubectl wait --for=condition=Ready externalsecret/jupyterhub-crypt-key \
+            -n ${JUPYTERHUB_NAMESPACE} --timeout=60s
+    else
+        echo "External Secrets Operator not found. Creating secret directly..."
+        kubectl delete secret jupyterhub-crypt-key -n ${JUPYTERHUB_NAMESPACE} --ignore-not-found
+        kubectl create secret generic jupyterhub-crypt-key -n ${JUPYTERHUB_NAMESPACE} \
+            --from-literal=crypt-key="${crypt_key}"
+
+        if helm status vault -n ${K8S_VAULT_NAMESPACE} &>/dev/null; then
+            just vault::put jupyterhub/config crypt-key="${crypt_key}"
+        fi
+    fi
+
 # Install JupyterHub
 install root_token='':
     #!/bin/bash
@@ -73,12 +108,11 @@ install root_token='':
         )
     done

-    # Generate JUPYTERHUB_CRYPT_KEY if not exists
-    if [ -z "${JUPYTERHUB_CRYPT_KEY:-}" ]; then
-        echo "Generating JUPYTERHUB_CRYPT_KEY..."
-        export JUPYTERHUB_CRYPT_KEY=$(just utils::random-password)
-        echo "JUPYTERHUB_CRYPT_KEY=${JUPYTERHUB_CRYPT_KEY}" >> ../../.env.local
-        echo "✓ JUPYTERHUB_CRYPT_KEY generated and saved to .env.local"
-    fi
+    just create-namespace

+    # Create crypt key secret if it doesn't exist
+    if ! kubectl get secret jupyterhub-crypt-key -n ${JUPYTERHUB_NAMESPACE} &>/dev/null; then
+        just create-crypt-key-secret
+    fi

     if helm status kube-prometheus-stack -n ${PROMETHEUS_NAMESPACE} &>/dev/null; then
@@ -93,7 +127,25 @@ install root_token='':
         MONITORING_ENABLED="false"
     fi

-    just create-namespace
+    # Check if nvidia-device-plugin is installed
+    if helm status nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE:-nvidia-device-plugin} &>/dev/null; then
+        if [ -z "${JUPYTERHUB_GPU_ENABLED}" ]; then
+            if gum confirm "Enable GPU support for JupyterHub notebooks?"; then
+                JUPYTERHUB_GPU_ENABLED="true"
+                if [ -z "${JUPYTERHUB_GPU_LIMIT}" ]; then
+                    JUPYTERHUB_GPU_LIMIT=$(
+                        gum input --prompt="GPU limit per user (default: 1): " --width=100 \
+                            --placeholder="1" --value="1"
+                    )
+                fi
+            else
+                JUPYTERHUB_GPU_ENABLED="false"
+            fi
+        fi
+    else
+        JUPYTERHUB_GPU_ENABLED="false"
+    fi

     # just k8s::copy-regcred ${JUPYTERHUB_NAMESPACE}
     just keycloak::create-client realm=${KEYCLOAK_REALM} client_id=${JUPYTERHUB_OIDC_CLIENT_ID} \
         redirect_url="https://${JUPYTERHUB_HOST}/hub/oauth_callback" \
@@ -216,11 +268,18 @@ uninstall:
     helm uninstall jupyterhub -n ${JUPYTERHUB_NAMESPACE} --wait --ignore-not-found
     kubectl delete pods -n ${JUPYTERHUB_NAMESPACE} -l app.kubernetes.io/component=singleuser-server
     kubectl delete -n ${JUPYTERHUB_NAMESPACE} pvc jupyter-nfs-pvc --ignore-not-found
+    kubectl delete -n ${JUPYTERHUB_NAMESPACE} secret jupyterhub-crypt-key --ignore-not-found
+    kubectl delete -n ${JUPYTERHUB_NAMESPACE} externalsecret jupyterhub-crypt-key --ignore-not-found
     kubectl delete -n ${JUPYTERHUB_NAMESPACE} externalsecret jupyterhub-vault-token --ignore-not-found
     if kubectl get pv jupyter-nfs-pv &>/dev/null; then
        kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}'
     fi

+    # Clean up Vault entries if present
+    if helm status vault -n ${K8S_VAULT_NAMESPACE} &>/dev/null; then
+        just vault::delete jupyterhub/config || true
+    fi
+
 # Delete JupyterHub PV and StorageClass
 delete-pv:
     #!/bin/bash