From 585c0f5ba3889bfc6e29b9fb2cb77f2af2b6c85b Mon Sep 17 00:00:00 2001 From: Masaki Yatsu Date: Fri, 21 Nov 2025 00:36:27 +0900 Subject: [PATCH] feat(jupyterhub): GPU support --- jupyterhub/.gitignore | 1 + jupyterhub/README.md | 171 ++++++++++++++++++ .../images/datastack-cuda-notebook/Dockerfile | 6 +- ...ub-crypt-key-external-secret.gomplate.yaml | 18 ++ jupyterhub/jupyterhub-values.gomplate.yaml | 14 +- jupyterhub/justfile | 75 +++++++- 6 files changed, 273 insertions(+), 12 deletions(-) create mode 100644 jupyterhub/jupyterhub-crypt-key-external-secret.gomplate.yaml diff --git a/jupyterhub/.gitignore b/jupyterhub/.gitignore index 29ad6b0..def5bb4 100644 --- a/jupyterhub/.gitignore +++ b/jupyterhub/.gitignore @@ -1,4 +1,5 @@ jupyterhub-values.yaml +jupyterhub-crypt-key-external-secret.yaml pre_spawn_hook.py vault-agent-config.hcl /notebooks/ diff --git a/jupyterhub/README.md b/jupyterhub/README.md index 973b1db..34eb92c 100644 --- a/jupyterhub/README.md +++ b/jupyterhub/README.md @@ -9,6 +9,7 @@ JupyterHub provides a multi-user Jupyter notebook environment with Keycloak OIDC - [Access](#access) - [Kernel Images](#kernel-images) - [Profile Configuration](#profile-configuration) +- [GPU Support](#gpu-support) - [Buun-Stack Images](#buun-stack-images) - [buunstack Package & SecretStore](#buunstack-package--secretstore) - [Vault Integration](#vault-integration) @@ -98,6 +99,176 @@ Available profile variables: Only `JUPYTER_PROFILE_DATASCIENCE_ENABLED` is true by default. +## GPU Support + +JupyterHub supports GPU-accelerated notebooks using NVIDIA GPUs. When the nvidia-device-plugin is detected during installation, you are asked whether to enable GPU support (unless `JUPYTERHUB_GPU_ENABLED` is already set). 
+ +### GPU Prerequisites + +GPU support requires the following components to be installed: + +#### NVIDIA Device Plugin + +Install the NVIDIA device plugin for Kubernetes: + +```bash +just nvidia-device-plugin::install +``` + +This plugin: + +- Exposes NVIDIA GPUs to Kubernetes as schedulable resources +- Manages GPU allocation to pods +- Ensures proper GPU driver access within containers + +#### RuntimeClass Configuration + +The nvidia-device-plugin installation automatically creates the `nvidia` RuntimeClass, which: + +- Configures containerd to use the NVIDIA container runtime +- Enables GPU access for containers using `runtimeClassName: nvidia` + +### Enabling GPU Support + +During JupyterHub installation, you will be prompted: + +```bash +just jupyterhub::install +# When nvidia-device-plugin is installed, you'll see: +# "Enable GPU support for JupyterHub notebooks? (y/N)" +``` + +Alternatively, set the following environment variables before installation to skip the prompt: + +```bash +JUPYTERHUB_GPU_ENABLED=true +JUPYTERHUB_GPU_LIMIT=1 # Number of GPUs per user (default: 1) +``` + +### GPU-Enabled Profiles + +When GPU support is enabled: + +1. **All notebook profiles** get GPU access via `runtimeClassName: nvidia` +2. 
**CUDA-specific profile** (buun-stack-cuda) additionally includes: + - CUDA 12.x toolkit + - PyTorch with CUDA support + - GPU-optimized libraries + +### Usage + +#### Selecting a GPU Profile + +When spawning a notebook, select a profile with GPU capabilities: + +- **Buun-stack with CUDA**: Recommended for GPU workloads (requires custom image) +- **PyTorch**: Standard PyTorch notebook +- **TensorFlow**: Standard TensorFlow notebook + +#### Verifying GPU Access + +In your notebook, verify GPU availability: + +```python +import torch + +# Check if CUDA is available +print(f"CUDA available: {torch.cuda.is_available()}") + +# Get GPU device count +print(f"GPU count: {torch.cuda.device_count()}") + +# Get GPU device name +if torch.cuda.is_available(): + print(f"GPU name: {torch.cuda.get_device_name(0)}") + + # Test GPU operation + torch.cuda.synchronize() + print("GPU is working correctly!") +``` + +#### GPU Configuration + +Default GPU configuration: + +- **GPU limit per user**: 1 GPU (configurable via `JUPYTERHUB_GPU_LIMIT`) +- **Memory requests**: 1Gi (defined in singleuser settings) +- **RuntimeClass**: `nvidia` (automatically applied when GPU enabled) + +### Building GPU-Enabled Custom Images + +If using the buun-stack-cuda profile, build and push the CUDA-enabled image: + +```bash +# Enable CUDA profile +export JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED=true + +# Build CUDA-enabled image (includes PyTorch with CUDA 12.x) +just jupyterhub::build-kernel-images + +# Push to registry +just jupyterhub::push-kernel-images +``` + +The CUDA image: + +- Based on `quay.io/jupyter/pytorch-notebook:x86_64-cuda12-python-3.12.10` +- Includes PyTorch with CUDA 12.4 support (`cu124`) +- Contains all standard buun-stack packages +- Supports GPU-accelerated deep learning + +### Troubleshooting GPU Issues + +#### Pod Not Scheduling + +If GPU-enabled pods fail to schedule: + +```bash +# Check if nvidia-device-plugin is running +kubectl get pods -n nvidia-device-plugin + +# Verify GPU 
resources are advertised +kubectl describe nodes | grep nvidia.com/gpu + +# Check RuntimeClass exists +kubectl get runtimeclass nvidia +``` + +#### CUDA Not Available + +If `torch.cuda.is_available()` returns `False`: + +1. Verify the image has CUDA support: + + ```bash + # In notebook + !nvcc --version # Should show CUDA compiler version + ``` + +2. Check Pod uses nvidia RuntimeClass: + + ```bash + kubectl get pod -n datastack -o yaml | grep runtimeClassName + ``` + +3. Rebuild image if using custom buun-stack-cuda image + +#### GPU Memory Issues + +Monitor GPU usage: + +```python +import torch + +# Check GPU memory +if torch.cuda.is_available(): + print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB") + print(f"Reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB") + + # Clear cache if needed + torch.cuda.empty_cache() +``` + ## Buun-Stack Images Buun-stack images provide comprehensive data science environments with: diff --git a/jupyterhub/images/datastack-cuda-notebook/Dockerfile b/jupyterhub/images/datastack-cuda-notebook/Dockerfile index b9a5fdb..b6426f7 100644 --- a/jupyterhub/images/datastack-cuda-notebook/Dockerfile +++ b/jupyterhub/images/datastack-cuda-notebook/Dockerfile @@ -168,11 +168,11 @@ RUN --mount=type=cache,target=/home/${NB_USER}/.cache/pip pip install -i "${pip_ # && pip uninstall -y pycrdt datalayer_pycrdt \ # && pip install -i "${pip_repository_url}" 'datalayer_pycrdt==0.12.17' -# Install PyTorch with pip (https://pytorch.org/get-started/locally/) +# Install PyTorch with CUDA 12.x support (https://pytorch.org/get-started/locally/) # langchain-openai must be updated to avoid pydantic v2 error -# https://github.com/run-llama/llama_index/issues/16540https://github.com/run-llama/llama_index/issues/16540 +# https://github.com/run-llama/llama_index/issues/16540 # hadolint ignore=DL3013 -RUN pip install --no-cache-dir --index-url 'https://download.pytorch.org/whl/cpu' --upgrade \ +RUN pip install --no-cache-dir --index-url 
'https://download.pytorch.org/whl/cu124' --upgrade \ langchain-openai \ torch \ torchaudio \ diff --git a/jupyterhub/jupyterhub-crypt-key-external-secret.gomplate.yaml b/jupyterhub/jupyterhub-crypt-key-external-secret.gomplate.yaml new file mode 100644 index 0000000..daa44fc --- /dev/null +++ b/jupyterhub/jupyterhub-crypt-key-external-secret.gomplate.yaml @@ -0,0 +1,18 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: jupyterhub-crypt-key + namespace: {{ .Env.JUPYTERHUB_NAMESPACE }} +spec: + refreshInterval: 1h + secretStoreRef: + name: vault-secret-store + kind: ClusterSecretStore + target: + name: jupyterhub-crypt-key + creationPolicy: Owner + data: + - secretKey: crypt-key + remoteRef: + key: jupyterhub/config + property: crypt-key diff --git a/jupyterhub/jupyterhub-values.gomplate.yaml b/jupyterhub/jupyterhub-values.gomplate.yaml index b584d28..bad9851 100644 --- a/jupyterhub/jupyterhub-values.gomplate.yaml +++ b/jupyterhub/jupyterhub-values.gomplate.yaml @@ -1,6 +1,10 @@ hub: extraEnv: - JUPYTERHUB_CRYPT_KEY: {{ .Env.JUPYTERHUB_CRYPT_KEY | quote }} + JUPYTERHUB_CRYPT_KEY: + valueFrom: + secretKeyRef: + name: jupyterhub-crypt-key + key: crypt-key VAULT_ADDR: {{ .Env.VAULT_ADDR | quote }} NOTEBOOK_VAULT_TOKEN_TTL: {{ .Env.NOTEBOOK_VAULT_TOKEN_TTL | quote }} NOTEBOOK_VAULT_TOKEN_MAX_TTL: {{ .Env.NOTEBOOK_VAULT_TOKEN_MAX_TTL | quote }} @@ -173,6 +177,14 @@ singleuser: NOTEBOOK_VAULT_TOKEN_MAX_TTL: "{{ .Env.NOTEBOOK_VAULT_TOKEN_MAX_TTL }}" # JUPYTERHUB_SINGLEUSER_EXTENSION: "0" + {{- if eq .Env.JUPYTERHUB_GPU_ENABLED "true" }} + extraPodConfig: + runtimeClassName: nvidia + extraResource: + limits: + nvidia.com/gpu: "{{ .Env.JUPYTERHUB_GPU_LIMIT }}" + {{- end }} + storage: {{ if env.Getenv "PVC_NAME" -}} type: static diff --git a/jupyterhub/justfile b/jupyterhub/justfile index fdd8650..714c5be 100644 --- a/jupyterhub/justfile +++ b/jupyterhub/justfile @@ -9,7 +9,7 @@ export JUPYTERHUB_NFS_PV_ENABLED := env("JUPYTERHUB_NFS_PV_ENABLED", 
"") export JUPYTERHUB_STORAGE_CLASS := env("JUPYTERHUB_STORAGE_CLASS", "") export JUPYTERHUB_VAULT_INTEGRATION_ENABLED := env("JUPYTERHUB_VAULT_INTEGRATION_ENABLED", "") export JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED := env("JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED", "") -export JUPYTER_PYTHON_KERNEL_TAG := env("JUPYTER_PYTHON_KERNEL_TAG", "python-3.12-50") +export JUPYTER_PYTHON_KERNEL_TAG := env("JUPYTER_PYTHON_KERNEL_TAG", "python-3.12-51") export KERNEL_IMAGE_BUUN_STACK_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_REPOSITORY", "buun-stack-notebook") export KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY", "buun-stack-cuda-notebook") export JUPYTER_PROFILE_MINIMAL_ENABLED := env("JUPYTER_PROFILE_MINIMAL_ENABLED", "false") @@ -20,6 +20,8 @@ export JUPYTER_PROFILE_PYTORCH_ENABLED := env("JUPYTER_PROFILE_PYTORCH_ENABLED", export JUPYTER_PROFILE_TENSORFLOW_ENABLED := env("JUPYTER_PROFILE_TENSORFLOW_ENABLED", "false") export JUPYTER_PROFILE_BUUN_STACK_ENABLED := env("JUPYTER_PROFILE_BUUN_STACK_ENABLED", "false") export JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED := env("JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED", "false") +export JUPYTERHUB_GPU_ENABLED := env("JUPYTERHUB_GPU_ENABLED", "") +export JUPYTERHUB_GPU_LIMIT := env("JUPYTERHUB_GPU_LIMIT", "1") export JUPYTERHUB_VAULT_TOKEN_TTL := env("JUPYTERHUB_VAULT_TOKEN_TTL", "24h") export NOTEBOOK_VAULT_TOKEN_TTL := env("NOTEBOOK_VAULT_TOKEN_TTL", "24h") export NOTEBOOK_VAULT_TOKEN_MAX_TTL := env("NOTEBOOK_VAULT_TOKEN_MAX_TTL", "168h") @@ -38,6 +40,8 @@ export VAULT_ADDR := "https://" + VAULT_HOST export MONITORING_ENABLED := env("MONITORING_ENABLED", "") export PROMETHEUS_NAMESPACE := env("PROMETHEUS_NAMESPACE", "monitoring") export DOCKER_CMD := env("DOCKER_CMD", "docker") +export EXTERNAL_SECRETS_NAMESPACE := env("EXTERNAL_SECRETS_NAMESPACE", "external-secrets") +export K8S_VAULT_NAMESPACE := env("K8S_VAULT_NAMESPACE", "vault") [private] default: @@ -61,6 +65,37 @@ 
create-namespace: delete-namespace: kubectl delete namespace ${JUPYTERHUB_NAMESPACE} --ignore-not-found +# Create JupyterHub crypt key secret +create-crypt-key-secret: + #!/bin/bash + set -euo pipefail + crypt_key=$(just utils::random-password) + + if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then + echo "External Secrets Operator detected. Storing crypt key in Vault..." + just vault::put jupyterhub/config crypt-key="${crypt_key}" + + kubectl delete secret jupyterhub-crypt-key -n ${JUPYTERHUB_NAMESPACE} --ignore-not-found + kubectl delete externalsecret jupyterhub-crypt-key -n ${JUPYTERHUB_NAMESPACE} --ignore-not-found + + gomplate -f jupyterhub-crypt-key-external-secret.gomplate.yaml \ + -o jupyterhub-crypt-key-external-secret.yaml + kubectl apply -f jupyterhub-crypt-key-external-secret.yaml + + echo "Waiting for ExternalSecret to sync..." + kubectl wait --for=condition=Ready externalsecret/jupyterhub-crypt-key \ + -n ${JUPYTERHUB_NAMESPACE} --timeout=60s + else + echo "External Secrets Operator not found. Creating secret directly..." + kubectl delete secret jupyterhub-crypt-key -n ${JUPYTERHUB_NAMESPACE} --ignore-not-found + kubectl create secret generic jupyterhub-crypt-key -n ${JUPYTERHUB_NAMESPACE} \ + --from-literal=crypt-key="${crypt_key}" + + if helm status vault -n ${K8S_VAULT_NAMESPACE} &>/dev/null; then + just vault::put jupyterhub/config crypt-key="${crypt_key}" + fi + fi + # Install JupyterHub install root_token='': #!/bin/bash @@ -73,12 +108,11 @@ install root_token='': ) done - # Generate JUPYTERHUB_CRYPT_KEY if not exists - if [ -z "${JUPYTERHUB_CRYPT_KEY:-}" ]; then - echo "Generating JUPYTERHUB_CRYPT_KEY..." - export JUPYTERHUB_CRYPT_KEY=$(just utils::random-password) - echo "JUPYTERHUB_CRYPT_KEY=${JUPYTERHUB_CRYPT_KEY}" >> ../../.env.local - echo "✓ JUPYTERHUB_CRYPT_KEY generated and saved to .env.local" + just create-namespace + + # Create crypt key secret if it doesn't exist + if ! 
kubectl get secret jupyterhub-crypt-key -n ${JUPYTERHUB_NAMESPACE} &>/dev/null; then + just create-crypt-key-secret fi if helm status kube-prometheus-stack -n ${PROMETHEUS_NAMESPACE} &>/dev/null; then @@ -93,7 +127,25 @@ install root_token='': MONITORING_ENABLED="false" fi - just create-namespace + # Check if nvidia-device-plugin is installed + if helm status nvidia-device-plugin -n ${NVIDIA_DEVICE_PLUGIN_NAMESPACE:-nvidia-device-plugin} &>/dev/null; then + if [ -z "${JUPYTERHUB_GPU_ENABLED}" ]; then + if gum confirm "Enable GPU support for JupyterHub notebooks?"; then + JUPYTERHUB_GPU_ENABLED="true" + if [ -z "${JUPYTERHUB_GPU_LIMIT}" ]; then + JUPYTERHUB_GPU_LIMIT=$( + gum input --prompt="GPU limit per user (default: 1): " --width=100 \ + --placeholder="1" --value="1" + ) + fi + else + JUPYTERHUB_GPU_ENABLED="false" + fi + fi + else + JUPYTERHUB_GPU_ENABLED="false" + fi + # just k8s::copy-regcred ${JUPYTERHUB_NAMESPACE} just keycloak::create-client realm=${KEYCLOAK_REALM} client_id=${JUPYTERHUB_OIDC_CLIENT_ID} \ redirect_url="https://${JUPYTERHUB_HOST}/hub/oauth_callback" \ @@ -216,11 +268,18 @@ uninstall: helm uninstall jupyterhub -n ${JUPYTERHUB_NAMESPACE} --wait --ignore-not-found kubectl delete pods -n ${JUPYTERHUB_NAMESPACE} -l app.kubernetes.io/component=singleuser-server kubectl delete -n ${JUPYTERHUB_NAMESPACE} pvc jupyter-nfs-pvc --ignore-not-found + kubectl delete -n ${JUPYTERHUB_NAMESPACE} secret jupyterhub-crypt-key --ignore-not-found + kubectl delete -n ${JUPYTERHUB_NAMESPACE} externalsecret jupyterhub-crypt-key --ignore-not-found kubectl delete -n ${JUPYTERHUB_NAMESPACE} externalsecret jupyterhub-vault-token --ignore-not-found if kubectl get pv jupyter-nfs-pv &>/dev/null; then kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}' fi + # Clean up Vault entries if present + if helm status vault -n ${K8S_VAULT_NAMESPACE} &>/dev/null; then + just vault::delete jupyterhub/config || true + fi + # Delete JupyterHub PV and StorageClass delete-pv: 
#!/bin/bash