fix(jupyterhub): admin vault token renewal

This commit is contained in:
Masaki Yatsu
2025-09-08 18:43:08 +09:00
parent c82c6aa22b
commit 4df776c181
6 changed files with 322 additions and 127 deletions

View File

@@ -1,3 +1,4 @@
jupyterhub-values.yaml
pre_spawn_hook.py
vault-agent-config.hcl
/notebooks/

View File

@@ -23,6 +23,11 @@ hub:
mode: 0644
stringData: |
{{ .Env.USER_POLICY_HCL | strings.Indent 8 }}
pre_spawn_hook.py:
mountPath: /srv/jupyterhub/pre_spawn_hook.py
mode: 0644
stringData: |
{{ file.Read "pre_spawn_hook.py" | strings.Indent 8 }}
# Override the default command to run our startup script first
command:
@@ -56,112 +61,10 @@ hub:
- email
extraConfig:
pre-spawn-hook: |
# Set environment variables for spawned containers
import hvac
{{- if eq .Env.JUPYTERHUB_VAULT_INTEGRATION_ENABLED "true" }}
def get_vault_token():
"""Read Vault token from file"""
import os
token_file = '/vault/secrets/vault-token'
try:
with open(token_file, 'r') as f:
token = f.read().strip()
if token:
return token
except FileNotFoundError:
print(f"Token file not found: {token_file}")
except Exception as e:
print(f"Error reading token file {token_file}: {e}")
return None
{{- end }}
async def pre_spawn_hook(spawner):
"""Set essential environment variables for spawned containers"""
# PostgreSQL configuration
spawner.environment["POSTGRES_HOST"] = "postgres-cluster-rw.postgres"
spawner.environment["POSTGRES_PORT"] = "5432"
# JupyterHub API configuration
spawner.environment["JUPYTERHUB_API_URL"] = "http://hub:8081/hub/api"
# Logging configuration
spawner.environment["BUUNSTACK_LOG_LEVEL"] = "{{ .Env.JUPYTER_BUUNSTACK_LOG_LEVEL }}"
{{- if eq .Env.JUPYTERHUB_VAULT_INTEGRATION_ENABLED "true" }}
# Create user-specific Vault token directly
try:
username = spawner.user.name
# Step 1: Initialize admin Vault client with file-based token
import os
vault_addr = os.environ.get("VAULT_ADDR", "{{ .Env.VAULT_ADDR }}")
vault_token = get_vault_token()
spawner.log.info(f"pre_spawn_hook starting for {username}")
spawner.log.info(f"Vault address: {vault_addr}")
spawner.log.info(f"Vault token source: {'file' if os.path.exists('/vault/secrets/vault-token') else 'env'}")
spawner.log.info(f"Vault token present: {bool(vault_token)}, length: {len(vault_token) if vault_token else 0}")
if not vault_token:
raise Exception("No Vault token available from file or environment")
vault_client = hvac.Client(url=vault_addr, verify=False)
vault_client.token = vault_token
if not vault_client.is_authenticated():
raise Exception("Admin token is not authenticated")
# Step 2: Create user-specific policy
user_policy_name = "jupyter-user-{}".format(username)
# Read policy template from file
import os
policy_template_path = "/srv/jupyterhub/user_policy.hcl"
with open(policy_template_path, 'r') as f:
policy_template = f.read()
# Replace {username} placeholder with actual username
user_policy = policy_template.replace("{username}", username)
# Write user-specific policy
try:
vault_client.sys.create_or_update_policy(user_policy_name, user_policy)
spawner.log.info("✅ Created policy: {}".format(user_policy_name))
except Exception as policy_e:
spawner.log.warning("Policy creation failed (may already exist): {}".format(policy_e))
# Step 3: Create user-specific token
# Get TTL settings from environment variables
user_token_ttl = os.environ.get("NOTEBOOK_VAULT_TOKEN_TTL", "24h")
user_token_max_ttl = os.environ.get("NOTEBOOK_VAULT_TOKEN_MAX_TTL", "168h")
token_response = vault_client.auth.token.create_orphan(
policies=[user_policy_name],
ttl=user_token_ttl,
renewable=True,
display_name="notebook-{}".format(username),
explicit_max_ttl=user_token_max_ttl
)
user_vault_token = token_response["auth"]["client_token"]
lease_duration = token_response["auth"].get("lease_duration", 3600)
# Set user-specific Vault token as environment variable
spawner.environment["NOTEBOOK_VAULT_TOKEN"] = user_vault_token
spawner.log.info("✅ User-specific Vault token created for {} (TTL: {}s, renewable, max TTL: {})".format(username, lease_duration, user_token_max_ttl))
except Exception as e:
spawner.log.error("Failed to create user-specific Vault token for {}: {}".format(spawner.user.name, e))
import traceback
spawner.log.error("Full traceback: {}".format(traceback.format_exc()))
{{- end }}
c.KubeSpawner.pre_spawn_hook = pre_spawn_hook
load-pre-spawn-hook: |
# Load pre_spawn_hook from external file
with open('/srv/jupyterhub/pre_spawn_hook.py', 'r') as f:
exec(f.read())
{{- if eq .Env.JUPYTERHUB_VAULT_INTEGRATION_ENABLED "true" }}
# Vault Agent sidecar configuration
@@ -186,7 +89,7 @@ hub:
extraContainers:
- name: vault-agent
image: hashicorp/vault:1.15.2
image: hashicorp/vault:1.17.5
securityContext:
runAsUser: 100
runAsGroup: 101
@@ -205,6 +108,8 @@ hub:
env:
- name: VAULT_ADDR
value: {{ .Env.VAULT_ADDR | quote }}
- name: JUPYTERHUB_VAULT_TOKEN_TTL
value: {{ .Env.JUPYTERHUB_VAULT_TOKEN_TTL | quote }}
volumeMounts:
- name: vault-secrets
mountPath: /vault/secrets
@@ -378,7 +283,12 @@ cull:
# timeout: 300 # 5 minutes idle timeout (for testing)
# every: 60 # Check every 1 minute (for testing)
# maxAge: 86400 # Maximum age of a server pod (1 day)
# Maximum age of a server pod before forced restart
# IMPORTANT: This must be less than NOTEBOOK_VAULT_TOKEN_MAX_TTL, otherwise a long-running pod outlives its Vault token
# - NOTEBOOK_VAULT_TOKEN_MAX_TTL: {{ .Env.NOTEBOOK_VAULT_TOKEN_MAX_TTL }} (default: 168h = 7 days = 604800s)
# - JUPYTERHUB_CULL_MAX_AGE: {{ .Env.JUPYTERHUB_CULL_MAX_AGE }}s (default: 518400s = 6 days)
# Restarting the pod issues a fresh user token before the old one reaches its max TTL
maxAge: {{ .Env.JUPYTERHUB_CULL_MAX_AGE }}
adminUsers: true # Also cull admin users' server pods
users: false # Don't delete user accounts, only stop server pods

View File

@@ -22,6 +22,7 @@ export JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED := env("JUPYTER_PROFILE_BUUN_STAC
export JUPYTERHUB_VAULT_TOKEN_TTL := env("JUPYTERHUB_VAULT_TOKEN_TTL", "24h")
export NOTEBOOK_VAULT_TOKEN_TTL := env("NOTEBOOK_VAULT_TOKEN_TTL", "24h")
export NOTEBOOK_VAULT_TOKEN_MAX_TTL := env("NOTEBOOK_VAULT_TOKEN_MAX_TTL", "168h")
export JUPYTERHUB_CULL_MAX_AGE := env("JUPYTERHUB_CULL_MAX_AGE", "518400")
export VAULT_AGENT_LOG_LEVEL := env("VAULT_AGENT_LOG_LEVEL", "info")
export JUPYTER_BUUNSTACK_LOG_LEVEL := env("JUPYTER_BUUNSTACK_LOG_LEVEL", "warning")
export IMAGE_REGISTRY := env("IMAGE_REGISTRY", "localhost:30500")
@@ -146,6 +147,10 @@ install root_token='':
export USER_POLICY_HCL=""
fi
# Generate pre_spawn_hook.py
echo "Generating pre_spawn_hook.py..."
gomplate -f pre_spawn_hook.gomplate.py -o pre_spawn_hook.py
# https://z2jh.jupyter.org/en/stable/
gomplate -f jupyterhub-values.gomplate.yaml -o jupyterhub-values.yaml
@@ -261,7 +266,7 @@ setup-vault-integration root_token='':
echo " User Token TTL: ${NOTEBOOK_VAULT_TOKEN_TTL}"
echo " User Token Max TTL: ${NOTEBOOK_VAULT_TOKEN_MAX_TTL}"
echo " Vault Agent Log Level: ${VAULT_AGENT_LOG_LEVEL}"
echo " Auto-renewal: Every $(( $(echo ${JUPYTERHUB_VAULT_TOKEN_TTL} | sed 's/m/*60/g; s/h/*3600/g; s/s//g' | bc) / 2 ))s (TTL/2)"
echo " Auto-renewal: Every TTL/2 (minimum 30s) based on actual token TTL"
echo ""
echo "Users can now access Vault from notebooks using:"
echo " from buunstack import SecretStore"
@@ -295,10 +300,10 @@ create-jupyterhub-vault-token root_token='':
# Create admin vault token with unlimited max TTL
echo ""
echo "Creating admin token (TTL: 24h, Max TTL: unlimited)..."
echo "Creating admin token (TTL: ${JUPYTERHUB_VAULT_TOKEN_TTL}, Max TTL: unlimited)..."
TOKEN_RESPONSE=$(vault token create \
-policy=jupyterhub-admin \
-ttl=24h \
-ttl=${JUPYTERHUB_VAULT_TOKEN_TTL} \
-explicit-max-ttl=0 \
-display-name="jupyterhub-admin" \
-renewable=true \
@@ -320,9 +325,9 @@ create-jupyterhub-vault-token root_token='':
echo "✅ Admin token created and stored successfully!"
echo ""
echo "Token behavior:"
echo " - TTL: 24 hours (will expire in 24h without renewal)"
echo " - TTL: ${JUPYTERHUB_VAULT_TOKEN_TTL} (will expire without renewal)"
echo " - Max TTL: Unlimited (can be renewed forever)"
echo " - Vault Agent will renew every 12 hours"
echo " - Vault Agent will renew at TTL/2 intervals (minimum 30s)"
echo " - No more 30-day limitation!"
echo ""
echo "Token stored at: secret/jupyterhub/vault-token"

View File

@@ -0,0 +1,105 @@
# JupyterHub pre_spawn_hook
# Sets up user environment and creates user-specific Vault tokens
import hvac
import os
{{- if eq .Env.JUPYTERHUB_VAULT_INTEGRATION_ENABLED "true" }}
def get_vault_token():
    """Return the admin Vault token stored on disk, or ``None``.

    Reads ``/vault/secrets/vault-token`` and strips surrounding whitespace.
    A missing file, any other read error, or an empty file all yield
    ``None``; the two error cases also print a diagnostic line.
    """
    token_file = '/vault/secrets/vault-token'
    try:
        with open(token_file, 'r') as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"Token file not found: {token_file}")
        return None
    except Exception as e:
        # Best-effort read: report the problem and let the caller decide.
        print(f"Error reading token file {token_file}: {e}")
        return None

    stripped = contents.strip()
    return stripped if stripped else None
{{- end }}
async def pre_spawn_hook(spawner):
"""Set essential environment variables for spawned containers.

Always sets the PostgreSQL, JupyterHub API and log-level variables on
``spawner.environment``.

When the template is rendered with JUPYTERHUB_VAULT_INTEGRATION_ENABLED
equal to "true", the hook additionally:

1. authenticates to Vault with the admin token read by get_vault_token(),
2. writes a per-user policy named ``jupyter-user-<username>`` from the
   template at /srv/jupyterhub/user_policy.hcl,
3. creates a renewable orphan token bound to that policy and exposes it to
   the notebook as ``NOTEBOOK_VAULT_TOKEN``.

Any exception raised in the Vault section is logged and swallowed, so the
spawn continues without ``NOTEBOOK_VAULT_TOKEN`` in that case.
"""
# PostgreSQL configuration
spawner.environment["POSTGRES_HOST"] = "postgres-cluster-rw.postgres"
spawner.environment["POSTGRES_PORT"] = "5432"
# JupyterHub API configuration
spawner.environment["JUPYTERHUB_API_URL"] = "http://hub:8081/hub/api"
# Logging configuration
spawner.environment["BUUNSTACK_LOG_LEVEL"] = "{{ .Env.JUPYTER_BUUNSTACK_LOG_LEVEL }}"
{{- if eq .Env.JUPYTERHUB_VAULT_INTEGRATION_ENABLED "true" }}
# Create user-specific Vault token directly
# NOTE(review): the hvac calls below are not awaited; presumably they are
# synchronous and block the hub's event loop while they run — confirm this
# is acceptable.
try:
username = spawner.user.name
# Step 1: Initialize admin Vault client with file-based token
# The address rendered into this file is only used when VAULT_ADDR is not
# set in the hub's environment.
vault_addr = os.environ.get("VAULT_ADDR", "{{ .Env.VAULT_ADDR }}")
vault_token = get_vault_token()
spawner.log.info(f"pre_spawn_hook starting for {username}")
spawner.log.info(f"Vault address: {vault_addr}")
# NOTE(review): get_vault_token() only reads the token file, so the 'env'
# label here and the "or environment" wording in the exception below do
# not correspond to an actual environment fallback.
spawner.log.info(f"Vault token source: {'file' if os.path.exists('/vault/secrets/vault-token') else 'env'}")
spawner.log.info(f"Vault token present: {bool(vault_token)}, length: {len(vault_token) if vault_token else 0}")
if not vault_token:
raise Exception("No Vault token available from file or environment")
# NOTE(review): verify=False presumably disables TLS certificate
# verification for the Vault connection — confirm this is intentional.
vault_client = hvac.Client(url=vault_addr, verify=False)
vault_client.token = vault_token
if not vault_client.is_authenticated():
raise Exception("Admin token is not authenticated")
# Step 2: Create user-specific policy
user_policy_name = "jupyter-user-{}".format(username)
# Read policy template from file
policy_template_path = "/srv/jupyterhub/user_policy.hcl"
with open(policy_template_path, 'r') as f:
policy_template = f.read()
# Replace {username} placeholder with actual username
# Plain text substitution: the username is inserted into the policy as-is.
user_policy = policy_template.replace("{username}", username)
# Write user-specific policy
# A failure here is only logged as a warning; token creation below is
# still attempted with the same policy name.
try:
vault_client.sys.create_or_update_policy(user_policy_name, user_policy)
spawner.log.info("✅ Created policy: {}".format(user_policy_name))
except Exception as policy_e:
spawner.log.warning("Policy creation failed (may already exist): {}".format(policy_e))
# Step 3: Create user-specific token
# Get TTL settings from environment variables
# Both values are read from the hub process environment at spawn time and
# default to 24h / 168h when unset.
user_token_ttl = os.environ.get("NOTEBOOK_VAULT_TOKEN_TTL", "24h")
user_token_max_ttl = os.environ.get("NOTEBOOK_VAULT_TOKEN_MAX_TTL", "168h")
token_response = vault_client.auth.token.create_orphan(
policies=[user_policy_name],
ttl=user_token_ttl,
renewable=True,
display_name="notebook-{}".format(username),
explicit_max_ttl=user_token_max_ttl
)
user_vault_token = token_response["auth"]["client_token"]
# Only used for the log line below; 3600 is reported when the response
# carries no lease_duration.
lease_duration = token_response["auth"].get("lease_duration", 3600)
# Set user-specific Vault token as environment variable
spawner.environment["NOTEBOOK_VAULT_TOKEN"] = user_vault_token
spawner.log.info("✅ User-specific Vault token created for {} (TTL: {}s, renewable, max TTL: {})".format(username, lease_duration, user_token_max_ttl))
except Exception as e:
# The error is logged with its traceback and not re-raised, so the spawn
# itself is not aborted by a Vault failure.
spawner.log.error("Failed to create user-specific Vault token for {}: {}".format(spawner.user.name, e))
import traceback
spawner.log.error("Full traceback: {}".format(traceback.format_exc()))
{{- end }}
# Set the hook
# NOTE(review): `c` is not defined in this file; presumably it is the JupyterHub
# config object already in scope where this file is exec()'d — confirm against
# the loader.
c.KubeSpawner.pre_spawn_hook = pre_spawn_hook

View File

@@ -1,4 +1,4 @@
#!/bin/sh
#!/bin/bash
# Script to handle admin token retrieval and renewal
set -e
@@ -23,7 +23,69 @@ if [ -z "$ADMIN_TOKEN" ]; then
fi
echo "Admin token retrieved from ExternalSecret"
echo "$ADMIN_TOKEN" > /vault/secrets/vault-token
echo "$ADMIN_TOKEN" >/vault/secrets/vault-token
# Calculate renewal interval (TTL/2, minimum 30 seconds)
# Prefer the configured JUPYTERHUB_VAULT_TOKEN_TTL. Without it, ask Vault for
# the token's remaining TTL, and fall back to 24h (86400s) when that cannot be
# determined.
if [ -n "${JUPYTERHUB_VAULT_TOKEN_TTL}" ]; then
    echo "Using TTL from environment variable: ${JUPYTERHUB_VAULT_TOKEN_TTL}"
    TTL_RAW="${JUPYTERHUB_VAULT_TOKEN_TTL}"
else
    echo "Looking up token TTL..."
    # lookup-self reports the remaining TTL in seconds. The token is passed
    # inline so the lookup does not depend on VAULT_TOKEN having been exported
    # yet; "|| true" keeps a failed lookup from aborting the script under set -e.
    LOOKED_UP_TTL=$(VAULT_TOKEN="$ADMIN_TOKEN" vault read -field=ttl auth/token/lookup-self 2>/dev/null || true)
    if echo "$LOOKED_UP_TTL" | grep -Eq '^[0-9]+$'; then
        echo "Token is valid, using TTL reported by Vault"
        TTL_RAW="$LOOKED_UP_TTL"
    else
        echo "Token lookup failed, using default TTL"
        TTL_RAW="86400"
    fi
fi
echo "Raw TTL: $TTL_RAW"
# Convert a TTL to whole seconds and print the result on stdout.
#
# Accepts either a bare number of seconds ("3600", "0") or a duration made of
# h/m/s components ("24h", "1h30m", "4m9s"). Missing components count as zero,
# so input with no recognisable component prints 0.
convert_ttl_to_seconds() {
    local ttl="$1"
    local seconds=0
    local spec unit factor amount

    # If already a number (seconds), return as-is
    if echo "$ttl" | grep -Eq '^[0-9]+$'; then
        echo "$ttl"
        return
    fi

    # Drop millisecond components so "500ms" is not misread as 500 minutes.
    ttl=$(echo "$ttl" | sed 's/[0-9][0-9]*ms//g')

    for spec in h:3600 m:60 s:1; do
        unit="${spec%%:*}"
        factor="${spec##*:}"
        # grep -o yields the complete "<digits><unit>" component. A sed pattern
        # with a leading greedy ".*" would keep only the last digit
        # ("24h" -> 4, "30m" -> 0). Leading zeros are stripped because shell
        # arithmetic would reject "08" as invalid octal. "|| true" keeps a
        # non-matching grep from failing the pipeline under pipefail / set -e.
        amount=$(echo "$ttl" | grep -oE "[0-9]+${unit}" | head -n 1 | tr -cd '0-9' | sed 's/^0*//' || true)
        if [ -n "$amount" ]; then
            seconds=$((seconds + amount * factor))
        fi
    done

    echo "$seconds"
}
# Derive the renewal interval from the TTL: half the TTL, never below 30s.
TTL_SECONDS=$(convert_ttl_to_seconds "$TTL_RAW")
case "$TTL_SECONDS" in
0)
    # A TTL of 0 means the token never expires; refresh on a fixed 12h cadence.
    RENEWAL_INTERVAL=43200
    ;;
*)
    RENEWAL_INTERVAL=$((TTL_SECONDS / 2))
    # Clamp to the 30-second floor.
    [ "$RENEWAL_INTERVAL" -ge 30 ] || RENEWAL_INTERVAL=30
    ;;
esac
echo "Token TTL: ${TTL_SECONDS}s, renewal interval: ${RENEWAL_INTERVAL}s"
# Start token renewal loop
export VAULT_TOKEN="$ADMIN_TOKEN"
@@ -36,12 +98,12 @@ while true; do
# Re-read token from mounted secret
ADMIN_TOKEN=$(cat /vault/admin-token/token 2>/dev/null || echo "")
if [ -n "$ADMIN_TOKEN" ]; then
echo "$ADMIN_TOKEN" > /vault/secrets/vault-token
echo "$ADMIN_TOKEN" >/vault/secrets/vault-token
export VAULT_TOKEN="$ADMIN_TOKEN"
echo "$(date): Token re-retrieved successfully from ExternalSecret"
else
echo "$(date): Failed to re-retrieve token from ExternalSecret"
fi
fi
sleep 43200 # 12 hours
done
sleep $RENEWAL_INTERVAL
done