feat(airflow,jupyterhub): share Airflow DAG storage with JupyterHub via a common PVC

This commit is contained in:
Masaki Yatsu
2025-09-11 02:53:59 +09:00
parent d753a68b51
commit 6b01b94b56
10 changed files with 163 additions and 6 deletions

View File

@@ -47,6 +47,16 @@ postgresql:
data:
metadataSecretName: airflow-metadata-connection
# DAG persistence configuration
dags:
persistence:
enabled: {{ .Env.AIRFLOW_DAGS_PERSISTENCE_ENABLED | default "true" }}
{{- if eq (.Env.AIRFLOW_DAGS_STORAGE_TYPE | default "default") "nfs" }}
existingClaim: airflow-dags-nfs-pvc
{{- else }}
existingClaim: airflow-dags-pvc
{{- end }}
ingress:
apiServer:
enabled: true
@@ -58,3 +68,12 @@ ingress:
- name: {{ .Env.AIRFLOW_HOST }}
tls:
enabled: true
# Security contexts for shared file system access
securityContexts:
pod:
runAsUser: 1000
runAsGroup: 0
fsGroup: 100
container:
allowPrivilegeEscalation: false

11
airflow/dags-pvc.yaml Normal file
View File

@@ -0,0 +1,11 @@
# PersistentVolumeClaim holding Airflow DAG files. Shared by Airflow pods and,
# when Airflow runs in the same namespace, mountable by JupyterHub
# single-user pods (at /opt/airflow-dags, per the accompanying values file).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: airflow-dags-pvc
spec:
  accessModes:
    - ReadWriteMany # Multiple pods can read/write
  storageClassName: longhorn # Explicitly use Longhorn which supports RWX
  resources:
    requests:
      # NOTE(review): size is hardcoded; the AIRFLOW_DAGS_STORAGE_SIZE env var
      # exported by the justfiles is not templated in here — confirm intended.
      storage: 10Gi

View File

@@ -1,9 +1,14 @@
set fallback := true
export AIRFLOW_NAMESPACE := env("AIRFLOW_NAMESPACE", "airflow")
export AIRFLOW_NAMESPACE := env("AIRFLOW_NAMESPACE", "jupyter")
export AIRFLOW_CHART_VERSION := env("AIRFLOW_CHART_VERSION", "1.18.0")
export EXTERNAL_SECRETS_NAMESPACE := env("EXTERNAL_SECRETS_NAMESPACE", "external-secrets")
export KEYCLOAK_REALM := env("KEYCLOAK_REALM", "buunstack")
export AIRFLOW_DAGS_PERSISTENCE_ENABLED := env("AIRFLOW_DAGS_PERSISTENCE_ENABLED", "")
export AIRFLOW_DAGS_STORAGE_TYPE := env("AIRFLOW_DAGS_STORAGE_TYPE", "")
export AIRFLOW_NFS_IP := env("AIRFLOW_NFS_IP", "")
export AIRFLOW_NFS_PATH := env("AIRFLOW_NFS_PATH", "")
export AIRFLOW_DAGS_STORAGE_SIZE := env("AIRFLOW_DAGS_STORAGE_SIZE", "10Gi")
[private]
default:
@@ -18,7 +23,7 @@ add-helm-repo:
remove-helm-repo:
helm repo remove apache-airflow
# Create Airflow namespace
# Create namespace (shared with JupyterHub when using jupyter namespace)
create-namespace:
@kubectl get namespace ${AIRFLOW_NAMESPACE} &>/dev/null || \
kubectl create namespace ${AIRFLOW_NAMESPACE}
@@ -247,6 +252,17 @@ install:
--placeholder="e.g., airflow.example.com"
)
done
if [ -z "${AIRFLOW_DAGS_PERSISTENCE_ENABLED}" ]; then
if gum confirm "Enable DAG persistence with PVC?"; then
AIRFLOW_DAGS_PERSISTENCE_ENABLED="true"
else
AIRFLOW_DAGS_PERSISTENCE_ENABLED="false"
fi
fi
# Force default storage type (NFS disabled due to permission issues)
if [ "${AIRFLOW_DAGS_PERSISTENCE_ENABLED}" = "true" ]; then
AIRFLOW_DAGS_STORAGE_TYPE="default"
fi
echo "Installing Airflow..."
just create-namespace
just setup-database
@@ -254,7 +270,10 @@ install:
just create-keycloak-roles
just add-helm-repo
# Create API server config ConfigMap
if [ "${AIRFLOW_DAGS_PERSISTENCE_ENABLED}" = "true" ]; then
just setup-dags-storage "default"
fi
KEYCLOAK_HOST=${KEYCLOAK_HOST} KEYCLOAK_REALM=${KEYCLOAK_REALM} \
gomplate -f webserver_config.py.gomplate -o webserver_config.py
kubectl delete configmap airflow-api-server-config -n ${AIRFLOW_NAMESPACE} --ignore-not-found
@@ -268,6 +287,14 @@ install:
-f airflow-values.yaml
echo "Airflow installation completed"
echo "Access Airflow at: https://${AIRFLOW_HOST}"
if [ "${AIRFLOW_NAMESPACE}" = "jupyter" ] && [ "${AIRFLOW_DAGS_PERSISTENCE_ENABLED}" = "true" ]; then
echo ""
echo "📝 JupyterHub Integration Notes:"
echo " • If JupyterHub is already installed with DAG mounting enabled:"
echo " Restart user pods to access DAGs: kubectl delete pods -n jupyter -l app.kubernetes.io/component=singleuser-server"
echo " • If JupyterHub will be installed later:"
echo " Enable 'Airflow DAG storage mounting' during JupyterHub installation"
fi
# Uninstall Airflow
uninstall delete-db='true':
@@ -394,6 +421,35 @@ delete-api-user username='':
echo "Deletion cancelled"
fi
# Setup DAG storage (PVC)
# Applies dags-pvc.yaml to create the shared airflow-dags-pvc claim.
# NOTE: the `storage-type` argument is accepted for interface compatibility but
# currently unused — only the default (Longhorn-backed) setup is performed,
# since NFS was disabled upstream due to permission issues.
setup-dags-storage storage-type='':
    #!/bin/bash
    set -euo pipefail
    echo "Setting up DAG storage (default)..."
    echo "Creating PersistentVolumeClaim..."
    # dags-pvc.yaml pins storageClassName: longhorn (supports ReadWriteMany)
    kubectl apply -n ${AIRFLOW_NAMESPACE} -f dags-pvc.yaml
    echo "✅ Default storage configured"
    echo "   PVC: airflow-dags-pvc"
    # Fixed: previous message claimed the cluster default StorageClass
    # (k3s local-path, etc.) is used, but dags-pvc.yaml explicitly requests
    # Longhorn — local-path would not provide the RWX access mode anyway.
    echo "   Uses the Longhorn StorageClass (ReadWriteMany)"
    echo ""
    echo "DAG storage is ready for use"
    echo "Mount path in pods: /opt/airflow/dags"
    echo ""
    if [ "${AIRFLOW_NAMESPACE}" = "jupyter" ]; then
        echo "📝 JupyterHub Integration:"
        echo "   Since Airflow is in the 'jupyter' namespace, JupyterHub can mount this PVC"
        echo "   Enable 'Airflow DAG storage mounting' when installing JupyterHub"
        echo "   DAGs will be available at: /opt/airflow-dags in notebooks"
    fi
# Delete DAG storage
# Removes the shared airflow-dags-pvc claim from the Airflow namespace.
delete-dags-storage:
    #!/bin/bash
    set -euo pipefail
    printf '%s\n' "Deleting DAG storage resources..."
    # --ignore-not-found keeps this idempotent when the PVC was never created.
    kubectl delete pvc airflow-dags-pvc --namespace "${AIRFLOW_NAMESPACE}" --ignore-not-found
    printf '%s\n' "✅ DAG storage deleted"
# Clean up database and secrets
cleanup:
#!/bin/bash

View File

@@ -142,6 +142,7 @@ RUN mamba install --yes \
RUN pip install \
agno \
apache-airflow-client \
fastembed \
feature-engine \
jupyter-ai \

View File

@@ -142,6 +142,7 @@ RUN mamba install --yes \
RUN pip install \
agno \
apache-airflow-client \
fastembed \
feature-engine \
jupyter-ai \

View File

@@ -0,0 +1,7 @@
# StorageClass for statically provisioned NFS volumes used by JupyterHub.
# No dynamic provisioner is attached: matching PersistentVolumes
# (jupyter-nfs-pv) are created explicitly by the install recipe.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: jupyter-nfs-static
provisioner: kubernetes.io/no-provisioner # static binding only — no volumes are auto-created
volumeBindingMode: WaitForFirstConsumer # delay PV/PVC binding until a pod is scheduled
reclaimPolicy: Retain # keep the underlying NFS data when the claim is released

View File

@@ -18,11 +18,13 @@ hub:
#!/bin/bash
pip install --no-cache-dir hvac==2.3.0
exec jupyterhub --config /usr/local/etc/jupyterhub/jupyterhub_config.py --upgrade-db
{{- if .Env.USER_POLICY_HCL }}
user_policy.hcl:
mountPath: /srv/jupyterhub/user_policy.hcl
mode: 0644
stringData: |
{{ .Env.USER_POLICY_HCL | strings.Indent 8 }}
{{- end }}
pre_spawn_hook.py:
mountPath: /srv/jupyterhub/pre_spawn_hook.py
mode: 0644
@@ -152,6 +154,34 @@ singleuser:
NOTEBOOK_VAULT_TOKEN_TTL: "{{ .Env.NOTEBOOK_VAULT_TOKEN_TTL }}"
NOTEBOOK_VAULT_TOKEN_MAX_TTL: "{{ .Env.NOTEBOOK_VAULT_TOKEN_MAX_TTL }}"
storage:
{{ if env.Getenv "PVC_NAME" -}}
type: static
static:
pvcName: {{ .Env.PVC_NAME }}
{{ else -}}
type: dynamic
dynamic:
{{ if env.Getenv "JUPYTERHUB_STORAGE_CLASS" -}}
storageClass: {{ .Env.JUPYTERHUB_STORAGE_CLASS }}
{{ end -}}
storageAccessModes:
- ReadWriteOnce
{{ end -}}
capacity: 10Gi
{{- if eq .Env.JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED "true" }}
# Mount Airflow DAGs when both are in the same namespace (jupyter)
extraVolumes:
- name: airflow-dags
persistentVolumeClaim:
claimName: airflow-dags-pvc
optional: true # Don't fail if PVC doesn't exist yet
extraVolumeMounts:
- name: airflow-dags
mountPath: /opt/airflow-dags
readOnly: false
{{- end }}
networkPolicy:
egress:
- to:

View File

@@ -8,7 +8,8 @@ export JUPYTERHUB_OIDC_CLIENT_SESSION_MAX := env("JUPYTERHUB_OIDC_CLIENT_SESSION
export JUPYTERHUB_NFS_PV_ENABLED := env("JUPYTERHUB_NFS_PV_ENABLED", "")
export JUPYTERHUB_STORAGE_CLASS := env("JUPYTERHUB_STORAGE_CLASS", "")
export JUPYTERHUB_VAULT_INTEGRATION_ENABLED := env("JUPYTERHUB_VAULT_INTEGRATION_ENABLED", "")
export JUPYTER_PYTHON_KERNEL_TAG := env("JUPYTER_PYTHON_KERNEL_TAG", "python-3.12-34")
export JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED := env("JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED", "")
export JUPYTER_PYTHON_KERNEL_TAG := env("JUPYTER_PYTHON_KERNEL_TAG", "python-3.12-36")
export KERNEL_IMAGE_BUUN_STACK_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_REPOSITORY", "buun-stack-notebook")
export KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY", "buun-stack-cuda-notebook")
export JUPYTER_PROFILE_MINIMAL_ENABLED := env("JUPYTER_PROFILE_MINIMAL_ENABLED", "false")
@@ -28,6 +29,7 @@ export JUPYTER_BUUNSTACK_LOG_LEVEL := env("JUPYTER_BUUNSTACK_LOG_LEVEL", "warnin
export IMAGE_REGISTRY := env("IMAGE_REGISTRY", "localhost:30500")
export SPARK_DOWNLOAD_URL := env("SPARK_DOWNLOAD_URL", "https://dlcdn.apache.org/spark/")
export SPARK_VERSION := env("SPARK_VERSION", "4.0.1")
export AIRFLOW_DAGS_STORAGE_SIZE := env("AIRFLOW_DAGS_STORAGE_SIZE", "10Gi")
export LONGHORN_NAMESPACE := env("LONGHORN_NAMESPACE", "longhorn")
export KEYCLOAK_REALM := env("KEYCLOAK_REALM", "buunstack")
export VAULT_HOST := env("VAULT_HOST", "")
@@ -114,12 +116,33 @@ install root_token='':
)
done
PVC_NAME=jupyter-nfs-pvc
# Create StorageClass for NFS static provisioning
if ! kubectl get storageclass jupyter-nfs-static &>/dev/null; then
kubectl apply -f jupyter-nfs-storage-class.yaml
fi
if ! kubectl get pv jupyter-nfs-pv &>/dev/null; then
gomplate -f nfs-pv.gomplate.yaml | kubectl apply -f -
fi
kubectl apply -n ${JUPYTERHUB_NAMESPACE} -f nfs-pvc.yaml
fi
# Setup Airflow DAG storage sharing (same namespace)
if [ -z "${JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED}" ]; then
if gum confirm "Enable Airflow DAG storage mounting (requires Airflow in same namespace)?"; then
JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED="true"
else
JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED="false"
fi
fi
if [ "${JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED}" = "true" ]; then
echo "✅ Airflow DAG mounting enabled"
echo " Note: Airflow must be installed in the same namespace (jupyter)"
echo " PVC: airflow-dags-pvc will be mounted at /opt/airflow-dags"
echo ""
echo " ⚠️ If you install Airflow AFTER JupyterHub, restart user pods to mount DAGs:"
echo " kubectl delete pods -n jupyter -l app.kubernetes.io/component=singleuser-server"
fi
# Setup Vault Agent for automatic token management
if [ -z "${JUPYTERHUB_VAULT_INTEGRATION_ENABLED}" ]; then
if gum confirm "Are you going to enable Vault integration?"; then
@@ -174,7 +197,7 @@ uninstall:
kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}'
fi
# Delete JupyterHub PV
# Delete JupyterHub PV and StorageClass
delete-pv:
#!/bin/bash
set -euo pipefail
@@ -182,6 +205,7 @@ delete-pv:
kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}'
kubectl delete pv jupyter-nfs-pv
fi
kubectl delete storageclass jupyter-nfs-static --ignore-not-found
# Build Jupyter notebook kernel images
build-kernel-images:

View File

@@ -2,13 +2,16 @@ apiVersion: v1
kind: PersistentVolume
metadata:
name: jupyter-nfs-pv
labels:
type: jupyter-nfs
app: jupyterhub
spec:
capacity:
storage: 10Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: longhorn
storageClassName: jupyter-nfs-static
volumeMode: Filesystem
nfs:
server: {{ .Env.JUPYTER_NFS_IP }}

View File

@@ -8,4 +8,9 @@ spec:
resources:
requests:
storage: 10Gi
storageClassName: jupyter-nfs-static
selector:
matchLabels:
type: jupyter-nfs
app: jupyterhub
volumeName: jupyter-nfs-pv