From 6b01b94b565e39ad01695f5f598978f65597fae4 Mon Sep 17 00:00:00 2001
From: Masaki Yatsu
Date: Thu, 11 Sep 2025 02:53:59 +0900
Subject: [PATCH] feat(airflow,jupyterhub): share data

---
 airflow/airflow-values.gomplate.yaml          | 19 ++++++
 airflow/dags-pvc.yaml                         | 11 ++++
 airflow/justfile                              | 62 ++++++++++++++++++-
 .../images/datastack-cuda-notebook/Dockerfile |  1 +
 .../images/datastack-notebook/Dockerfile      |  1 +
 jupyterhub/jupyter-nfs-storage-class.yaml     |  7 +++
 jupyterhub/jupyterhub-values.gomplate.yaml    | 30 +++++++++
 jupyterhub/justfile                           | 28 ++++++++-
 jupyterhub/nfs-pv.gomplate.yaml               |  5 +-
 jupyterhub/nfs-pvc.yaml                       |  5 ++
 10 files changed, 163 insertions(+), 6 deletions(-)
 create mode 100644 airflow/dags-pvc.yaml
 create mode 100644 jupyterhub/jupyter-nfs-storage-class.yaml

diff --git a/airflow/airflow-values.gomplate.yaml b/airflow/airflow-values.gomplate.yaml
index c11a1ea..7b8d102 100644
--- a/airflow/airflow-values.gomplate.yaml
+++ b/airflow/airflow-values.gomplate.yaml
@@ -47,6 +47,16 @@ postgresql:
 data:
   metadataSecretName: airflow-metadata-connection
 
+# DAG persistence configuration
+dags:
+  persistence:
+    enabled: {{ .Env.AIRFLOW_DAGS_PERSISTENCE_ENABLED | default "true" }}
+    {{- if eq (.Env.AIRFLOW_DAGS_STORAGE_TYPE | default "default") "nfs" }}
+    existingClaim: airflow-dags-nfs-pvc
+    {{- else }}
+    existingClaim: airflow-dags-pvc
+    {{- end }}
+
 ingress:
   apiServer:
     enabled: true
@@ -58,3 +68,12 @@ ingress:
       - name: {{ .Env.AIRFLOW_HOST }}
         tls:
           enabled: true
+
+# Security contexts for shared file system access
+securityContexts:
+  pod:
+    runAsUser: 1000
+    runAsGroup: 0
+    fsGroup: 100
+  container:
+    allowPrivilegeEscalation: false
diff --git a/airflow/dags-pvc.yaml b/airflow/dags-pvc.yaml
new file mode 100644
index 0000000..9f65e88
--- /dev/null
+++ b/airflow/dags-pvc.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: airflow-dags-pvc
+spec:
+  accessModes:
+    - ReadWriteMany # Multiple pods can read/write
+  storageClassName: longhorn # Explicitly use Longhorn which supports RWX
+  resources:
+    requests:
+      storage: 10Gi
\ No newline at end of file
diff --git a/airflow/justfile b/airflow/justfile
index 258e56e..ee226fd 100644
--- a/airflow/justfile
+++ b/airflow/justfile
@@ -1,9 +1,14 @@
 set fallback := true
 
-export AIRFLOW_NAMESPACE := env("AIRFLOW_NAMESPACE", "airflow")
+export AIRFLOW_NAMESPACE := env("AIRFLOW_NAMESPACE", "jupyter")
 export AIRFLOW_CHART_VERSION := env("AIRFLOW_CHART_VERSION", "1.18.0")
 export EXTERNAL_SECRETS_NAMESPACE := env("EXTERNAL_SECRETS_NAMESPACE", "external-secrets")
 export KEYCLOAK_REALM := env("KEYCLOAK_REALM", "buunstack")
+export AIRFLOW_DAGS_PERSISTENCE_ENABLED := env("AIRFLOW_DAGS_PERSISTENCE_ENABLED", "")
+export AIRFLOW_DAGS_STORAGE_TYPE := env("AIRFLOW_DAGS_STORAGE_TYPE", "")
+export AIRFLOW_NFS_IP := env("AIRFLOW_NFS_IP", "")
+export AIRFLOW_NFS_PATH := env("AIRFLOW_NFS_PATH", "")
+export AIRFLOW_DAGS_STORAGE_SIZE := env("AIRFLOW_DAGS_STORAGE_SIZE", "10Gi")
 
 [private]
 default:
@@ -18,7 +23,7 @@ add-helm-repo:
 remove-helm-repo:
     helm repo remove apache-airflow
 
-# Create Airflow namespace
+# Create namespace (shared with JupyterHub when using jupyter namespace)
 create-namespace:
     @kubectl get namespace ${AIRFLOW_NAMESPACE} &>/dev/null || \
         kubectl create namespace ${AIRFLOW_NAMESPACE}
@@ -247,6 +252,17 @@ install:
                 --placeholder="e.g., airflow.example.com"
         )
     done
+    if [ -z "${AIRFLOW_DAGS_PERSISTENCE_ENABLED}" ]; then
+        if gum confirm "Enable DAG persistence with PVC?"; then
+            AIRFLOW_DAGS_PERSISTENCE_ENABLED="true"
+        else
+            AIRFLOW_DAGS_PERSISTENCE_ENABLED="false"
+        fi
+    fi
+    # Force default storage type (NFS disabled due to permission issues)
+    if [ "${AIRFLOW_DAGS_PERSISTENCE_ENABLED}" = "true" ]; then
+        AIRFLOW_DAGS_STORAGE_TYPE="default"
+    fi
     echo "Installing Airflow..."
     just create-namespace
     just setup-database
@@ -254,7 +270,10 @@ install:
     just create-keycloak-roles
     just add-helm-repo
 
-    # Create API server config ConfigMap
+    if [ "${AIRFLOW_DAGS_PERSISTENCE_ENABLED}" = "true" ]; then
+        just setup-dags-storage "default"
+    fi
+
     KEYCLOAK_HOST=${KEYCLOAK_HOST} KEYCLOAK_REALM=${KEYCLOAK_REALM} \
         gomplate -f webserver_config.py.gomplate -o webserver_config.py
     kubectl delete configmap airflow-api-server-config -n ${AIRFLOW_NAMESPACE} --ignore-not-found
@@ -268,6 +287,14 @@ install:
         -f airflow-values.yaml
     echo "Airflow installation completed"
     echo "Access Airflow at: https://${AIRFLOW_HOST}"
+    if [ "${AIRFLOW_NAMESPACE}" = "jupyter" ] && [ "${AIRFLOW_DAGS_PERSISTENCE_ENABLED}" = "true" ]; then
+        echo ""
+        echo "📝 JupyterHub Integration Notes:"
+        echo "   • If JupyterHub is already installed with DAG mounting enabled:"
+        echo "     Restart user pods to access DAGs: kubectl delete pods -n jupyter -l app.kubernetes.io/component=singleuser-server"
+        echo "   • If JupyterHub will be installed later:"
+        echo "     Enable 'Airflow DAG storage mounting' during JupyterHub installation"
+    fi
 
 # Uninstall Airflow
 uninstall delete-db='true':
@@ -394,6 +421,35 @@ delete-api-user username='':
         echo "Deletion cancelled"
     fi
 
+# Setup DAG storage (PVC)
+setup-dags-storage storage-type='':
+    #!/bin/bash
+    set -euo pipefail
+    echo "Setting up DAG storage (default)..."
+    echo "Creating PersistentVolumeClaim with the 'longhorn' StorageClass..."
+    kubectl apply -n ${AIRFLOW_NAMESPACE} -f dags-pvc.yaml
+    echo "✅ Default storage configured"
+    echo "   PVC: airflow-dags-pvc"
+    echo "   Uses the 'longhorn' StorageClass, which supports ReadWriteMany (see dags-pvc.yaml)"
+    echo ""
+    echo "DAG storage is ready for use"
+    echo "Mount path in pods: /opt/airflow/dags"
+    echo ""
+    if [ "${AIRFLOW_NAMESPACE}" = "jupyter" ]; then
+        echo "📝 JupyterHub Integration:"
+        echo "   Since Airflow is in the 'jupyter' namespace, JupyterHub can mount this PVC"
+        echo "   Enable 'Airflow DAG storage mounting' when installing JupyterHub"
+        echo "   DAGs will be available at: /opt/airflow-dags in notebooks"
+    fi
+
+# Delete DAG storage
+delete-dags-storage:
+    #!/bin/bash
+    set -euo pipefail
+    echo "Deleting DAG storage resources..."
+    kubectl delete pvc airflow-dags-pvc -n ${AIRFLOW_NAMESPACE} --ignore-not-found
+    echo "✅ DAG storage deleted"
+
 # Clean up database and secrets
 cleanup:
     #!/bin/bash
diff --git a/jupyterhub/images/datastack-cuda-notebook/Dockerfile b/jupyterhub/images/datastack-cuda-notebook/Dockerfile
index 217d09a..95791ce 100644
--- a/jupyterhub/images/datastack-cuda-notebook/Dockerfile
+++ b/jupyterhub/images/datastack-cuda-notebook/Dockerfile
@@ -142,6 +142,7 @@ RUN mamba install --yes \
 
 RUN pip install \
     agno \
+    apache-airflow-client \
     fastembed \
     feature-engine \
     jupyter-ai \
diff --git a/jupyterhub/images/datastack-notebook/Dockerfile b/jupyterhub/images/datastack-notebook/Dockerfile
index 72135bf..8f05850 100644
--- a/jupyterhub/images/datastack-notebook/Dockerfile
+++ b/jupyterhub/images/datastack-notebook/Dockerfile
@@ -142,6 +142,7 @@ RUN mamba install --yes \
 
 RUN pip install \
     agno \
+    apache-airflow-client \
     fastembed \
     feature-engine \
     jupyter-ai \
diff --git a/jupyterhub/jupyter-nfs-storage-class.yaml b/jupyterhub/jupyter-nfs-storage-class.yaml
new file mode 100644
index 0000000..5091be6
--- /dev/null
+++ b/jupyterhub/jupyter-nfs-storage-class.yaml
@@ -0,0 +1,7 @@
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: jupyter-nfs-static
+provisioner: kubernetes.io/no-provisioner
+volumeBindingMode: WaitForFirstConsumer
+reclaimPolicy: Retain
\ No newline at end of file
diff --git a/jupyterhub/jupyterhub-values.gomplate.yaml b/jupyterhub/jupyterhub-values.gomplate.yaml
index 91d02f6..515b5a3 100644
--- a/jupyterhub/jupyterhub-values.gomplate.yaml
+++ b/jupyterhub/jupyterhub-values.gomplate.yaml
@@ -18,11 +18,13 @@ hub:
         #!/bin/bash
         pip install --no-cache-dir hvac==2.3.0
         exec jupyterhub --config /usr/local/etc/jupyterhub/jupyterhub_config.py --upgrade-db
+    {{- if .Env.USER_POLICY_HCL }}
     user_policy.hcl:
       mountPath: /srv/jupyterhub/user_policy.hcl
      mode: 0644
       stringData: |
 {{ .Env.USER_POLICY_HCL | strings.Indent 8 }}
+    {{- end }}
     pre_spawn_hook.py:
       mountPath: /srv/jupyterhub/pre_spawn_hook.py
       mode: 0644
@@ -152,6 +154,34 @@ singleuser:
       NOTEBOOK_VAULT_TOKEN_TTL: "{{ .Env.NOTEBOOK_VAULT_TOKEN_TTL }}"
       NOTEBOOK_VAULT_TOKEN_MAX_TTL: "{{ .Env.NOTEBOOK_VAULT_TOKEN_MAX_TTL }}"
 
+  storage:
+    {{ if env.Getenv "PVC_NAME" -}}
+    type: static
+    static:
+      pvcName: {{ .Env.PVC_NAME }}
+    {{ else -}}
+    type: dynamic
+    dynamic:
+      {{ if env.Getenv "JUPYTERHUB_STORAGE_CLASS" -}}
+      storageClass: {{ .Env.JUPYTERHUB_STORAGE_CLASS }}
+      {{ end -}}
+      storageAccessModes:
+        - ReadWriteOnce
+    {{ end -}}
+    capacity: 10Gi
+    {{- if eq .Env.JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED "true" }}
+    # Mount Airflow DAGs when both are in the same namespace (jupyter)
+    extraVolumes:
+      - name: airflow-dags
+        persistentVolumeClaim:
+          claimName: airflow-dags-pvc
+          optional: true # Don't fail if PVC doesn't exist yet
+    extraVolumeMounts:
+      - name: airflow-dags
+        mountPath: /opt/airflow-dags
+        readOnly: false
+    {{- end }}
+
   networkPolicy:
     egress:
       - to:
diff --git a/jupyterhub/justfile b/jupyterhub/justfile
index 5162b9d..8db61db 100644
--- a/jupyterhub/justfile
+++ b/jupyterhub/justfile
@@ -8,7 +8,8 @@ export JUPYTERHUB_OIDC_CLIENT_SESSION_MAX := env("JUPYTERHUB_OIDC_CLIENT_SESSION
 export JUPYTERHUB_NFS_PV_ENABLED := env("JUPYTERHUB_NFS_PV_ENABLED", "")
 export JUPYTERHUB_STORAGE_CLASS := env("JUPYTERHUB_STORAGE_CLASS", "")
 export JUPYTERHUB_VAULT_INTEGRATION_ENABLED := env("JUPYTERHUB_VAULT_INTEGRATION_ENABLED", "")
-export JUPYTER_PYTHON_KERNEL_TAG := env("JUPYTER_PYTHON_KERNEL_TAG", "python-3.12-34")
+export JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED := env("JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED", "")
+export JUPYTER_PYTHON_KERNEL_TAG := env("JUPYTER_PYTHON_KERNEL_TAG", "python-3.12-36")
 export KERNEL_IMAGE_BUUN_STACK_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_REPOSITORY", "buun-stack-notebook")
 export KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY", "buun-stack-cuda-notebook")
 export JUPYTER_PROFILE_MINIMAL_ENABLED := env("JUPYTER_PROFILE_MINIMAL_ENABLED", "false")
@@ -28,6 +29,7 @@ export JUPYTER_BUUNSTACK_LOG_LEVEL := env("JUPYTER_BUUNSTACK_LOG_LEVEL", "warnin
 export IMAGE_REGISTRY := env("IMAGE_REGISTRY", "localhost:30500")
 export SPARK_DOWNLOAD_URL := env("SPARK_DOWNLOAD_URL", "https://dlcdn.apache.org/spark/")
 export SPARK_VERSION := env("SPARK_VERSION", "4.0.1")
+export AIRFLOW_DAGS_STORAGE_SIZE := env("AIRFLOW_DAGS_STORAGE_SIZE", "10Gi")
 export LONGHORN_NAMESPACE := env("LONGHORN_NAMESPACE", "longhorn")
 export KEYCLOAK_REALM := env("KEYCLOAK_REALM", "buunstack")
 export VAULT_HOST := env("VAULT_HOST", "")
@@ -114,12 +116,33 @@ install root_token='':
             )
         done
         PVC_NAME=jupyter-nfs-pvc
+        # Create StorageClass for NFS static provisioning
+        if ! kubectl get storageclass jupyter-nfs-static &>/dev/null; then
+            kubectl apply -f jupyter-nfs-storage-class.yaml
+        fi
         if ! kubectl get pv jupyter-nfs-pv &>/dev/null; then
             gomplate -f nfs-pv.gomplate.yaml | kubectl apply -f -
         fi
         kubectl apply -n ${JUPYTERHUB_NAMESPACE} -f nfs-pvc.yaml
     fi
 
+    # Setup Airflow DAG storage sharing (same namespace)
+    if [ -z "${JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED}" ]; then
+        if gum confirm "Enable Airflow DAG storage mounting (requires Airflow in same namespace)?"; then
+            JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED="true"
+        else
+            JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED="false"
+        fi
+    fi
+    if [ "${JUPYTERHUB_AIRFLOW_DAGS_PERSISTENCE_ENABLED}" = "true" ]; then
+        echo "✅ Airflow DAG mounting enabled"
+        echo "   Note: Airflow must be installed in the same namespace (jupyter)"
+        echo "   PVC: airflow-dags-pvc will be mounted at /opt/airflow-dags"
+        echo ""
+        echo "   ⚠️  If you install Airflow AFTER JupyterHub, restart user pods to mount DAGs:"
+        echo "   kubectl delete pods -n jupyter -l app.kubernetes.io/component=singleuser-server"
+    fi
+
     # Setup Vault Agent for automatic token management
     if [ -z "${JUPYTERHUB_VAULT_INTEGRATION_ENABLED}" ]; then
         if gum confirm "Are you going to enable Vault integration?"; then
@@ -174,7 +197,7 @@ uninstall:
         kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}'
     fi
 
-# Delete JupyterHub PV
+# Delete JupyterHub PV and StorageClass
 delete-pv:
     #!/bin/bash
     set -euo pipefail
@@ -182,6 +205,7 @@ delete-pv:
         kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}'
         kubectl delete pv jupyter-nfs-pv
     fi
+    kubectl delete storageclass jupyter-nfs-static --ignore-not-found
 
 # Build Jupyter notebook kernel images
 build-kernel-images:
diff --git a/jupyterhub/nfs-pv.gomplate.yaml b/jupyterhub/nfs-pv.gomplate.yaml
index a02a0db..e3be2e2 100644
--- a/jupyterhub/nfs-pv.gomplate.yaml
+++ b/jupyterhub/nfs-pv.gomplate.yaml
@@ -2,13 +2,16 @@ apiVersion: v1
 kind: PersistentVolume
 metadata:
   name: jupyter-nfs-pv
+  labels:
+    type: jupyter-nfs
+    app: jupyterhub
 spec:
   capacity:
     storage: 10Gi
   accessModes:
     - ReadWriteOnce
   persistentVolumeReclaimPolicy: Retain
-  storageClassName: longhorn
+  storageClassName: jupyter-nfs-static
   volumeMode: Filesystem
   nfs:
     server: {{ .Env.JUPYTER_NFS_IP }}
diff --git a/jupyterhub/nfs-pvc.yaml b/jupyterhub/nfs-pvc.yaml
index 63494c1..73c8540 100644
--- a/jupyterhub/nfs-pvc.yaml
+++ b/jupyterhub/nfs-pvc.yaml
@@ -8,4 +8,9 @@ spec:
   resources:
     requests:
       storage: 10Gi
+  storageClassName: jupyter-nfs-static
+  selector:
+    matchLabels:
+      type: jupyter-nfs
+      app: jupyterhub
   volumeName: jupyter-nfs-pv
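
Usage sketch: with the patch applied and both charts in the jupyter namespace, a notebook user can drop a DAG file onto the shared airflow-dags-pvc mount (/opt/airflow-dags in single-user pods) and the Airflow scheduler will pick it up on its next DAG-directory scan. This is a minimal, assumption-laden illustration: the file name, dag_id, and the Airflow 2.x-style imports inside the generated source are hypothetical and may need adjusting for the Airflow version deployed by chart 1.18.0.

from pathlib import Path
from textwrap import dedent

# Shared RWX volume mounted into single-user pods via extraVolumeMounts above (assumed path).
DAGS_DIR = Path("/opt/airflow-dags")

# The DAG is written out as plain text so the notebook image does not need the
# full Airflow package installed; only the Airflow scheduler parses this file.
dag_source = dedent("""\
    from datetime import datetime
    from airflow import DAG
    from airflow.operators.bash import BashOperator  # Airflow 2.x import path (assumption)

    with DAG(
        dag_id="hello_from_notebook",
        start_date=datetime(2025, 1, 1),
        schedule=None,
        catchup=False,
    ) as dag:
        BashOperator(task_id="say_hello", bash_command="echo hello from JupyterHub")
    """)

# Writing into the shared mount makes the DAG visible to the scheduler pod.
(DAGS_DIR / "hello_from_notebook.py").write_text(dag_source)

The apache-airflow-client package added to the notebook images can then be used to trigger or monitor such a DAG over Airflow's REST API, subject to the Keycloak-backed authentication configured for the API server.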