From 00009ab19255901246a9cec985bb7b3899d3226a Mon Sep 17 00:00:00 2001 From: Masaki Yatsu Date: Fri, 29 Aug 2025 17:12:31 +0900 Subject: [PATCH] feat(jupyterhub): add JupyterHub --- jupyterhub/.gitignore | 1 + .../datastack-cuda-notebook/.dockerignore | 1 + .../images/datastack-cuda-notebook/Dockerfile | 159 ++++++++++++++++++ .../images/datastack-cuda-notebook/README.md | 5 + .../ipython_kernel_config.py | 13 ++ .../datastack-cuda-notebook/setup_spark.py | 131 +++++++++++++++ .../images/datastack-notebook/.dockerignore | 1 + .../images/datastack-notebook/Dockerfile | 158 +++++++++++++++++ .../images/datastack-notebook/README.md | 5 + .../ipython_kernel_config.py | 13 ++ .../images/datastack-notebook/setup_spark.py | 131 +++++++++++++++ jupyterhub/jupyterhub-values.gomplate.yaml | 155 +++++++++++++++++ jupyterhub/justfile | 150 +++++++++++++++++ jupyterhub/nfs-pv.gomplate.yaml | 15 ++ jupyterhub/nfs-pvc.yaml | 11 ++ justfile | 1 + k8s/justfile | 26 +++ 17 files changed, 976 insertions(+) create mode 100644 jupyterhub/.gitignore create mode 100644 jupyterhub/images/datastack-cuda-notebook/.dockerignore create mode 100644 jupyterhub/images/datastack-cuda-notebook/Dockerfile create mode 100644 jupyterhub/images/datastack-cuda-notebook/README.md create mode 100644 jupyterhub/images/datastack-cuda-notebook/ipython_kernel_config.py create mode 100755 jupyterhub/images/datastack-cuda-notebook/setup_spark.py create mode 100644 jupyterhub/images/datastack-notebook/.dockerignore create mode 100644 jupyterhub/images/datastack-notebook/Dockerfile create mode 100644 jupyterhub/images/datastack-notebook/README.md create mode 100644 jupyterhub/images/datastack-notebook/ipython_kernel_config.py create mode 100755 jupyterhub/images/datastack-notebook/setup_spark.py create mode 100644 jupyterhub/jupyterhub-values.gomplate.yaml create mode 100644 jupyterhub/justfile create mode 100644 jupyterhub/nfs-pv.gomplate.yaml create mode 100644 jupyterhub/nfs-pvc.yaml diff --git a/jupyterhub/.gitignore b/jupyterhub/.gitignore new file mode 100644 index 0000000..d2b1ebf --- /dev/null +++ b/jupyterhub/.gitignore @@ -0,0 +1 @@ +jupyterhub-values.yaml diff --git a/jupyterhub/images/datastack-cuda-notebook/.dockerignore b/jupyterhub/images/datastack-cuda-notebook/.dockerignore new file mode 100644 index 0000000..b43bf86 --- /dev/null +++ b/jupyterhub/images/datastack-cuda-notebook/.dockerignore @@ -0,0 +1 @@ +README.md diff --git a/jupyterhub/images/datastack-cuda-notebook/Dockerfile b/jupyterhub/images/datastack-cuda-notebook/Dockerfile new file mode 100644 index 0000000..dd949f8 --- /dev/null +++ b/jupyterhub/images/datastack-cuda-notebook/Dockerfile @@ -0,0 +1,159 @@ +# Merge pyspark-notebook into pytorch-notebook:cuda12-python-3.12 +# https://github.com/jupyter/docker-stacks/tree/main/images/pytorch-notebook +# https://github.com/jupyter/docker-stacks/tree/main/images/pyspark-notebook +# https://github.com/jupyter/docker-stacks/blob/main/images/pyspark-notebook/setup_spark.py + +FROM quay.io/jupyter/pytorch-notebook:x86_64-cuda12-python-3.12.10 + +# Fix: https://github.com/hadolint/hadolint/wiki/DL4006 +# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014 +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +USER root + +# Spark dependencies +# Default values can be overridden at build time +# (ARGS are in lowercase to distinguish them from ENV) +ARG openjdk_version="17" + +RUN apt-get update --yes && \ + apt-get install --yes --no-install-recommends \ + bash jq \ + "openjdk-${openjdk_version}-jre-headless" \ + 
ca-certificates-java && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# If spark_version is not set, the latest Spark will be installed
+ARG spark_version
+ARG hadoop_version="3"
+# If scala_version is not set, Spark without Scala will be installed
+ARG scala_version
+# URL to use for Spark downloads
+# You need to use the https://archive.apache.org/dist/spark/ site if you want to download old Spark versions,
+# but it seems to be slower, which is why we use the recommended download site
+ARG spark_download_url="https://dlcdn.apache.org/spark/"
+
+ENV SPARK_HOME=/usr/local/spark
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
+ENV JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"
+ENV PATH="${PATH}:${SPARK_HOME}/bin:${JAVA_HOME}/bin"
+
+COPY setup_spark.py /opt/setup-scripts/
+
+# Setup Spark
+RUN /opt/setup-scripts/setup_spark.py \
+    --spark-version="${spark_version}" \
+    --hadoop-version="${hadoop_version}" \
+    --scala-version="${scala_version}" \
+    --spark-download-url="${spark_download_url}"
+
+# Configure IPython system-wide
+COPY ipython_kernel_config.py "/etc/ipython/"
+RUN fix-permissions "/etc/ipython/"
+
+USER ${NB_UID}
+
+# Remove torch to avoid a `critical libmamba filesystem error` when running `mamba install`
+RUN pip uninstall -y \
+    'torch' \
+    'torchaudio' \
+    'torchvision'
+
+# Install the Python data/AI stack via mamba (pyarrow, pandas, LangChain, etc.)
+# NOTE: It's important to ensure compatibility between Pandas versions.
+# The pandas version in this Dockerfile should match the version
+# on which the Pandas API for Spark is built.
+# To find the right version:
+# 1. Check out the Spark branch you are on.
+# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
+# For example, the kernel-image builds in jupyterhub/justfile target Spark 3.5.4,
+# so the `pandas=2.2.2` pin below should be re-checked against that branch's
+# `dev/infra/Dockerfile` whenever spark_version is bumped.
+RUN mamba install --yes \
+    'aif360' \
+    'airflow' \
+    'chromadb' \
+    'dalex' \
+    'dbt' \
+    'dlt' \
+    'duckdb' \
+    'faiss' \
+    'gitpython' \
+    'grpcio-status' \
+    'grpcio' \
+    'keras' \
+    'langchain' \
+    'langchain-ai21' \
+    'langchain-anthropic' \
+    'langchain-aws' \
+    'langchain-azure-dynamic-sessions' \
+    'langchain-chroma' \
+    'langchain-community' \
+    'langchain-experimental' \
+    'langchain-fireworks' \
+    'langchain-google-genai' \
+    'langchain-groq' \
+    'langchain-mistralai' \
+    'langchain-mongodb' \
+    'langchain-nomic' \
+    'langchain-openai' \
+    'langchain-prompty' \
+    'langchain-qdrant' \
+    'langchain-robocorp' \
+    'langchain-text-splitters' \
+    'langchain-together' \
+    'langchain-voyageai' \
+    'langgraph' \
+    'langgraph-checkpoint' \
+    'langgraph-sdk' \
+    'langsmith' \
+    'litellm' \
+    'nest-asyncio' \
+    'openai' \
+    'openai-agents' \
+    'pandas=2.2.2' \
+    'pandas-profiling' \
+    'pillow' \
+    'polars' \
+    'pyarrow' \
+    'qdrant-client' \
+    'rapidfuzz' \
+    'tensorflow' \
+    'transformers' \
+    'unstructured' \
+    && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==4.0.0.dev2
+# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==3.5.4
+
+RUN pip install \
+    agno \
+    fastembed \
+    feature-engine \
+    jupyter-ai \
+    jupyter-ai-magics[all] \
+    kreuzberg \
+    langchain-huggingface \
+    langchain-perplexity \
+    langfuse \
+    pydantic-ai \
+    ragas \
+    smolagents \
+    tavily-python \
+    tweet-preprocessor
+
+# Install PyTorch with pip (https://pytorch.org/get-started/locally/)
+# langchain-openai must be updated to avoid a pydantic v2 error
+# https://github.com/run-llama/llama_index/issues/16540
+# hadolint 
ignore=DL3013 +RUN pip install --no-cache-dir --extra-index-url=https://pypi.nvidia.com --index-url 'https://download.pytorch.org/whl/cu124' \ + 'torch' \ + 'torchaudio' \ + 'torchvision' && \ + pip install --upgrade langchain-openai && \ + fix-permissions "${CONDA_DIR}" && \ + fix-permissions "/home/${NB_USER}" + + +WORKDIR "${HOME}" +EXPOSE 4040 diff --git a/jupyterhub/images/datastack-cuda-notebook/README.md b/jupyterhub/images/datastack-cuda-notebook/README.md new file mode 100644 index 0000000..1e1e3a3 --- /dev/null +++ b/jupyterhub/images/datastack-cuda-notebook/README.md @@ -0,0 +1,5 @@ +# Jupyter Notebook Image + +Custom Jupyter notebook kernel image derived from the official one: + +[jupyter/docker-stacks: Ready-to-run Docker images containing Jupyter applications](https://github.com/jupyter/docker-stacks) diff --git a/jupyterhub/images/datastack-cuda-notebook/ipython_kernel_config.py b/jupyterhub/images/datastack-cuda-notebook/ipython_kernel_config.py new file mode 100644 index 0000000..921e6fa --- /dev/null +++ b/jupyterhub/images/datastack-cuda-notebook/ipython_kernel_config.py @@ -0,0 +1,13 @@ +# Configuration file for ipython-kernel. +# See + +# With IPython >= 6.0.0, all outputs to stdout/stderr are captured. +# It is the case for subprocesses and output of compiled libraries like Spark. +# Those logs now both head to notebook logs and in notebooks outputs. +# Logs are particularly verbose with Spark, that is why we turn them off through this flag. +# + +# Attempt to capture and forward low-level output, e.g. produced by Extension libraries. +# Default: True +# type:ignore +c.IPKernelApp.capture_fd_output = False # noqa: F821 diff --git a/jupyterhub/images/datastack-cuda-notebook/setup_spark.py b/jupyterhub/images/datastack-cuda-notebook/setup_spark.py new file mode 100755 index 0000000..c5b7643 --- /dev/null +++ b/jupyterhub/images/datastack-cuda-notebook/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. 
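+#
+# Example invocation (illustrative; these values mirror the kernel-image build
+# in jupyterhub/justfile):
+#   ./setup_spark.py --spark-version="3.5.4" --hadoop-version="3" \
+#       --scala-version="" --spark-download-url="https://archive.apache.org/dist/spark/"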
+ +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + "root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + arg_parser.add_argument("--hadoop-version", required=True) + arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + 
configure_spark(
+        spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"])
+    )
diff --git a/jupyterhub/images/datastack-notebook/.dockerignore b/jupyterhub/images/datastack-notebook/.dockerignore
new file mode 100644
index 0000000..b43bf86
--- /dev/null
+++ b/jupyterhub/images/datastack-notebook/.dockerignore
@@ -0,0 +1 @@
+README.md
diff --git a/jupyterhub/images/datastack-notebook/Dockerfile b/jupyterhub/images/datastack-notebook/Dockerfile
new file mode 100644
index 0000000..27ee415
--- /dev/null
+++ b/jupyterhub/images/datastack-notebook/Dockerfile
@@ -0,0 +1,158 @@
+# Merge pyspark-notebook into pytorch-notebook:python-3.12
+# https://github.com/jupyter/docker-stacks/tree/main/images/pytorch-notebook
+# https://github.com/jupyter/docker-stacks/tree/main/images/pyspark-notebook
+# https://github.com/jupyter/docker-stacks/blob/main/images/pyspark-notebook/setup_spark.py
+
+FROM quay.io/jupyter/pytorch-notebook:python-3.12
+
+# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
+# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+# Spark dependencies
+# Default values can be overridden at build time
+# (ARGS are in lowercase to distinguish them from ENV)
+ARG openjdk_version="17"
+
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    bash jq \
+    "openjdk-${openjdk_version}-jre-headless" \
+    ca-certificates-java && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# If spark_version is not set, the latest Spark will be installed
+ARG spark_version
+ARG hadoop_version="3"
+# If scala_version is not set, Spark without Scala will be installed
+ARG scala_version
+# URL to use for Spark downloads
+# You need to use the https://archive.apache.org/dist/spark/ site if you want to download old Spark versions,
+# but it seems to be slower, which is why we use the recommended download site
+ARG spark_download_url="https://dlcdn.apache.org/spark/"
+
+ENV SPARK_HOME=/usr/local/spark
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
+ENV JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"
+ENV PATH="${PATH}:${SPARK_HOME}/bin:${JAVA_HOME}/bin"
+
+COPY setup_spark.py /opt/setup-scripts/
+
+# Setup Spark
+RUN /opt/setup-scripts/setup_spark.py \
+    --spark-version="${spark_version}" \
+    --hadoop-version="${hadoop_version}" \
+    --scala-version="${scala_version}" \
+    --spark-download-url="${spark_download_url}"
+
+# Configure IPython system-wide
+COPY ipython_kernel_config.py "/etc/ipython/"
+RUN fix-permissions "/etc/ipython/"
+
+USER ${NB_UID}
+
+# Remove torch to avoid a `critical libmamba filesystem error` when running `mamba install`
+RUN pip uninstall -y \
+    'torch' \
+    'torchaudio' \
+    'torchvision'
+
+# Install the Python data/AI stack via mamba (pyarrow, pandas, LangChain, etc.)
+# NOTE: It's important to ensure compatibility between Pandas versions.
+# The pandas version in this Dockerfile should match the version
+# on which the Pandas API for Spark is built.
+# To find the right version:
+# 1. Check out the Spark branch you are on.
+# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
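+# For example, the kernel-image builds in jupyterhub/justfile target Spark 3.5.4,
+# so the `pandas=2.2.2` pin below should be re-checked against that branch's
+# `dev/infra/Dockerfile` whenever spark_version is bumped.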
+RUN mamba install --yes \
+    'aif360' \
+    'airflow' \
+    'chromadb' \
+    'dalex' \
+    'dbt' \
+    'dlt' \
+    'duckdb' \
+    'faiss' \
+    'gitpython' \
+    'grpcio-status' \
+    'grpcio' \
+    'keras' \
+    'langchain' \
+    'langchain-ai21' \
+    'langchain-anthropic' \
+    'langchain-aws' \
+    'langchain-azure-dynamic-sessions' \
+    'langchain-chroma' \
+    'langchain-community' \
+    'langchain-experimental' \
+    'langchain-fireworks' \
+    'langchain-google-genai' \
+    'langchain-groq' \
+    'langchain-mistralai' \
+    'langchain-mongodb' \
+    'langchain-nomic' \
+    'langchain-openai' \
+    'langchain-prompty' \
+    'langchain-qdrant' \
+    'langchain-robocorp' \
+    'langchain-text-splitters' \
+    'langchain-together' \
+    'langchain-voyageai' \
+    'langgraph' \
+    'langgraph-checkpoint' \
+    'langgraph-sdk' \
+    'langsmith' \
+    'litellm' \
+    'nest-asyncio' \
+    'openai' \
+    'openai-agents' \
+    'pandas=2.2.2' \
+    'pandas-profiling' \
+    'pillow' \
+    'polars' \
+    'pyarrow' \
+    'qdrant-client' \
+    'rapidfuzz' \
+    'tensorflow' \
+    'transformers' \
+    'unstructured' \
+    && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==4.0.0.dev2
+# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==3.5.4
+
+RUN pip install \
+    agno \
+    fastembed \
+    feature-engine \
+    jupyter-ai \
+    jupyter-ai-magics[all] \
+    kreuzberg \
+    langfuse \
+    langchain-huggingface \
+    langchain-perplexity \
+    pydantic-ai \
+    ragas \
+    smolagents \
+    tavily-python \
+    tweet-preprocessor
+
+# Install PyTorch with pip (https://pytorch.org/get-started/locally/)
+# langchain-openai must be updated to avoid a pydantic v2 error
+# https://github.com/run-llama/llama_index/issues/16540
+# hadolint ignore=DL3013
+RUN pip install --no-cache-dir --index-url 'https://download.pytorch.org/whl/cpu' \
+    'torch' \
+    'torchaudio' \
+    'torchvision' && \
+    pip install --upgrade langchain-openai && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+WORKDIR "${HOME}"
+EXPOSE 4040
diff --git a/jupyterhub/images/datastack-notebook/README.md b/jupyterhub/images/datastack-notebook/README.md
new file mode 100644
index 0000000..1e1e3a3
--- /dev/null
+++ b/jupyterhub/images/datastack-notebook/README.md
@@ -0,0 +1,5 @@
+# Jupyter Notebook Image
+
+Custom Jupyter notebook kernel image derived from the official one:
+
+[jupyter/docker-stacks: Ready-to-run Docker images containing Jupyter applications](https://github.com/jupyter/docker-stacks)
diff --git a/jupyterhub/images/datastack-notebook/ipython_kernel_config.py b/jupyterhub/images/datastack-notebook/ipython_kernel_config.py
new file mode 100644
index 0000000..921e6fa
--- /dev/null
+++ b/jupyterhub/images/datastack-notebook/ipython_kernel_config.py
@@ -0,0 +1,13 @@
+# Configuration file for ipython-kernel.
+# See
+
+# With IPython >= 6.0.0, all outputs to stdout/stderr are captured.
+# It is the case for subprocesses and output of compiled libraries like Spark.
+# Those logs now both head to notebook logs and in notebooks outputs.
+# Logs are particularly verbose with Spark, that is why we turn them off through this flag.
+#
+
+# Attempt to capture and forward low-level output, e.g. produced by Extension libraries.
+# Default: True +# type:ignore +c.IPKernelApp.capture_fd_output = False # noqa: F821 diff --git a/jupyterhub/images/datastack-notebook/setup_spark.py b/jupyterhub/images/datastack-notebook/setup_spark.py new file mode 100755 index 0000000..c5b7643 --- /dev/null +++ b/jupyterhub/images/datastack-notebook/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. + +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + "root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + 
arg_parser.add_argument("--hadoop-version", required=True) + arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + configure_spark( + spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) + ) diff --git a/jupyterhub/jupyterhub-values.gomplate.yaml b/jupyterhub/jupyterhub-values.gomplate.yaml new file mode 100644 index 0000000..b312ee7 --- /dev/null +++ b/jupyterhub/jupyterhub-values.gomplate.yaml @@ -0,0 +1,155 @@ +hub: + config: + JupyterHub: + authenticator_class: generic-oauth + admin_access: false + + Authenticator: + enable_auth_state: true + allow_all: true # allow all Keycloak users + + GenericOAuthenticator: + client_id: {{ .Env.JUPYTERHUB_OIDC_CLIENT_ID }} + oauth_callback_url: "https://{{ .Env.JUPYTERHUB_HOST }}/hub/oauth_callback" + authorize_url: "https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/auth" + token_url: "https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/token" + userdata_url: "https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/userinfo" + login_service: keycloak + # username_claim: email + username_claim: preferred_username + + OAuthenticator: + scope: + - openid + - profile + - email + + # db: + # pvc: + # storageClassName: longhorn + + podSecurityContext: + fsGroup: {{ .Env.JUPYTER_FSGID }} + +singleuser: + storage: + {{ if env.Getenv "PVC_NAME" -}} + type: static + static: + pvcName: {{ .Env.PVC_NAME }} + {{ else -}} + type: dynamic + dynamic: + storageClass: longhorn + storageAccessModes: + - ReadWriteOnce + {{ end -}} + capacity: 10Gi + networkPolicy: + egress: + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: chroma + ports: + - port: 8000 + protocol: TCP + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: qdrant + ports: + - port: 6333 + protocol: TCP + - port: 6334 + protocol: TCP + - port: 6335 + protocol: TCP + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: litellm + ports: + - port: 4000 + protocol: TCP + - to: + - ipBlock: + cidr: 0.0.0.0/0 + ports: + - port: 443 + protocol: TCP + domains: + - '*.shds.dev' + + image: + pullPolicy: IfNotPresent + + profileList: + # https://quay.io/repository/jupyter/pyspark-notebook + {{- if eq .Env.JUPYTER_PROFILE_MINIMAL_ENABLED "true" }} + - display_name: "Minimal Jupyter Notebook Stack" + description: "Minimal Jupyter Notebook Stack" + kubespawner_override: + image: quay.io/jupyter/minimal-notebook + {{- end }} + {{ if eq .Env.JUPYTER_PROFILE_BASE_ENABLED "true" }} + - display_name: "Base Jupyter Notebook Stack" + description: "Base Jupyter Notebook Stack" + kubespawner_override: + image: quay.io/jupyter/base-notebook + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_DATASCIENCE_ENABLED "true" }} + - display_name: "Jupyter Notebook Data Science Stack" + description: "Jupyter Notebook Data Science Stack" + kubespawner_override: + image: quay.io/jupyter/datascience-notebook + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_PYSPARK_ENABLED "true" }} + - display_name: "Jupyter Notebook Python, Spark Stack" + description: "Jupyter 
Notebook Python, Spark Stack" + kubespawner_override: + image: quay.io/jupyter/pyspark-notebook + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_PYTORCH_ENABLED "true" }} + - display_name: "Jupyter Notebook PyTorch Deep Learning Stack" + description: "Jupyter Notebook PyTorch Deep Learning Stack" + kubespawner_override: + image: quay.io/jupyter/pytorch-notebook + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_TENSORFLOW_ENABLED "true" }} + - display_name: "Jupyter Notebook TensorFlow Deep Learning Stack" + description: "Jupyter Notebook TensorFlow Deep Learning Stack" + kubespawner_override: + image: quay.io/jupyter/tensorflow-notebook + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_BUUN_STACK_ENABLED "true" }} + - display_name: "Buun-stack" + description: "Jupyter Notebook with buun-stack" + kubespawner_override: + image: "{{ .Env.IMAGE_REGISTRY }}/{{ .Env.KERNEL_IMAGE_BUUN_STACK_REPOSITORY }}:{{ .Env.JUPYTER_PYTHON_KERNEL_TAG }}" + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED "true" }} + - display_name: "Buun-stack with CUDA" + description: "Jupyter Notebook with buun-stack and CUDA support" + kubespawner_override: + image: "{{ .Env.IMAGE_REGISTRY }}/{{ .Env.KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY }}:{{ .Env.JUPYTER_PYTHON_KERNEL_TAG }}" + # resources: + # requests: + # nvidia.com/gpu: "1" + {{- end }} + +imagePullSecrets: + - name: regcred + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + ingressClassName: traefik + hosts: + - {{ .Env.JUPYTERHUB_HOST }} + pathType: Prefix + tls: + - hosts: + - {{ .Env.JUPYTERHUB_HOST }} diff --git a/jupyterhub/justfile b/jupyterhub/justfile new file mode 100644 index 0000000..723b812 --- /dev/null +++ b/jupyterhub/justfile @@ -0,0 +1,150 @@ +set fallback := true + +export JUPYTERHUB_NAMESPACE := env("JUPYTERHUB_NAMESPACE", "jupyter") +export JUPYTERHUB_CHART_VERSION := env("JUPYTERHUB_CHART_VERSION", "4.2.0") +export JUPYTERHUB_OIDC_CLIENT_ID := env("JUPYTERHUB_OIDC_CLIENT_ID", "jupyterhub") +export JUPYTERHUB_ENABLE_NFS_PV := env("JUPYTERHUB_ENABLE_NFS_PV", "") +export JUPYTER_PYTHON_KERNEL_TAG := env("JUPYTER_PYTHON_KERNEL_TAG", "python-3.12-1") +export KERNEL_IMAGE_BUUN_STACK_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_REPOSITORY", "buun-stack-notebook") +export KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY", "buun-stack-cuda-notebook") +export JUPYTER_PROFILE_MINIMAL_ENABLED := env("JUPYTER_PROFILE_MINIMAL_ENABLED", "false") +export JUPYTER_PROFILE_BASE_ENABLED := env("JUPYTER_PROFILE_BASE_ENABLED", "false") +export JUPYTER_PROFILE_DATASCIENCE_ENABLED := env("JUPYTER_PROFILE_DATASCIENCE_ENABLED", "true") +export JUPYTER_PROFILE_PYSPARK_ENABLED := env("JUPYTER_PROFILE_PYSPARK_ENABLED", "false") +export JUPYTER_PROFILE_PYTORCH_ENABLED := env("JUPYTER_PROFILE_PYTORCH_ENABLED", "false") +export JUPYTER_PROFILE_TENSORFLOW_ENABLED := env("JUPYTER_PROFILE_TENSORFLOW_ENABLED", "false") +export JUPYTER_PROFILE_BUUN_STACK_ENABLED := env("JUPYTER_PROFILE_BUUN_STACK_ENABLED", "false") +export JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED := env("JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED", "false") +export IMAGE_REGISTRY := env("IMAGE_REGISTRY", "localhost:30500") +export KEYCLOAK_REALM := env("KEYCLOAK_REALM", "buunstack") +export LONGHORN_NAMESPACE := env("LONGHORN_NAMESPACE", "longhorn") + +[private] +default: + @just --list --unsorted --list-submodules + +# Add Helm repository +add-helm-repo: + helm repo add 
jupyterhub https://jupyterhub.github.io/helm-chart + helm repo update + +# Remove Helm repository +remove-helm-repo: + helm repo remove jupyterhub + +# Create JupyterHub namespace +create-namespace: + kubectl get namespace ${JUPYTERHUB_NAMESPACE} &>/dev/null || \ + kubectl create namespace ${JUPYTERHUB_NAMESPACE} + +# Delete JupyterHub namespace +delete-namespace: + kubectl delete namespace ${JUPYTERHUB_NAMESPACE} --ignore-not-found + +# Install JupyterHub +install: + #!/bin/bash + set -euo pipefail + export JUPYTERHUB_HOST=${JUPYTERHUB_HOST:-} + while [ -z "${JUPYTERHUB_HOST}" ]; do + JUPYTERHUB_HOST=$( + gum input --prompt="JupyterHub host (FQDN): " --width=100 \ + --placeholder="e.g., jupyter.example.com" + ) + done + just create-namespace + # just k8s::copy-regcred ${JUPYTERHUB_NAMESPACE} + just keycloak::create-client ${KEYCLOAK_REALM} ${JUPYTERHUB_OIDC_CLIENT_ID} \ + "https://${JUPYTERHUB_HOST}/hub/oauth_callback" + # just vault::create-jupyter-role + just add-helm-repo + export JUPYTERHUB_OIDC_CLIENT_ID=${JUPYTERHUB_OIDC_CLIENT_ID} + export KEYCLOAK_REALM=${KEYCLOAK_REALM} + export JUPYTER_PYTHON_KERNEL_TAG=${JUPYTER_PYTHON_KERNEL_TAG} + export JUPYTER_FSGID=${JUPYTER_FSGID:-100} + export PVC_NAME="" + if [ -z "${JUPYTERHUB_ENABLE_NFS_PV}" ]; then + if gum confirm "Are you going to use NFS PV?"; then + JUPYTERHUB_ENABLE_NFS_PV=true + else + JUPYTERHUB_ENABLE_NFS_PV=false + fi + fi + if [ "${JUPYTERHUB_ENABLE_NFS_PV}" = "true" ]; then + if ! helm status longhorn -n ${LONGHORN_NAMESPACE} &>/dev/null; then + echo "Longhorn is not installed. Please install Longhorn first." >&2 + exit 1 + fi + export JUPYTER_NFS_IP=${JUPYTER_NFS_IP:-} + while [ -z "${JUPYTER_NFS_IP}" ]; do + JUPYTER_NFS_IP=$( + gum input --prompt="NFS server IP address: " --width=100 \ + --placeholder="e.g., 192.168.10.1" + ) + done + export JUPYTER_NFS_PATH=${JUPYTER_NFS_PATH:-} + while [ -z "${JUPYTER_NFS_PATH}" ]; do + JUPYTER_NFS_PATH=$( + gum input --prompt="NFS server export path: " --width=100 \ + --placeholder="e.g., /volume1/drive1/jupyter" + ) + done + PVC_NAME=jupyter-nfs-pvc + if ! 
kubectl get pv jupyter-nfs-pv &>/dev/null; then + gomplate -f nfs-pv.gomplate.yaml | kubectl apply -f - + fi + kubectl apply -n ${JUPYTERHUB_NAMESPACE} -f nfs-pvc.yaml + fi + # https://z2jh.jupyter.org/en/stable/ + gomplate -f jupyterhub-values.gomplate.yaml -o jupyterhub-values.yaml + helm upgrade --cleanup-on-fail --install jupyterhub jupyterhub/jupyterhub \ + --version ${JUPYTERHUB_CHART_VERSION} -n ${JUPYTERHUB_NAMESPACE} \ + --timeout=20m -f jupyterhub-values.yaml + # wait deployments manually because `helm upgrade --wait` does not work for JupyterHub + just k8s::wait-deployments-ready ${JUPYTERHUB_NAMESPACE} hub proxy + +# Uninstall JupyterHub +uninstall: + #!/bin/bash + set -euo pipefail + helm uninstall jupyterhub -n ${JUPYTERHUB_NAMESPACE} --wait --ignore-not-found + kubectl delete pods -n ${JUPYTERHUB_NAMESPACE} -l app.kubernetes.io/component=singleuser-server + kubectl delete -n ${JUPYTERHUB_NAMESPACE} pvc jupyter-nfs-pvc --ignore-not-found + if kubectl get pv jupyter-nfs-pv &>/dev/null; then + kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}' + fi + +# Delete JupyterHub PV +delete-pv: + #!/bin/bash + set -euo pipefail + if kubectl get pv jupyter-nfs-pv &>/dev/null; then + kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}' + kubectl delete pv jupyter-nfs-pv + fi + +# Build Jupyter notebook kernel images +build-kernel-images: + #!/bin/bash + set -euo pipefail + ( + cd ./images/datastack-notebook + docker build -t \ + ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG} \ + --build-arg spark_version="3.5.4" \ + --build-arg spark_download_url="https://archive.apache.org/dist/spark/" \ + . + ) + ( + cd ./images/datastack-cuda-notebook + docker build -t \ + ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG} \ + --build-arg spark_version="3.5.4" \ + --build-arg spark_download_url="https://archive.apache.org/dist/spark/" \ + . + ) + +# Push Jupyter notebook kernel images +push-kernel-images: build-kernel-images + docker push ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG} + docker push ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG} diff --git a/jupyterhub/nfs-pv.gomplate.yaml b/jupyterhub/nfs-pv.gomplate.yaml new file mode 100644 index 0000000..a02a0db --- /dev/null +++ b/jupyterhub/nfs-pv.gomplate.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: jupyter-nfs-pv +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: longhorn + volumeMode: Filesystem + nfs: + server: {{ .Env.JUPYTER_NFS_IP }} + path: {{ .Env.JUPYTER_NFS_PATH }} diff --git a/jupyterhub/nfs-pvc.yaml b/jupyterhub/nfs-pvc.yaml new file mode 100644 index 0000000..63494c1 --- /dev/null +++ b/jupyterhub/nfs-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: jupyter-nfs-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + volumeName: jupyter-nfs-pv diff --git a/justfile b/justfile index 73cf7e3..1ddf93b 100644 --- a/justfile +++ b/justfile @@ -8,6 +8,7 @@ default: mod env mod keycloak +mod jupyterhub mod k8s mod longhorn mod postgres diff --git a/k8s/justfile b/k8s/justfile index 48c5512..27a9819 100644 --- a/k8s/justfile +++ b/k8s/justfile @@ -271,3 +271,29 @@ configure-registry: echo "Restarting k3s to apply registry configuration..." 
ssh "${LOCAL_K8S_HOST}" "sudo systemctl restart k3s" echo "✓ Registry configuration applied" + +[positional-arguments] +wait-deployments-ready *args: + #!/bin/bash + set -euo pipefail + namespace="$1" + shift + deployments=("$@") + check_ready() { + for deployment in "${deployments[@]}"; do + ready=$(kubectl get -n ${namespace} deployment "${deployment}" \ + -o jsonpath="{.status.readyReplicas}" 2>/dev/null || true) + replicas=$(kubectl get -n ${namespace} deployment "${deployment}" \ + -o jsonpath="{.status.replicas}" 2>/dev/null || true) + if [[ "${ready}" != "${replicas}" || -z "${ready}" ]]; then + return 0 + fi + done + return 1 + } + echo -n "Waiting for deployments $@ to be ready..." + while check_ready; do + echo -n "." + sleep 2 + done + echo "ok"