From 00009ab19255901246a9cec985bb7b3899d3226a Mon Sep 17 00:00:00 2001 From: Masaki Yatsu Date: Fri, 29 Aug 2025 17:12:31 +0900 Subject: [PATCH] feat(jupyterhub): add JupyterHub --- jupyterhub/.gitignore | 1 + .../datastack-cuda-notebook/.dockerignore | 1 + .../images/datastack-cuda-notebook/Dockerfile | 159 ++++++++++++++++++ .../images/datastack-cuda-notebook/README.md | 5 + .../ipython_kernel_config.py | 13 ++ .../datastack-cuda-notebook/setup_spark.py | 131 +++++++++++++++ .../images/datastack-notebook/.dockerignore | 1 + .../images/datastack-notebook/Dockerfile | 158 +++++++++++++++++ .../images/datastack-notebook/README.md | 5 + .../ipython_kernel_config.py | 13 ++ .../images/datastack-notebook/setup_spark.py | 131 +++++++++++++++ jupyterhub/jupyterhub-values.gomplate.yaml | 155 +++++++++++++++++ jupyterhub/justfile | 150 +++++++++++++++++ jupyterhub/nfs-pv.gomplate.yaml | 15 ++ jupyterhub/nfs-pvc.yaml | 11 ++ justfile | 1 + k8s/justfile | 26 +++ 17 files changed, 976 insertions(+) create mode 100644 jupyterhub/.gitignore create mode 100644 jupyterhub/images/datastack-cuda-notebook/.dockerignore create mode 100644 jupyterhub/images/datastack-cuda-notebook/Dockerfile create mode 100644 jupyterhub/images/datastack-cuda-notebook/README.md create mode 100644 jupyterhub/images/datastack-cuda-notebook/ipython_kernel_config.py create mode 100755 jupyterhub/images/datastack-cuda-notebook/setup_spark.py create mode 100644 jupyterhub/images/datastack-notebook/.dockerignore create mode 100644 jupyterhub/images/datastack-notebook/Dockerfile create mode 100644 jupyterhub/images/datastack-notebook/README.md create mode 100644 jupyterhub/images/datastack-notebook/ipython_kernel_config.py create mode 100755 jupyterhub/images/datastack-notebook/setup_spark.py create mode 100644 jupyterhub/jupyterhub-values.gomplate.yaml create mode 100644 jupyterhub/justfile create mode 100644 jupyterhub/nfs-pv.gomplate.yaml create mode 100644 jupyterhub/nfs-pvc.yaml diff --git a/jupyterhub/.gitignore b/jupyterhub/.gitignore new file mode 100644 index 0000000..d2b1ebf --- /dev/null +++ b/jupyterhub/.gitignore @@ -0,0 +1 @@ +jupyterhub-values.yaml diff --git a/jupyterhub/images/datastack-cuda-notebook/.dockerignore b/jupyterhub/images/datastack-cuda-notebook/.dockerignore new file mode 100644 index 0000000..b43bf86 --- /dev/null +++ b/jupyterhub/images/datastack-cuda-notebook/.dockerignore @@ -0,0 +1 @@ +README.md diff --git a/jupyterhub/images/datastack-cuda-notebook/Dockerfile b/jupyterhub/images/datastack-cuda-notebook/Dockerfile new file mode 100644 index 0000000..dd949f8 --- /dev/null +++ b/jupyterhub/images/datastack-cuda-notebook/Dockerfile @@ -0,0 +1,159 @@ +# Merge pyspark-notebook into pytorch-notebook:cuda12-python-3.12 +# https://github.com/jupyter/docker-stacks/tree/main/images/pytorch-notebook +# https://github.com/jupyter/docker-stacks/tree/main/images/pyspark-notebook +# https://github.com/jupyter/docker-stacks/blob/main/images/pyspark-notebook/setup_spark.py + +FROM quay.io/jupyter/pytorch-notebook:x86_64-cuda12-python-3.12.10 + +# Fix: https://github.com/hadolint/hadolint/wiki/DL4006 +# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014 +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +USER root + +# Spark dependencies +# Default values can be overridden at build time +# (ARGS are in lowercase to distinguish them from ENV) +ARG openjdk_version="17" + +RUN apt-get update --yes && \ + apt-get install --yes --no-install-recommends \ + bash jq \ + "openjdk-${openjdk_version}-jre-headless" \ + 
ca-certificates-java && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# If spark_version is not set, the latest Spark will be installed
+ARG spark_version
+ARG hadoop_version="3"
+# If scala_version is not set, Spark without Scala will be installed
+ARG scala_version
+# URL to use for Spark downloads
+# You need to use the https://archive.apache.org/dist/spark/ site if you want to download old Spark versions,
+# but it seems to be slower, which is why we use the recommended download site
+ARG spark_download_url="https://dlcdn.apache.org/spark/"
+
+ENV SPARK_HOME=/usr/local/spark
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
+ENV JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"
+ENV PATH="${PATH}:${SPARK_HOME}/bin:${JAVA_HOME}/bin"
+
+COPY setup_spark.py /opt/setup-scripts/
+
+# Setup Spark
+RUN /opt/setup-scripts/setup_spark.py \
+    --spark-version="${spark_version}" \
+    --hadoop-version="${hadoop_version}" \
+    --scala-version="${scala_version}" \
+    --spark-download-url="${spark_download_url}"
+
+# Configure IPython system-wide
+COPY ipython_kernel_config.py "/etc/ipython/"
+RUN fix-permissions "/etc/ipython/"
+
+USER ${NB_UID}
+
+# Remove torch to avoid a `critical libmamba filesystem error` when running `mamba install`
+RUN pip uninstall -y \
+    'torch' \
+    'torchaudio' \
+    'torchvision'
+
+# Install the Python data/AI stack via mamba (pyarrow, pandas, LangChain, etc.)
+# NOTE: It's important to ensure compatibility between Pandas versions.
+# The pandas version in this Dockerfile should match the version
+# on which the Pandas API for Spark is built.
+# To find the right version:
+# 1. Check out the Spark branch you are on.
+# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
+# For example, the kernel-image builds in jupyterhub/justfile target Spark 3.5.4,
+# so the `pandas=2.2.2` pin below should be re-checked against that branch's
+# `dev/infra/Dockerfile` whenever spark_version is bumped.
+RUN mamba install --yes \
+    'aif360' \
+    'airflow' \
+    'chromadb' \
+    'dalex' \
+    'dbt' \
+    'dlt' \
+    'duckdb' \
+    'faiss' \
+    'gitpython' \
+    'grpcio-status' \
+    'grpcio' \
+    'keras' \
+    'langchain' \
+    'langchain-ai21' \
+    'langchain-anthropic' \
+    'langchain-aws' \
+    'langchain-azure-dynamic-sessions' \
+    'langchain-chroma' \
+    'langchain-community' \
+    'langchain-experimental' \
+    'langchain-fireworks' \
+    'langchain-google-genai' \
+    'langchain-groq' \
+    'langchain-mistralai' \
+    'langchain-mongodb' \
+    'langchain-nomic' \
+    'langchain-openai' \
+    'langchain-prompty' \
+    'langchain-qdrant' \
+    'langchain-robocorp' \
+    'langchain-text-splitters' \
+    'langchain-together' \
+    'langchain-voyageai' \
+    'langgraph' \
+    'langgraph-checkpoint' \
+    'langgraph-sdk' \
+    'langsmith' \
+    'litellm' \
+    'nest-asyncio' \
+    'openai' \
+    'openai-agents' \
+    'pandas=2.2.2' \
+    'pandas-profiling' \
+    'pillow' \
+    'polars' \
+    'pyarrow' \
+    'qdrant-client' \
+    'rapidfuzz' \
+    'tensorflow' \
+    'transformers' \
+    'unstructured' \
+    && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==4.0.0.dev2
+# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==3.5.4
+
+RUN pip install \
+    agno \
+    fastembed \
+    feature-engine \
+    jupyter-ai \
+    jupyter-ai-magics[all] \
+    kreuzberg \
+    langchain-huggingface \
+    langchain-perplexity \
+    langfuse \
+    pydantic-ai \
+    ragas \
+    smolagents \
+    tavily-python \
+    tweet-preprocessor
+
+# Install PyTorch with pip (https://pytorch.org/get-started/locally/)
+# langchain-openai must be updated to avoid a pydantic v2 error
+# https://github.com/run-llama/llama_index/issues/16540
+# hadolint 
ignore=DL3013 +RUN pip install --no-cache-dir --extra-index-url=https://pypi.nvidia.com --index-url 'https://download.pytorch.org/whl/cu124' \ + 'torch' \ + 'torchaudio' \ + 'torchvision' && \ + pip install --upgrade langchain-openai && \ + fix-permissions "${CONDA_DIR}" && \ + fix-permissions "/home/${NB_USER}" + + +WORKDIR "${HOME}" +EXPOSE 4040 diff --git a/jupyterhub/images/datastack-cuda-notebook/README.md b/jupyterhub/images/datastack-cuda-notebook/README.md new file mode 100644 index 0000000..1e1e3a3 --- /dev/null +++ b/jupyterhub/images/datastack-cuda-notebook/README.md @@ -0,0 +1,5 @@ +# Jupyter Notebook Image + +Custom Jupyter notebook kernel image derived from the official one: + +[jupyter/docker-stacks: Ready-to-run Docker images containing Jupyter applications](https://github.com/jupyter/docker-stacks) diff --git a/jupyterhub/images/datastack-cuda-notebook/ipython_kernel_config.py b/jupyterhub/images/datastack-cuda-notebook/ipython_kernel_config.py new file mode 100644 index 0000000..921e6fa --- /dev/null +++ b/jupyterhub/images/datastack-cuda-notebook/ipython_kernel_config.py @@ -0,0 +1,13 @@ +# Configuration file for ipython-kernel. +# See + +# With IPython >= 6.0.0, all outputs to stdout/stderr are captured. +# It is the case for subprocesses and output of compiled libraries like Spark. +# Those logs now both head to notebook logs and in notebooks outputs. +# Logs are particularly verbose with Spark, that is why we turn them off through this flag. +# + +# Attempt to capture and forward low-level output, e.g. produced by Extension libraries. +# Default: True +# type:ignore +c.IPKernelApp.capture_fd_output = False # noqa: F821 diff --git a/jupyterhub/images/datastack-cuda-notebook/setup_spark.py b/jupyterhub/images/datastack-cuda-notebook/setup_spark.py new file mode 100755 index 0000000..c5b7643 --- /dev/null +++ b/jupyterhub/images/datastack-cuda-notebook/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. 
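+#
+# Example invocation (illustrative; these values mirror the kernel-image build
+# in jupyterhub/justfile):
+#   ./setup_spark.py --spark-version="3.5.4" --hadoop-version="3" \
+#       --scala-version="" --spark-download-url="https://archive.apache.org/dist/spark/"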
+ +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + "root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + arg_parser.add_argument("--hadoop-version", required=True) + arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + 
configure_spark(
+        spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"])
+    )
diff --git a/jupyterhub/images/datastack-notebook/.dockerignore b/jupyterhub/images/datastack-notebook/.dockerignore
new file mode 100644
index 0000000..b43bf86
--- /dev/null
+++ b/jupyterhub/images/datastack-notebook/.dockerignore
@@ -0,0 +1 @@
+README.md
diff --git a/jupyterhub/images/datastack-notebook/Dockerfile b/jupyterhub/images/datastack-notebook/Dockerfile
new file mode 100644
index 0000000..27ee415
--- /dev/null
+++ b/jupyterhub/images/datastack-notebook/Dockerfile
@@ -0,0 +1,158 @@
+# Merge pyspark-notebook into pytorch-notebook:python-3.12
+# https://github.com/jupyter/docker-stacks/tree/main/images/pytorch-notebook
+# https://github.com/jupyter/docker-stacks/tree/main/images/pyspark-notebook
+# https://github.com/jupyter/docker-stacks/blob/main/images/pyspark-notebook/setup_spark.py
+
+FROM quay.io/jupyter/pytorch-notebook:python-3.12
+
+# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
+# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+# Spark dependencies
+# Default values can be overridden at build time
+# (ARGS are in lowercase to distinguish them from ENV)
+ARG openjdk_version="17"
+
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    bash jq \
+    "openjdk-${openjdk_version}-jre-headless" \
+    ca-certificates-java && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# If spark_version is not set, the latest Spark will be installed
+ARG spark_version
+ARG hadoop_version="3"
+# If scala_version is not set, Spark without Scala will be installed
+ARG scala_version
+# URL to use for Spark downloads
+# You need to use the https://archive.apache.org/dist/spark/ site if you want to download old Spark versions,
+# but it seems to be slower, which is why we use the recommended download site
+ARG spark_download_url="https://dlcdn.apache.org/spark/"
+
+ENV SPARK_HOME=/usr/local/spark
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
+ENV JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"
+ENV PATH="${PATH}:${SPARK_HOME}/bin:${JAVA_HOME}/bin"
+
+COPY setup_spark.py /opt/setup-scripts/
+
+# Setup Spark
+RUN /opt/setup-scripts/setup_spark.py \
+    --spark-version="${spark_version}" \
+    --hadoop-version="${hadoop_version}" \
+    --scala-version="${scala_version}" \
+    --spark-download-url="${spark_download_url}"
+
+# Configure IPython system-wide
+COPY ipython_kernel_config.py "/etc/ipython/"
+RUN fix-permissions "/etc/ipython/"
+
+USER ${NB_UID}
+
+# Remove torch to avoid a `critical libmamba filesystem error` when running `mamba install`
+RUN pip uninstall -y \
+    'torch' \
+    'torchaudio' \
+    'torchvision'
+
+# Install the Python data/AI stack via mamba (pyarrow, pandas, LangChain, etc.)
+# NOTE: It's important to ensure compatibility between Pandas versions.
+# The pandas version in this Dockerfile should match the version
+# on which the Pandas API for Spark is built.
+# To find the right version:
+# 1. Check out the Spark branch you are on.
+# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
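+# For example, the kernel-image builds in jupyterhub/justfile target Spark 3.5.4,
+# so the `pandas=2.2.2` pin below should be re-checked against that branch's
+# `dev/infra/Dockerfile` whenever spark_version is bumped.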
+RUN mamba install --yes \
+    'aif360' \
+    'airflow' \
+    'chromadb' \
+    'dalex' \
+    'dbt' \
+    'dlt' \
+    'duckdb' \
+    'faiss' \
+    'gitpython' \
+    'grpcio-status' \
+    'grpcio' \
+    'keras' \
+    'langchain' \
+    'langchain-ai21' \
+    'langchain-anthropic' \
+    'langchain-aws' \
+    'langchain-azure-dynamic-sessions' \
+    'langchain-chroma' \
+    'langchain-community' \
+    'langchain-experimental' \
+    'langchain-fireworks' \
+    'langchain-google-genai' \
+    'langchain-groq' \
+    'langchain-mistralai' \
+    'langchain-mongodb' \
+    'langchain-nomic' \
+    'langchain-openai' \
+    'langchain-prompty' \
+    'langchain-qdrant' \
+    'langchain-robocorp' \
+    'langchain-text-splitters' \
+    'langchain-together' \
+    'langchain-voyageai' \
+    'langgraph' \
+    'langgraph-checkpoint' \
+    'langgraph-sdk' \
+    'langsmith' \
+    'litellm' \
+    'nest-asyncio' \
+    'openai' \
+    'openai-agents' \
+    'pandas=2.2.2' \
+    'pandas-profiling' \
+    'pillow' \
+    'polars' \
+    'pyarrow' \
+    'qdrant-client' \
+    'rapidfuzz' \
+    'tensorflow' \
+    'transformers' \
+    'unstructured' \
+    && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==4.0.0.dev2
+# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==3.5.4
+
+RUN pip install \
+    agno \
+    fastembed \
+    feature-engine \
+    jupyter-ai \
+    jupyter-ai-magics[all] \
+    kreuzberg \
+    langfuse \
+    langchain-huggingface \
+    langchain-perplexity \
+    pydantic-ai \
+    ragas \
+    smolagents \
+    tavily-python \
+    tweet-preprocessor
+
+# Install PyTorch with pip (https://pytorch.org/get-started/locally/)
+# langchain-openai must be updated to avoid a pydantic v2 error
+# https://github.com/run-llama/llama_index/issues/16540
+# hadolint ignore=DL3013
+RUN pip install --no-cache-dir --index-url 'https://download.pytorch.org/whl/cpu' \
+    'torch' \
+    'torchaudio' \
+    'torchvision' && \
+    pip install --upgrade langchain-openai && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+WORKDIR "${HOME}"
+EXPOSE 4040
diff --git a/jupyterhub/images/datastack-notebook/README.md b/jupyterhub/images/datastack-notebook/README.md
new file mode 100644
index 0000000..1e1e3a3
--- /dev/null
+++ b/jupyterhub/images/datastack-notebook/README.md
@@ -0,0 +1,5 @@
+# Jupyter Notebook Image
+
+Custom Jupyter notebook kernel image derived from the official one:
+
+[jupyter/docker-stacks: Ready-to-run Docker images containing Jupyter applications](https://github.com/jupyter/docker-stacks)
diff --git a/jupyterhub/images/datastack-notebook/ipython_kernel_config.py b/jupyterhub/images/datastack-notebook/ipython_kernel_config.py
new file mode 100644
index 0000000..921e6fa
--- /dev/null
+++ b/jupyterhub/images/datastack-notebook/ipython_kernel_config.py
@@ -0,0 +1,13 @@
+# Configuration file for ipython-kernel.
+# See
+
+# With IPython >= 6.0.0, all outputs to stdout/stderr are captured.
+# It is the case for subprocesses and output of compiled libraries like Spark.
+# Those logs now both head to notebook logs and in notebooks outputs.
+# Logs are particularly verbose with Spark, that is why we turn them off through this flag.
+#
+
+# Attempt to capture and forward low-level output, e.g. produced by Extension libraries.
+# Default: True +# type:ignore +c.IPKernelApp.capture_fd_output = False # noqa: F821 diff --git a/jupyterhub/images/datastack-notebook/setup_spark.py b/jupyterhub/images/datastack-notebook/setup_spark.py new file mode 100755 index 0000000..c5b7643 --- /dev/null +++ b/jupyterhub/images/datastack-notebook/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. + +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + "root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + 
arg_parser.add_argument("--hadoop-version", required=True) + arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + configure_spark( + spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) + ) diff --git a/jupyterhub/jupyterhub-values.gomplate.yaml b/jupyterhub/jupyterhub-values.gomplate.yaml new file mode 100644 index 0000000..b312ee7 --- /dev/null +++ b/jupyterhub/jupyterhub-values.gomplate.yaml @@ -0,0 +1,155 @@ +hub: + config: + JupyterHub: + authenticator_class: generic-oauth + admin_access: false + + Authenticator: + enable_auth_state: true + allow_all: true # allow all Keycloak users + + GenericOAuthenticator: + client_id: {{ .Env.JUPYTERHUB_OIDC_CLIENT_ID }} + oauth_callback_url: "https://{{ .Env.JUPYTERHUB_HOST }}/hub/oauth_callback" + authorize_url: "https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/auth" + token_url: "https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/token" + userdata_url: "https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/userinfo" + login_service: keycloak + # username_claim: email + username_claim: preferred_username + + OAuthenticator: + scope: + - openid + - profile + - email + + # db: + # pvc: + # storageClassName: longhorn + + podSecurityContext: + fsGroup: {{ .Env.JUPYTER_FSGID }} + +singleuser: + storage: + {{ if env.Getenv "PVC_NAME" -}} + type: static + static: + pvcName: {{ .Env.PVC_NAME }} + {{ else -}} + type: dynamic + dynamic: + storageClass: longhorn + storageAccessModes: + - ReadWriteOnce + {{ end -}} + capacity: 10Gi + networkPolicy: + egress: + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: chroma + ports: + - port: 8000 + protocol: TCP + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: qdrant + ports: + - port: 6333 + protocol: TCP + - port: 6334 + protocol: TCP + - port: 6335 + protocol: TCP + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: litellm + ports: + - port: 4000 + protocol: TCP + - to: + - ipBlock: + cidr: 0.0.0.0/0 + ports: + - port: 443 + protocol: TCP + domains: + - '*.shds.dev' + + image: + pullPolicy: IfNotPresent + + profileList: + # https://quay.io/repository/jupyter/pyspark-notebook + {{- if eq .Env.JUPYTER_PROFILE_MINIMAL_ENABLED "true" }} + - display_name: "Minimal Jupyter Notebook Stack" + description: "Minimal Jupyter Notebook Stack" + kubespawner_override: + image: quay.io/jupyter/minimal-notebook + {{- end }} + {{ if eq .Env.JUPYTER_PROFILE_BASE_ENABLED "true" }} + - display_name: "Base Jupyter Notebook Stack" + description: "Base Jupyter Notebook Stack" + kubespawner_override: + image: quay.io/jupyter/base-notebook + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_DATASCIENCE_ENABLED "true" }} + - display_name: "Jupyter Notebook Data Science Stack" + description: "Jupyter Notebook Data Science Stack" + kubespawner_override: + image: quay.io/jupyter/datascience-notebook + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_PYSPARK_ENABLED "true" }} + - display_name: "Jupyter Notebook Python, Spark Stack" + description: "Jupyter 
Notebook Python, Spark Stack" + kubespawner_override: + image: quay.io/jupyter/pyspark-notebook + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_PYTORCH_ENABLED "true" }} + - display_name: "Jupyter Notebook PyTorch Deep Learning Stack" + description: "Jupyter Notebook PyTorch Deep Learning Stack" + kubespawner_override: + image: quay.io/jupyter/pytorch-notebook + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_TENSORFLOW_ENABLED "true" }} + - display_name: "Jupyter Notebook TensorFlow Deep Learning Stack" + description: "Jupyter Notebook TensorFlow Deep Learning Stack" + kubespawner_override: + image: quay.io/jupyter/tensorflow-notebook + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_BUUN_STACK_ENABLED "true" }} + - display_name: "Buun-stack" + description: "Jupyter Notebook with buun-stack" + kubespawner_override: + image: "{{ .Env.IMAGE_REGISTRY }}/{{ .Env.KERNEL_IMAGE_BUUN_STACK_REPOSITORY }}:{{ .Env.JUPYTER_PYTHON_KERNEL_TAG }}" + {{- end }} + {{- if eq .Env.JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED "true" }} + - display_name: "Buun-stack with CUDA" + description: "Jupyter Notebook with buun-stack and CUDA support" + kubespawner_override: + image: "{{ .Env.IMAGE_REGISTRY }}/{{ .Env.KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY }}:{{ .Env.JUPYTER_PYTHON_KERNEL_TAG }}" + # resources: + # requests: + # nvidia.com/gpu: "1" + {{- end }} + +imagePullSecrets: + - name: regcred + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + ingressClassName: traefik + hosts: + - {{ .Env.JUPYTERHUB_HOST }} + pathType: Prefix + tls: + - hosts: + - {{ .Env.JUPYTERHUB_HOST }} diff --git a/jupyterhub/justfile b/jupyterhub/justfile new file mode 100644 index 0000000..723b812 --- /dev/null +++ b/jupyterhub/justfile @@ -0,0 +1,150 @@ +set fallback := true + +export JUPYTERHUB_NAMESPACE := env("JUPYTERHUB_NAMESPACE", "jupyter") +export JUPYTERHUB_CHART_VERSION := env("JUPYTERHUB_CHART_VERSION", "4.2.0") +export JUPYTERHUB_OIDC_CLIENT_ID := env("JUPYTERHUB_OIDC_CLIENT_ID", "jupyterhub") +export JUPYTERHUB_ENABLE_NFS_PV := env("JUPYTERHUB_ENABLE_NFS_PV", "") +export JUPYTER_PYTHON_KERNEL_TAG := env("JUPYTER_PYTHON_KERNEL_TAG", "python-3.12-1") +export KERNEL_IMAGE_BUUN_STACK_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_REPOSITORY", "buun-stack-notebook") +export KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY", "buun-stack-cuda-notebook") +export JUPYTER_PROFILE_MINIMAL_ENABLED := env("JUPYTER_PROFILE_MINIMAL_ENABLED", "false") +export JUPYTER_PROFILE_BASE_ENABLED := env("JUPYTER_PROFILE_BASE_ENABLED", "false") +export JUPYTER_PROFILE_DATASCIENCE_ENABLED := env("JUPYTER_PROFILE_DATASCIENCE_ENABLED", "true") +export JUPYTER_PROFILE_PYSPARK_ENABLED := env("JUPYTER_PROFILE_PYSPARK_ENABLED", "false") +export JUPYTER_PROFILE_PYTORCH_ENABLED := env("JUPYTER_PROFILE_PYTORCH_ENABLED", "false") +export JUPYTER_PROFILE_TENSORFLOW_ENABLED := env("JUPYTER_PROFILE_TENSORFLOW_ENABLED", "false") +export JUPYTER_PROFILE_BUUN_STACK_ENABLED := env("JUPYTER_PROFILE_BUUN_STACK_ENABLED", "false") +export JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED := env("JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED", "false") +export IMAGE_REGISTRY := env("IMAGE_REGISTRY", "localhost:30500") +export KEYCLOAK_REALM := env("KEYCLOAK_REALM", "buunstack") +export LONGHORN_NAMESPACE := env("LONGHORN_NAMESPACE", "longhorn") + +[private] +default: + @just --list --unsorted --list-submodules + +# Add Helm repository +add-helm-repo: + helm repo add 
jupyterhub https://jupyterhub.github.io/helm-chart + helm repo update + +# Remove Helm repository +remove-helm-repo: + helm repo remove jupyterhub + +# Create JupyterHub namespace +create-namespace: + kubectl get namespace ${JUPYTERHUB_NAMESPACE} &>/dev/null || \ + kubectl create namespace ${JUPYTERHUB_NAMESPACE} + +# Delete JupyterHub namespace +delete-namespace: + kubectl delete namespace ${JUPYTERHUB_NAMESPACE} --ignore-not-found + +# Install JupyterHub +install: + #!/bin/bash + set -euo pipefail + export JUPYTERHUB_HOST=${JUPYTERHUB_HOST:-} + while [ -z "${JUPYTERHUB_HOST}" ]; do + JUPYTERHUB_HOST=$( + gum input --prompt="JupyterHub host (FQDN): " --width=100 \ + --placeholder="e.g., jupyter.example.com" + ) + done + just create-namespace + # just k8s::copy-regcred ${JUPYTERHUB_NAMESPACE} + just keycloak::create-client ${KEYCLOAK_REALM} ${JUPYTERHUB_OIDC_CLIENT_ID} \ + "https://${JUPYTERHUB_HOST}/hub/oauth_callback" + # just vault::create-jupyter-role + just add-helm-repo + export JUPYTERHUB_OIDC_CLIENT_ID=${JUPYTERHUB_OIDC_CLIENT_ID} + export KEYCLOAK_REALM=${KEYCLOAK_REALM} + export JUPYTER_PYTHON_KERNEL_TAG=${JUPYTER_PYTHON_KERNEL_TAG} + export JUPYTER_FSGID=${JUPYTER_FSGID:-100} + export PVC_NAME="" + if [ -z "${JUPYTERHUB_ENABLE_NFS_PV}" ]; then + if gum confirm "Are you going to use NFS PV?"; then + JUPYTERHUB_ENABLE_NFS_PV=true + else + JUPYTERHUB_ENABLE_NFS_PV=false + fi + fi + if [ "${JUPYTERHUB_ENABLE_NFS_PV}" = "true" ]; then + if ! helm status longhorn -n ${LONGHORN_NAMESPACE} &>/dev/null; then + echo "Longhorn is not installed. Please install Longhorn first." >&2 + exit 1 + fi + export JUPYTER_NFS_IP=${JUPYTER_NFS_IP:-} + while [ -z "${JUPYTER_NFS_IP}" ]; do + JUPYTER_NFS_IP=$( + gum input --prompt="NFS server IP address: " --width=100 \ + --placeholder="e.g., 192.168.10.1" + ) + done + export JUPYTER_NFS_PATH=${JUPYTER_NFS_PATH:-} + while [ -z "${JUPYTER_NFS_PATH}" ]; do + JUPYTER_NFS_PATH=$( + gum input --prompt="NFS server export path: " --width=100 \ + --placeholder="e.g., /volume1/drive1/jupyter" + ) + done + PVC_NAME=jupyter-nfs-pvc + if ! 
kubectl get pv jupyter-nfs-pv &>/dev/null; then + gomplate -f nfs-pv.gomplate.yaml | kubectl apply -f - + fi + kubectl apply -n ${JUPYTERHUB_NAMESPACE} -f nfs-pvc.yaml + fi + # https://z2jh.jupyter.org/en/stable/ + gomplate -f jupyterhub-values.gomplate.yaml -o jupyterhub-values.yaml + helm upgrade --cleanup-on-fail --install jupyterhub jupyterhub/jupyterhub \ + --version ${JUPYTERHUB_CHART_VERSION} -n ${JUPYTERHUB_NAMESPACE} \ + --timeout=20m -f jupyterhub-values.yaml + # wait deployments manually because `helm upgrade --wait` does not work for JupyterHub + just k8s::wait-deployments-ready ${JUPYTERHUB_NAMESPACE} hub proxy + +# Uninstall JupyterHub +uninstall: + #!/bin/bash + set -euo pipefail + helm uninstall jupyterhub -n ${JUPYTERHUB_NAMESPACE} --wait --ignore-not-found + kubectl delete pods -n ${JUPYTERHUB_NAMESPACE} -l app.kubernetes.io/component=singleuser-server + kubectl delete -n ${JUPYTERHUB_NAMESPACE} pvc jupyter-nfs-pvc --ignore-not-found + if kubectl get pv jupyter-nfs-pv &>/dev/null; then + kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}' + fi + +# Delete JupyterHub PV +delete-pv: + #!/bin/bash + set -euo pipefail + if kubectl get pv jupyter-nfs-pv &>/dev/null; then + kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}' + kubectl delete pv jupyter-nfs-pv + fi + +# Build Jupyter notebook kernel images +build-kernel-images: + #!/bin/bash + set -euo pipefail + ( + cd ./images/datastack-notebook + docker build -t \ + ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG} \ + --build-arg spark_version="3.5.4" \ + --build-arg spark_download_url="https://archive.apache.org/dist/spark/" \ + . + ) + ( + cd ./images/datastack-cuda-notebook + docker build -t \ + ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG} \ + --build-arg spark_version="3.5.4" \ + --build-arg spark_download_url="https://archive.apache.org/dist/spark/" \ + . + ) + +# Push Jupyter notebook kernel images +push-kernel-images: build-kernel-images + docker push ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG} + docker push ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG} diff --git a/jupyterhub/nfs-pv.gomplate.yaml b/jupyterhub/nfs-pv.gomplate.yaml new file mode 100644 index 0000000..a02a0db --- /dev/null +++ b/jupyterhub/nfs-pv.gomplate.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: jupyter-nfs-pv +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: longhorn + volumeMode: Filesystem + nfs: + server: {{ .Env.JUPYTER_NFS_IP }} + path: {{ .Env.JUPYTER_NFS_PATH }} diff --git a/jupyterhub/nfs-pvc.yaml b/jupyterhub/nfs-pvc.yaml new file mode 100644 index 0000000..63494c1 --- /dev/null +++ b/jupyterhub/nfs-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: jupyter-nfs-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + volumeName: jupyter-nfs-pv diff --git a/justfile b/justfile index 73cf7e3..1ddf93b 100644 --- a/justfile +++ b/justfile @@ -8,6 +8,7 @@ default: mod env mod keycloak +mod jupyterhub mod k8s mod longhorn mod postgres diff --git a/k8s/justfile b/k8s/justfile index 48c5512..27a9819 100644 --- a/k8s/justfile +++ b/k8s/justfile @@ -271,3 +271,29 @@ configure-registry: echo "Restarting k3s to apply registry configuration..." 
ssh "${LOCAL_K8S_HOST}" "sudo systemctl restart k3s" echo "✓ Registry configuration applied" + +[positional-arguments] +wait-deployments-ready *args: + #!/bin/bash + set -euo pipefail + namespace="$1" + shift + deployments=("$@") + check_ready() { + for deployment in "${deployments[@]}"; do + ready=$(kubectl get -n ${namespace} deployment "${deployment}" \ + -o jsonpath="{.status.readyReplicas}" 2>/dev/null || true) + replicas=$(kubectl get -n ${namespace} deployment "${deployment}" \ + -o jsonpath="{.status.replicas}" 2>/dev/null || true) + if [[ "${ready}" != "${replicas}" || -z "${ready}" ]]; then + return 0 + fi + done + return 1 + } + echo -n "Waiting for deployments $@ to be ready..." + while check_ready; do + echo -n "." + sleep 2 + done + echo "ok"