feat(jupyterhub): add JupyterHub
jupyterhub/.gitignore (new file, vendored, 1 line)
@@ -0,0 +1 @@
jupyterhub-values.yaml

jupyterhub/images/datastack-cuda-notebook/.dockerignore (new file, 1 line)
@@ -0,0 +1 @@
README.md

jupyterhub/images/datastack-cuda-notebook/Dockerfile (new file, 159 lines)
@@ -0,0 +1,159 @@
# Merge pyspark-notebook into pytorch-notebook:cuda12-python-3.12
# https://github.com/jupyter/docker-stacks/tree/main/images/pytorch-notebook
# https://github.com/jupyter/docker-stacks/tree/main/images/pyspark-notebook
# https://github.com/jupyter/docker-stacks/blob/main/images/pyspark-notebook/setup_spark.py

FROM quay.io/jupyter/pytorch-notebook:x86_64-cuda12-python-3.12.10

# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lowercase to distinguish them from ENV)
ARG openjdk_version="17"

RUN apt-get update --yes && \
    apt-get install --yes --no-install-recommends \
    bash jq \
    "openjdk-${openjdk_version}-jre-headless" \
    ca-certificates-java && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# If spark_version is not set, latest Spark will be installed
ARG spark_version
ARG hadoop_version="3"
# If scala_version is not set, Spark without Scala will be installed
ARG scala_version
# URL to use for Spark downloads
# You need to use the https://archive.apache.org/dist/spark/ website if you want to download old Spark versions
# But it seems to be slower, so we use the recommended download site
ARG spark_download_url="https://dlcdn.apache.org/spark/"

ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
ENV JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"
ENV PATH="${PATH}:${SPARK_HOME}/bin:${JAVA_HOME}/bin"

COPY setup_spark.py /opt/setup-scripts/

# Setup Spark
RUN /opt/setup-scripts/setup_spark.py \
    --spark-version="${spark_version}" \
    --hadoop-version="${hadoop_version}" \
    --scala-version="${scala_version}" \
    --spark-download-url="${spark_download_url}"

# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
RUN fix-permissions "/etc/ipython/"

USER ${NB_UID}

# Remove torch to fix `critical libmamba filesystem error` on executing `mamba install`
RUN pip uninstall -y \
    'torch' \
    'torchaudio' \
    'torchvision'

# Install pyarrow
# NOTE: It's important to ensure compatibility between Pandas versions.
# The pandas version in this Dockerfile should match the version
# on which the Pandas API for Spark is built.
# To find the right version:
# 1. Check out the Spark branch you are on: <https://github.com/apache/spark>
# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
RUN mamba install --yes \
    'aif360' \
    'airflow' \
    'chromadb' \
    'dalex' \
    'dbt' \
    'dlt' \
    'duckdb' \
    'faiss' \
    'gitpython' \
    'grpcio-status' \
    'grpcio' \
    'keras' \
    'langchain' \
    'langchain-ai21' \
    'langchain-anthropic' \
    'langchain-aws' \
    'langchain-azure-dynamic-sessions' \
    'langchain-chroma' \
    'langchain-community' \
    'langchain-experimental' \
    'langchain-fireworks' \
    'langchain-google-genai' \
    'langchain-groq' \
    'langchain-mistralai' \
    'langchain-mongodb' \
    'langchain-nomic' \
    'langchain-openai' \
    'langchain-prompty' \
    'langchain-qdrant' \
    'langchain-robocorp' \
    'langchain-text-splitters' \
    'langchain-together' \
    'langchain-voyageai' \
    'langgraph' \
    'langgraph-checkpoint' \
    'langgraph-sdk' \
    'langsmith' \
    'litellm' \
    'nest-asyncio' \
    'openai' \
    'openai-agents' \
    'pandas=2.2.2' \
    'pandas-profiling' \
    'pillow' \
    'polars' \
    'pyarrow' \
    'qdrant-client' \
    'rapidfuzz' \
    'tensorflow' \
    'transformers' \
    'unstructured' \
    && \
    mamba clean --all -f -y && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==4.0.0.dev2
# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==3.5.4

RUN pip install \
    agno \
    fastembed \
    feature-engine \
    jupyter-ai \
    jupyter-ai-magics[all] \
    kreuzberg \
    langchain-huggingface \
    langchain-perplexity \
    langfuse \
    pydantic-ai \
    ragas \
    smolagents \
    tavily-python \
    tweet-preprocessor

# Install PyTorch with pip (https://pytorch.org/get-started/locally/)
# langchain-openai must be updated to avoid pydantic v2 error
# https://github.com/run-llama/llama_index/issues/16540
# hadolint ignore=DL3013
RUN pip install --no-cache-dir --extra-index-url=https://pypi.nvidia.com --index-url 'https://download.pytorch.org/whl/cu124' \
    'torch' \
    'torchaudio' \
    'torchvision' && \
    pip install --upgrade langchain-openai && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"


WORKDIR "${HOME}"
EXPOSE 4040

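For reference, this image can be built locally with the same build arguments that the jupyterhub/justfile below passes; the registry, repository name, and tag here are the justfile defaults (localhost:30500, buun-stack-cuda-notebook, python-3.12-1), not values fixed by the Dockerfile itself:

    cd jupyterhub/images/datastack-cuda-notebook
    docker build \
        --build-arg spark_version="3.5.4" \
        --build-arg spark_download_url="https://archive.apache.org/dist/spark/" \
        -t localhost:30500/buun-stack-cuda-notebook:python-3.12-1 .
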
jupyterhub/images/datastack-cuda-notebook/README.md (new file, 5 lines)
@@ -0,0 +1,5 @@
# Jupyter Notebook Image

Custom Jupyter notebook kernel image derived from the official Jupyter Docker Stacks images:

[jupyter/docker-stacks: Ready-to-run Docker images containing Jupyter applications](https://github.com/jupyter/docker-stacks)

jupyterhub/images/datastack-cuda-notebook/ipython_kernel_config.py (new file, 13 lines)
@@ -0,0 +1,13 @@
# Configuration file for ipython-kernel.
# See <https://ipython.readthedocs.io/en/stable/config/options/kernel.html>

# With IPython >= 6.0.0, all outputs to stdout/stderr are captured.
# This is the case for subprocesses and for the output of compiled libraries like Spark.
# Those logs now go both to the notebook server logs and to notebook outputs.
# Logs are particularly verbose with Spark, which is why we turn them off through this flag.
# <https://github.com/jupyter/docker-stacks/issues/1423>

# Attempt to capture and forward low-level output, e.g. produced by Extension libraries.
# Default: True
# type:ignore
c.IPKernelApp.capture_fd_output = False  # noqa: F821

jupyterhub/images/datastack-cuda-notebook/setup_spark.py (new file, executable, 131 lines)
@@ -0,0 +1,131 @@
#!/usr/bin/env python3
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

# Requirements:
# - Run as the root user
# - Required env variable: SPARK_HOME

import argparse
import logging
import os
import subprocess
from pathlib import Path

import requests
from bs4 import BeautifulSoup

LOGGER = logging.getLogger(__name__)


def get_all_refs(url: str) -> list[str]:
    """
    Get all the references for a given webpage
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    return [a["href"] for a in soup.find_all("a", href=True)]


def get_latest_spark_version() -> str:
    """
    Returns the latest version of Spark using the Spark archive
    """
    LOGGER.info("Downloading Spark versions information")
    all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
    versions = [
        ref.removeprefix("spark-").removesuffix("/")
        for ref in all_refs
        if ref.startswith("spark-") and "incubating" not in ref
    ]

    # Compare versions semantically
    def version_array(ver: str) -> tuple[int, int, int, str]:
        # 3.5.3 -> [3, 5, 3, ""]
        # 4.0.0-preview2 -> [4, 0, 0, "preview2"]
        arr = ver.split(".")
        assert len(arr) == 3, arr
        major, minor = int(arr[0]), int(arr[1])
        patch, _, preview = arr[2].partition("-")
        return (major, minor, int(patch), preview)

    latest_version = max(versions, key=lambda ver: version_array(ver))
    LOGGER.info(f"Latest version: {latest_version}")
    return latest_version


def download_spark(
    spark_version: str,
    hadoop_version: str,
    scala_version: str,
    spark_download_url: Path,
) -> str:
    """
    Downloads and unpacks spark
    The resulting spark directory name is returned
    """
    LOGGER.info("Downloading and unpacking Spark")
    spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}"
    if scala_version:
        spark_dir_name += f"-scala{scala_version}"
    LOGGER.info(f"Spark directory name: {spark_dir_name}")
    spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz"

    tmp_file = Path("/tmp/spark.tar.gz")
    subprocess.check_call(
        ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url]
    )
    subprocess.check_call(
        [
            "tar",
            "xzf",
            tmp_file,
            "-C",
            "/usr/local",
            "--owner",
            "root",
            "--group",
            "root",
            "--no-same-owner",
        ]
    )
    tmp_file.unlink()
    return spark_dir_name


def configure_spark(spark_dir_name: str, spark_home: Path) -> None:
    """
    Creates a ${SPARK_HOME} symlink to a versioned spark directory
    Creates a 10spark-config.sh symlink to source PYTHONPATH automatically
    """
    LOGGER.info("Configuring Spark")
    subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home])

    # Add a link in the before_notebook hook in order to source PYTHONPATH automatically
    CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh"
    subprocess.check_call(
        ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT]
    )


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--spark-version", required=True)
    arg_parser.add_argument("--hadoop-version", required=True)
    arg_parser.add_argument("--scala-version", required=True)
    arg_parser.add_argument("--spark-download-url", type=Path, required=True)
    args = arg_parser.parse_args()

    args.spark_version = args.spark_version or get_latest_spark_version()

    spark_dir_name = download_spark(
        spark_version=args.spark_version,
        hadoop_version=args.hadoop_version,
        scala_version=args.scala_version,
        spark_download_url=args.spark_download_url,
    )
    configure_spark(
        spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"])
    )

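The Dockerfile runs this script at build time; an equivalent standalone invocation looks roughly like the sketch below (run as root, with requests and beautifulsoup4 installed; the version values are illustrative, and an empty --scala-version selects the plain Spark build):

    export SPARK_HOME=/usr/local/spark
    /opt/setup-scripts/setup_spark.py \
        --spark-version="3.5.4" \
        --hadoop-version="3" \
        --scala-version="" \
        --spark-download-url="https://archive.apache.org/dist/spark/"
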
jupyterhub/images/datastack-notebook/.dockerignore (new file, 1 line)
@@ -0,0 +1 @@
README.md

jupyterhub/images/datastack-notebook/Dockerfile (new file, 158 lines)
@@ -0,0 +1,158 @@
# Merge pyspark-notebook into pytorch-notebook:python-3.12
# https://github.com/jupyter/docker-stacks/tree/main/images/pytorch-notebook
# https://github.com/jupyter/docker-stacks/tree/main/images/pyspark-notebook
# https://github.com/jupyter/docker-stacks/blob/main/images/pyspark-notebook/setup_spark.py

FROM quay.io/jupyter/pytorch-notebook:python-3.12

# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lowercase to distinguish them from ENV)
ARG openjdk_version="17"

RUN apt-get update --yes && \
    apt-get install --yes --no-install-recommends \
    bash jq \
    "openjdk-${openjdk_version}-jre-headless" \
    ca-certificates-java && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# If spark_version is not set, latest Spark will be installed
ARG spark_version
ARG hadoop_version="3"
# If scala_version is not set, Spark without Scala will be installed
ARG scala_version
# URL to use for Spark downloads
# You need to use the https://archive.apache.org/dist/spark/ website if you want to download old Spark versions
# But it seems to be slower, so we use the recommended download site
ARG spark_download_url="https://dlcdn.apache.org/spark/"

ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
ENV JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"
ENV PATH="${PATH}:${SPARK_HOME}/bin:${JAVA_HOME}/bin"

COPY setup_spark.py /opt/setup-scripts/

# Setup Spark
RUN /opt/setup-scripts/setup_spark.py \
    --spark-version="${spark_version}" \
    --hadoop-version="${hadoop_version}" \
    --scala-version="${scala_version}" \
    --spark-download-url="${spark_download_url}"

# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
RUN fix-permissions "/etc/ipython/"

USER ${NB_UID}

# Remove torch to fix `critical libmamba filesystem error` on executing `mamba install`
RUN pip uninstall -y \
    'torch' \
    'torchaudio' \
    'torchvision'

# Install pyarrow
# NOTE: It's important to ensure compatibility between Pandas versions.
# The pandas version in this Dockerfile should match the version
# on which the Pandas API for Spark is built.
# To find the right version:
# 1. Check out the Spark branch you are on: <https://github.com/apache/spark>
# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
RUN mamba install --yes \
    'aif360' \
    'airflow' \
    'chromadb' \
    'dalex' \
    'dbt' \
    'dlt' \
    'duckdb' \
    'faiss' \
    'gitpython' \
    'grpcio-status' \
    'grpcio' \
    'keras' \
    'langchain' \
    'langchain-ai21' \
    'langchain-anthropic' \
    'langchain-aws' \
    'langchain-azure-dynamic-sessions' \
    'langchain-chroma' \
    'langchain-community' \
    'langchain-experimental' \
    'langchain-fireworks' \
    'langchain-google-genai' \
    'langchain-groq' \
    'langchain-mistralai' \
    'langchain-mongodb' \
    'langchain-nomic' \
    'langchain-openai' \
    'langchain-prompty' \
    'langchain-qdrant' \
    'langchain-robocorp' \
    'langchain-text-splitters' \
    'langchain-together' \
    'langchain-voyageai' \
    'langgraph' \
    'langgraph-checkpoint' \
    'langgraph-sdk' \
    'langsmith' \
    'litellm' \
    'nest-asyncio' \
    'openai' \
    'openai-agents' \
    'pandas=2.2.2' \
    'pandas-profiling' \
    'pillow' \
    'polars' \
    'pyarrow' \
    'qdrant-client' \
    'rapidfuzz' \
    'tensorflow' \
    'transformers' \
    'unstructured' \
    && \
    mamba clean --all -f -y && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==4.0.0.dev2
# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==3.5.4

RUN pip install \
    agno \
    fastembed \
    feature-engine \
    jupyter-ai \
    jupyter-ai-magics[all] \
    kreuzberg \
    langfuse \
    langchain-huggingface \
    langchain-perplexity \
    pydantic-ai \
    ragas \
    smolagents \
    tavily-python \
    tweet-preprocessor

# Install PyTorch with pip (https://pytorch.org/get-started/locally/)
# langchain-openai must be updated to avoid pydantic v2 error
# https://github.com/run-llama/llama_index/issues/16540
# hadolint ignore=DL3013
RUN pip install --no-cache-dir --index-url 'https://download.pytorch.org/whl/cpu' \
    'torch' \
    'torchaudio' \
    'torchvision' && \
    pip install --upgrade langchain-openai && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

WORKDIR "${HOME}"
EXPOSE 4040

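A quick local smoke test of the CPU image, assuming it was built and tagged as in the justfile below; port 8888 is the notebook server port used by the upstream docker-stacks images:

    docker run --rm -it -p 8888:8888 \
        localhost:30500/buun-stack-notebook:python-3.12-1
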
jupyterhub/images/datastack-notebook/README.md (new file, 5 lines)
@@ -0,0 +1,5 @@
# Jupyter Notebook Image

Custom Jupyter notebook kernel image derived from the official Jupyter Docker Stacks images:

[jupyter/docker-stacks: Ready-to-run Docker images containing Jupyter applications](https://github.com/jupyter/docker-stacks)

jupyterhub/images/datastack-notebook/ipython_kernel_config.py (new file, 13 lines)
@@ -0,0 +1,13 @@
# Configuration file for ipython-kernel.
# See <https://ipython.readthedocs.io/en/stable/config/options/kernel.html>

# With IPython >= 6.0.0, all outputs to stdout/stderr are captured.
# This is the case for subprocesses and for the output of compiled libraries like Spark.
# Those logs now go both to the notebook server logs and to notebook outputs.
# Logs are particularly verbose with Spark, which is why we turn them off through this flag.
# <https://github.com/jupyter/docker-stacks/issues/1423>

# Attempt to capture and forward low-level output, e.g. produced by Extension libraries.
# Default: True
# type:ignore
c.IPKernelApp.capture_fd_output = False  # noqa: F821

jupyterhub/images/datastack-notebook/setup_spark.py (new file, executable, 131 lines)
@@ -0,0 +1,131 @@
(identical to jupyterhub/images/datastack-cuda-notebook/setup_spark.py above)

jupyterhub/jupyterhub-values.gomplate.yaml (new file, 155 lines)
@@ -0,0 +1,155 @@
hub:
  config:
    JupyterHub:
      authenticator_class: generic-oauth
      admin_access: false

    Authenticator:
      enable_auth_state: true
      allow_all: true # allow all Keycloak users

    GenericOAuthenticator:
      client_id: {{ .Env.JUPYTERHUB_OIDC_CLIENT_ID }}
      oauth_callback_url: "https://{{ .Env.JUPYTERHUB_HOST }}/hub/oauth_callback"
      authorize_url: "https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/auth"
      token_url: "https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/token"
      userdata_url: "https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/userinfo"
      login_service: keycloak
      # username_claim: email
      username_claim: preferred_username

    OAuthenticator:
      scope:
        - openid
        - profile
        - email

  # db:
  #   pvc:
  #     storageClassName: longhorn

  podSecurityContext:
    fsGroup: {{ .Env.JUPYTER_FSGID }}

singleuser:
  storage:
    {{ if env.Getenv "PVC_NAME" -}}
    type: static
    static:
      pvcName: {{ .Env.PVC_NAME }}
    {{ else -}}
    type: dynamic
    dynamic:
      storageClass: longhorn
      storageAccessModes:
        - ReadWriteOnce
    {{ end -}}
    capacity: 10Gi
  networkPolicy:
    egress:
      - to:
          - namespaceSelector:
              matchLabels:
                kubernetes.io/metadata.name: chroma
        ports:
          - port: 8000
            protocol: TCP
      - to:
          - namespaceSelector:
              matchLabels:
                kubernetes.io/metadata.name: qdrant
        ports:
          - port: 6333
            protocol: TCP
          - port: 6334
            protocol: TCP
          - port: 6335
            protocol: TCP
      - to:
          - namespaceSelector:
              matchLabels:
                kubernetes.io/metadata.name: litellm
        ports:
          - port: 4000
            protocol: TCP
      - to:
          - ipBlock:
              cidr: 0.0.0.0/0
        ports:
          - port: 443
            protocol: TCP
        domains:
          - '*.shds.dev'

  image:
    pullPolicy: IfNotPresent

  profileList:
    # https://quay.io/repository/jupyter/pyspark-notebook
    {{- if eq .Env.JUPYTER_PROFILE_MINIMAL_ENABLED "true" }}
    - display_name: "Minimal Jupyter Notebook Stack"
      description: "Minimal Jupyter Notebook Stack"
      kubespawner_override:
        image: quay.io/jupyter/minimal-notebook
    {{- end }}
    {{ if eq .Env.JUPYTER_PROFILE_BASE_ENABLED "true" }}
    - display_name: "Base Jupyter Notebook Stack"
      description: "Base Jupyter Notebook Stack"
      kubespawner_override:
        image: quay.io/jupyter/base-notebook
    {{- end }}
    {{- if eq .Env.JUPYTER_PROFILE_DATASCIENCE_ENABLED "true" }}
    - display_name: "Jupyter Notebook Data Science Stack"
      description: "Jupyter Notebook Data Science Stack"
      kubespawner_override:
        image: quay.io/jupyter/datascience-notebook
    {{- end }}
    {{- if eq .Env.JUPYTER_PROFILE_PYSPARK_ENABLED "true" }}
    - display_name: "Jupyter Notebook Python, Spark Stack"
      description: "Jupyter Notebook Python, Spark Stack"
      kubespawner_override:
        image: quay.io/jupyter/pyspark-notebook
    {{- end }}
    {{- if eq .Env.JUPYTER_PROFILE_PYTORCH_ENABLED "true" }}
    - display_name: "Jupyter Notebook PyTorch Deep Learning Stack"
      description: "Jupyter Notebook PyTorch Deep Learning Stack"
      kubespawner_override:
        image: quay.io/jupyter/pytorch-notebook
    {{- end }}
    {{- if eq .Env.JUPYTER_PROFILE_TENSORFLOW_ENABLED "true" }}
    - display_name: "Jupyter Notebook TensorFlow Deep Learning Stack"
      description: "Jupyter Notebook TensorFlow Deep Learning Stack"
      kubespawner_override:
        image: quay.io/jupyter/tensorflow-notebook
    {{- end }}
    {{- if eq .Env.JUPYTER_PROFILE_BUUN_STACK_ENABLED "true" }}
    - display_name: "Buun-stack"
      description: "Jupyter Notebook with buun-stack"
      kubespawner_override:
        image: "{{ .Env.IMAGE_REGISTRY }}/{{ .Env.KERNEL_IMAGE_BUUN_STACK_REPOSITORY }}:{{ .Env.JUPYTER_PYTHON_KERNEL_TAG }}"
    {{- end }}
    {{- if eq .Env.JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED "true" }}
    - display_name: "Buun-stack with CUDA"
      description: "Jupyter Notebook with buun-stack and CUDA support"
      kubespawner_override:
        image: "{{ .Env.IMAGE_REGISTRY }}/{{ .Env.KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY }}:{{ .Env.JUPYTER_PYTHON_KERNEL_TAG }}"
        # resources:
        #   requests:
        #     nvidia.com/gpu: "1"
    {{- end }}

  imagePullSecrets:
    - name: regcred

ingress:
  enabled: true
  annotations:
    kubernetes.io/ingress.class: traefik
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
  ingressClassName: traefik
  hosts:
    - {{ .Env.JUPYTERHUB_HOST }}
  pathType: Prefix
  tls:
    - hosts:
        - {{ .Env.JUPYTERHUB_HOST }}

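The install recipe below renders this template with gomplate before calling Helm. A minimal standalone sketch, assuming the Keycloak, profile, registry, and kernel-image variables used by the template are already exported (the justfile below sets most of them), with placeholder host names:

    export JUPYTERHUB_HOST=jupyter.example.com      # placeholder
    export KEYCLOAK_HOST=keycloak.example.com       # placeholder
    export JUPYTER_FSGID=100 PVC_NAME=""
    gomplate -f jupyterhub-values.gomplate.yaml -o jupyterhub-values.yaml
    helm upgrade --cleanup-on-fail --install jupyterhub jupyterhub/jupyterhub \
        -n jupyter -f jupyterhub-values.yaml
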
jupyterhub/justfile (new file, 150 lines)
@@ -0,0 +1,150 @@
set fallback := true

export JUPYTERHUB_NAMESPACE := env("JUPYTERHUB_NAMESPACE", "jupyter")
export JUPYTERHUB_CHART_VERSION := env("JUPYTERHUB_CHART_VERSION", "4.2.0")
export JUPYTERHUB_OIDC_CLIENT_ID := env("JUPYTERHUB_OIDC_CLIENT_ID", "jupyterhub")
export JUPYTERHUB_ENABLE_NFS_PV := env("JUPYTERHUB_ENABLE_NFS_PV", "")
export JUPYTER_PYTHON_KERNEL_TAG := env("JUPYTER_PYTHON_KERNEL_TAG", "python-3.12-1")
export KERNEL_IMAGE_BUUN_STACK_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_REPOSITORY", "buun-stack-notebook")
export KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY := env("KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY", "buun-stack-cuda-notebook")
export JUPYTER_PROFILE_MINIMAL_ENABLED := env("JUPYTER_PROFILE_MINIMAL_ENABLED", "false")
export JUPYTER_PROFILE_BASE_ENABLED := env("JUPYTER_PROFILE_BASE_ENABLED", "false")
export JUPYTER_PROFILE_DATASCIENCE_ENABLED := env("JUPYTER_PROFILE_DATASCIENCE_ENABLED", "true")
export JUPYTER_PROFILE_PYSPARK_ENABLED := env("JUPYTER_PROFILE_PYSPARK_ENABLED", "false")
export JUPYTER_PROFILE_PYTORCH_ENABLED := env("JUPYTER_PROFILE_PYTORCH_ENABLED", "false")
export JUPYTER_PROFILE_TENSORFLOW_ENABLED := env("JUPYTER_PROFILE_TENSORFLOW_ENABLED", "false")
export JUPYTER_PROFILE_BUUN_STACK_ENABLED := env("JUPYTER_PROFILE_BUUN_STACK_ENABLED", "false")
export JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED := env("JUPYTER_PROFILE_BUUN_STACK_CUDA_ENABLED", "false")
export IMAGE_REGISTRY := env("IMAGE_REGISTRY", "localhost:30500")
export KEYCLOAK_REALM := env("KEYCLOAK_REALM", "buunstack")
export LONGHORN_NAMESPACE := env("LONGHORN_NAMESPACE", "longhorn")

[private]
default:
    @just --list --unsorted --list-submodules

# Add Helm repository
add-helm-repo:
    helm repo add jupyterhub https://jupyterhub.github.io/helm-chart
    helm repo update

# Remove Helm repository
remove-helm-repo:
    helm repo remove jupyterhub

# Create JupyterHub namespace
create-namespace:
    kubectl get namespace ${JUPYTERHUB_NAMESPACE} &>/dev/null || \
        kubectl create namespace ${JUPYTERHUB_NAMESPACE}

# Delete JupyterHub namespace
delete-namespace:
    kubectl delete namespace ${JUPYTERHUB_NAMESPACE} --ignore-not-found

# Install JupyterHub
install:
    #!/bin/bash
    set -euo pipefail
    export JUPYTERHUB_HOST=${JUPYTERHUB_HOST:-}
    while [ -z "${JUPYTERHUB_HOST}" ]; do
        JUPYTERHUB_HOST=$(
            gum input --prompt="JupyterHub host (FQDN): " --width=100 \
                --placeholder="e.g., jupyter.example.com"
        )
    done
    just create-namespace
    # just k8s::copy-regcred ${JUPYTERHUB_NAMESPACE}
    just keycloak::create-client ${KEYCLOAK_REALM} ${JUPYTERHUB_OIDC_CLIENT_ID} \
        "https://${JUPYTERHUB_HOST}/hub/oauth_callback"
    # just vault::create-jupyter-role
    just add-helm-repo
    export JUPYTERHUB_OIDC_CLIENT_ID=${JUPYTERHUB_OIDC_CLIENT_ID}
    export KEYCLOAK_REALM=${KEYCLOAK_REALM}
    export JUPYTER_PYTHON_KERNEL_TAG=${JUPYTER_PYTHON_KERNEL_TAG}
    export JUPYTER_FSGID=${JUPYTER_FSGID:-100}
    export PVC_NAME=""
    if [ -z "${JUPYTERHUB_ENABLE_NFS_PV}" ]; then
        if gum confirm "Are you going to use NFS PV?"; then
            JUPYTERHUB_ENABLE_NFS_PV=true
        else
            JUPYTERHUB_ENABLE_NFS_PV=false
        fi
    fi
    if [ "${JUPYTERHUB_ENABLE_NFS_PV}" = "true" ]; then
        if ! helm status longhorn -n ${LONGHORN_NAMESPACE} &>/dev/null; then
            echo "Longhorn is not installed. Please install Longhorn first." >&2
            exit 1
        fi
        export JUPYTER_NFS_IP=${JUPYTER_NFS_IP:-}
        while [ -z "${JUPYTER_NFS_IP}" ]; do
            JUPYTER_NFS_IP=$(
                gum input --prompt="NFS server IP address: " --width=100 \
                    --placeholder="e.g., 192.168.10.1"
            )
        done
        export JUPYTER_NFS_PATH=${JUPYTER_NFS_PATH:-}
        while [ -z "${JUPYTER_NFS_PATH}" ]; do
            JUPYTER_NFS_PATH=$(
                gum input --prompt="NFS server export path: " --width=100 \
                    --placeholder="e.g., /volume1/drive1/jupyter"
            )
        done
        PVC_NAME=jupyter-nfs-pvc
        if ! kubectl get pv jupyter-nfs-pv &>/dev/null; then
            gomplate -f nfs-pv.gomplate.yaml | kubectl apply -f -
        fi
        kubectl apply -n ${JUPYTERHUB_NAMESPACE} -f nfs-pvc.yaml
    fi
    # https://z2jh.jupyter.org/en/stable/
    gomplate -f jupyterhub-values.gomplate.yaml -o jupyterhub-values.yaml
    helm upgrade --cleanup-on-fail --install jupyterhub jupyterhub/jupyterhub \
        --version ${JUPYTERHUB_CHART_VERSION} -n ${JUPYTERHUB_NAMESPACE} \
        --timeout=20m -f jupyterhub-values.yaml
    # wait deployments manually because `helm upgrade --wait` does not work for JupyterHub
    just k8s::wait-deployments-ready ${JUPYTERHUB_NAMESPACE} hub proxy

# Uninstall JupyterHub
uninstall:
    #!/bin/bash
    set -euo pipefail
    helm uninstall jupyterhub -n ${JUPYTERHUB_NAMESPACE} --wait --ignore-not-found
    kubectl delete pods -n ${JUPYTERHUB_NAMESPACE} -l app.kubernetes.io/component=singleuser-server
    kubectl delete -n ${JUPYTERHUB_NAMESPACE} pvc jupyter-nfs-pvc --ignore-not-found
    if kubectl get pv jupyter-nfs-pv &>/dev/null; then
        kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}'
    fi

# Delete JupyterHub PV
delete-pv:
    #!/bin/bash
    set -euo pipefail
    if kubectl get pv jupyter-nfs-pv &>/dev/null; then
        kubectl patch pv jupyter-nfs-pv -p '{"spec":{"claimRef":null}}'
        kubectl delete pv jupyter-nfs-pv
    fi

# Build Jupyter notebook kernel images
build-kernel-images:
    #!/bin/bash
    set -euo pipefail
    (
        cd ./images/datastack-notebook
        docker build -t \
            ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG} \
            --build-arg spark_version="3.5.4" \
            --build-arg spark_download_url="https://archive.apache.org/dist/spark/" \
            .
    )
    (
        cd ./images/datastack-cuda-notebook
        docker build -t \
            ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG} \
            --build-arg spark_version="3.5.4" \
            --build-arg spark_download_url="https://archive.apache.org/dist/spark/" \
            .
    )

# Push Jupyter notebook kernel images
push-kernel-images: build-kernel-images
    docker push ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG}
    docker push ${IMAGE_REGISTRY}/${KERNEL_IMAGE_BUUN_STACK_CUDA_REPOSITORY}:${JUPYTER_PYTHON_KERNEL_TAG}

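Typical usage from the repository root, assuming the root justfile exposes this directory as the jupyterhub module (the mod jupyterhub line added below):

    just jupyterhub::push-kernel-images   # builds both kernel images, then pushes them
    just jupyterhub::install              # prompts for the JupyterHub host and optional NFS settings
    just jupyterhub::uninstall
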
jupyterhub/nfs-pv.gomplate.yaml (new file, 15 lines)
@@ -0,0 +1,15 @@
apiVersion: v1
kind: PersistentVolume
metadata:
  name: jupyter-nfs-pv
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: longhorn
  volumeMode: Filesystem
  nfs:
    server: {{ .Env.JUPYTER_NFS_IP }}
    path: {{ .Env.JUPYTER_NFS_PATH }}

jupyterhub/nfs-pvc.yaml (new file, 11 lines)
@@ -0,0 +1,11 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: jupyter-nfs-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
  volumeName: jupyter-nfs-pv

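The claim binds to the static volume by name (volumeName: jupyter-nfs-pv). The install recipe renders and applies the pair roughly as in this sketch; the NFS address and export path are placeholders that the recipe normally collects interactively, and jupyter is the default namespace:

    export JUPYTER_NFS_IP=192.168.10.1               # placeholder
    export JUPYTER_NFS_PATH=/volume1/drive1/jupyter  # placeholder
    gomplate -f nfs-pv.gomplate.yaml | kubectl apply -f -
    kubectl apply -n jupyter -f nfs-pvc.yaml
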
justfile (1 line added)
@@ -8,6 +8,7 @@ default:

 mod env
 mod keycloak
+mod jupyterhub
 mod k8s
 mod longhorn
 mod postgres

k8s/justfile (26 lines added)
@@ -271,3 +271,29 @@ configure-registry:
    echo "Restarting k3s to apply registry configuration..."
    ssh "${LOCAL_K8S_HOST}" "sudo systemctl restart k3s"
    echo "✓ Registry configuration applied"
+
+[positional-arguments]
+wait-deployments-ready *args:
+    #!/bin/bash
+    set -euo pipefail
+    namespace="$1"
+    shift
+    deployments=("$@")
+    check_ready() {
+        for deployment in "${deployments[@]}"; do
+            ready=$(kubectl get -n ${namespace} deployment "${deployment}" \
+                -o jsonpath="{.status.readyReplicas}" 2>/dev/null || true)
+            replicas=$(kubectl get -n ${namespace} deployment "${deployment}" \
+                -o jsonpath="{.status.replicas}" 2>/dev/null || true)
+            if [[ "${ready}" != "${replicas}" || -z "${ready}" ]]; then
+                return 0
+            fi
+        done
+        return 1
+    }
+    echo -n "Waiting for deployments $@ to be ready..."
+    while check_ready; do
+        echo -n "."
+        sleep 2
+    done
+    echo "ok"
