feat(jupyterhub): add JupyterHub
jupyterhub/images/datastack-cuda-notebook/.dockerignore (new file, 1 line)
@@ -0,0 +1 @@
README.md

jupyterhub/images/datastack-cuda-notebook/Dockerfile (new file, 159 lines)
@@ -0,0 +1,159 @@
# Merge pyspark-notebook into pytorch-notebook:cuda12-python-3.12
# https://github.com/jupyter/docker-stacks/tree/main/images/pytorch-notebook
# https://github.com/jupyter/docker-stacks/tree/main/images/pyspark-notebook
# https://github.com/jupyter/docker-stacks/blob/main/images/pyspark-notebook/setup_spark.py

FROM quay.io/jupyter/pytorch-notebook:x86_64-cuda12-python-3.12.10

# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lowercase to distinguish them from ENV)
ARG openjdk_version="17"

RUN apt-get update --yes && \
    apt-get install --yes --no-install-recommends \
    bash jq \
    "openjdk-${openjdk_version}-jre-headless" \
    ca-certificates-java && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# If spark_version is not set, latest Spark will be installed
ARG spark_version
ARG hadoop_version="3"
# If scala_version is not set, Spark without Scala will be installed
ARG scala_version
# URL to use for Spark downloads
# You need to use the https://archive.apache.org/dist/spark/ site if you want to download old Spark versions,
# but it seems to be slower, which is why we use the recommended download site here
ARG spark_download_url="https://dlcdn.apache.org/spark/"

ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
ENV JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"
ENV PATH="${PATH}:${SPARK_HOME}/bin:${JAVA_HOME}/bin"

COPY setup_spark.py /opt/setup-scripts/

# Setup Spark
RUN /opt/setup-scripts/setup_spark.py \
    --spark-version="${spark_version}" \
    --hadoop-version="${hadoop_version}" \
    --scala-version="${scala_version}" \
    --spark-download-url="${spark_download_url}"

# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
RUN fix-permissions "/etc/ipython/"

USER ${NB_UID}

# Remove torch to fix `critical libmamba filesystem error` when executing `mamba install`
RUN pip uninstall -y \
    'torch' \
    'torchaudio' \
    'torchvision'

# Install pyarrow and the rest of the data/LLM stack with mamba
# NOTE: It's important to ensure compatibility between Pandas versions.
# The pandas version in this Dockerfile should match the version
# on which the Pandas API for Spark is built.
# To find the right version:
# 1. Check out the Spark branch you are on: <https://github.com/apache/spark>
# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
RUN mamba install --yes \
    'aif360' \
    'airflow' \
    'chromadb' \
    'dalex' \
    'dbt' \
    'dlt' \
    'duckdb' \
    'faiss' \
    'gitpython' \
    'grpcio-status' \
    'grpcio' \
    'keras' \
    'langchain' \
    'langchain-ai21' \
    'langchain-anthropic' \
    'langchain-aws' \
    'langchain-azure-dynamic-sessions' \
    'langchain-chroma' \
    'langchain-community' \
    'langchain-experimental' \
    'langchain-fireworks' \
    'langchain-google-genai' \
    'langchain-groq' \
    'langchain-mistralai' \
    'langchain-mongodb' \
    'langchain-nomic' \
    'langchain-openai' \
    'langchain-prompty' \
    'langchain-qdrant' \
    'langchain-robocorp' \
    'langchain-text-splitters' \
    'langchain-together' \
    'langchain-voyageai' \
    'langgraph' \
    'langgraph-checkpoint' \
    'langgraph-sdk' \
    'langsmith' \
    'litellm' \
    'nest-asyncio' \
    'openai' \
    'openai-agents' \
    'pandas=2.2.2' \
    'pandas-profiling' \
    'pillow' \
    'polars' \
    'pyarrow' \
    'qdrant-client' \
    'rapidfuzz' \
    'tensorflow' \
    'transformers' \
    'unstructured' \
    && \
    mamba clean --all -f -y && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==4.0.0.dev2
# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==3.5.4

RUN pip install \
    agno \
    fastembed \
    feature-engine \
    jupyter-ai \
    jupyter-ai-magics[all] \
    kreuzberg \
    langchain-huggingface \
    langchain-perplexity \
    langfuse \
    pydantic-ai \
    ragas \
    smolagents \
    tavily-python \
    tweet-preprocessor

# Install PyTorch with pip (https://pytorch.org/get-started/locally/)
# langchain-openai must be updated to avoid a pydantic v2 error:
# https://github.com/run-llama/llama_index/issues/16540
# hadolint ignore=DL3013
RUN pip install --no-cache-dir --extra-index-url=https://pypi.nvidia.com --index-url 'https://download.pytorch.org/whl/cu124' \
    'torch' \
    'torchaudio' \
    'torchvision' && \
    pip install --upgrade langchain-openai && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

WORKDIR "${HOME}"
EXPOSE 4040
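
A minimal in-notebook smoke test for the resulting CUDA image might look like the sketch below. It assumes the `10spark-config.sh` hook has put `pyspark` on `PYTHONPATH` and that a GPU is attached; the app name and row count are arbitrary. Port 4040, exposed above, serves the Spark driver UI while a session is running.

```python
# Sketch of a smoke test to run in a notebook cell on the CUDA image
# (assumes pyspark is importable via the spark-config hook and a GPU is visible).
import torch
from pyspark.sql import SparkSession

print("CUDA available:", torch.cuda.is_available())

spark = SparkSession.builder.appName("smoke-test").getOrCreate()
df = spark.range(1_000).selectExpr("id", "id * 2 AS doubled")
print(df.limit(3).toPandas())  # pandas/pyarrow interop, pinned above
spark.stop()  # while running, the driver UI is served on port 4040
```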

jupyterhub/images/datastack-cuda-notebook/README.md (new file, 5 lines)
@@ -0,0 +1,5 @@
# Jupyter Notebook Image

Custom Jupyter notebook kernel image derived from the official Jupyter Docker Stacks images:

[jupyter/docker-stacks: Ready-to-run Docker images containing Jupyter applications](https://github.com/jupyter/docker-stacks)

jupyterhub/images/datastack-cuda-notebook/ipython_kernel_config.py (new file, 13 lines)
@@ -0,0 +1,13 @@
# Configuration file for ipython-kernel.
# See <https://ipython.readthedocs.io/en/stable/config/options/kernel.html>

# With IPython >= 6.0.0, all output to stdout/stderr is captured.
# This is the case for subprocesses and for output from compiled libraries like Spark.
# Those logs then end up both in the notebook server logs and in the notebook outputs.
# Logs are particularly verbose with Spark, which is why we turn capturing off via the flag below.
# <https://github.com/jupyter/docker-stacks/issues/1423>

# Attempt to capture and forward low-level output, e.g. produced by Extension libraries.
# Default: True
# type:ignore
c.IPKernelApp.capture_fd_output = False  # noqa: F821
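
To illustrate what disabling `capture_fd_output` means in practice, here is a small sketch (assuming a kernel started with this config): Python-level output still appears in the cell, while raw file-descriptor output from subprocesses or native libraries ends up in the notebook server log instead of the cell output.

```python
# Sketch: behaviour with c.IPKernelApp.capture_fd_output = False
import subprocess

print("hello from Python")  # goes through sys.stdout and is still shown in the cell
subprocess.run(["echo", "hello from fd 1"])  # raw fd output lands in the server log, not the cell
```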

jupyterhub/images/datastack-cuda-notebook/setup_spark.py (new executable file, 131 lines)
@@ -0,0 +1,131 @@
#!/usr/bin/env python3
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

# Requirements:
# - Run as the root user
# - Required env variable: SPARK_HOME

import argparse
import logging
import os
import subprocess
from pathlib import Path

import requests
from bs4 import BeautifulSoup

LOGGER = logging.getLogger(__name__)


def get_all_refs(url: str) -> list[str]:
    """
    Get all the references for a given webpage
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    return [a["href"] for a in soup.find_all("a", href=True)]

def get_latest_spark_version() -> str:
    """
    Returns the latest version of Spark, as listed in the Spark archive
    """
    LOGGER.info("Downloading Spark versions information")
    all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
    versions = [
        ref.removeprefix("spark-").removesuffix("/")
        for ref in all_refs
        if ref.startswith("spark-") and "incubating" not in ref
    ]

    # Compare versions semantically
    def version_array(ver: str) -> tuple[int, int, int, str]:
        # "3.5.3" -> (3, 5, 3, "")
        # "4.0.0-preview2" -> (4, 0, 0, "preview2")
        arr = ver.split(".")
        assert len(arr) == 3, arr
        major, minor = int(arr[0]), int(arr[1])
        patch, _, preview = arr[2].partition("-")
        return (major, minor, int(patch), preview)

    latest_version = max(versions, key=lambda ver: version_array(ver))
    LOGGER.info(f"Latest version: {latest_version}")
    return latest_version

def download_spark(
    spark_version: str,
    hadoop_version: str,
    scala_version: str,
    spark_download_url: Path,
) -> str:
    """
    Downloads and unpacks Spark
    The resulting Spark directory name is returned
    """
    LOGGER.info("Downloading and unpacking Spark")
    spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}"
    if scala_version:
        spark_dir_name += f"-scala{scala_version}"
    LOGGER.info(f"Spark directory name: {spark_dir_name}")
    spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz"

    tmp_file = Path("/tmp/spark.tar.gz")
    subprocess.check_call(
        ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url]
    )
    subprocess.check_call(
        [
            "tar",
            "xzf",
            tmp_file,
            "-C",
            "/usr/local",
            "--owner",
            "root",
            "--group",
            "root",
            "--no-same-owner",
        ]
    )
    tmp_file.unlink()
    return spark_dir_name

def configure_spark(spark_dir_name: str, spark_home: Path) -> None:
    """
    Creates a ${SPARK_HOME} symlink to a versioned spark directory
    Creates a 10spark-config.sh symlink to source PYTHONPATH automatically
    """
    LOGGER.info("Configuring Spark")
    subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home])

    # Add a link in the before_notebook hook in order to source PYTHONPATH automatically
    CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh"
    subprocess.check_call(
        ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT]
    )

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--spark-version", required=True)
    arg_parser.add_argument("--hadoop-version", required=True)
    arg_parser.add_argument("--scala-version", required=True)
    arg_parser.add_argument("--spark-download-url", type=Path, required=True)
    args = arg_parser.parse_args()

    args.spark_version = args.spark_version or get_latest_spark_version()

    spark_dir_name = download_spark(
        spark_version=args.spark_version,
        hadoop_version=args.hadoop_version,
        scala_version=args.scala_version,
        spark_download_url=args.spark_download_url,
    )
    configure_spark(
        spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"])
    )
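
For clarity, a trimmed-down standalone illustration of the ordering that `version_array` produces; note that a preview tag compares as a non-empty string, so it sorts after the corresponding bare release.

```python
# Standalone illustration of the version ordering used in get_latest_spark_version()
def version_array(ver: str) -> tuple[int, int, int, str]:
    major, minor, rest = ver.split(".")
    patch, _, preview = rest.partition("-")
    return (int(major), int(minor), int(patch), preview)

versions = ["3.5.3", "3.5.4", "4.0.0-preview2"]
print(sorted(versions, key=version_array))  # ['3.5.3', '3.5.4', '4.0.0-preview2']
print(max(versions, key=version_array))     # 4.0.0-preview2
```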

jupyterhub/images/datastack-notebook/.dockerignore (new file, 1 line)
@@ -0,0 +1 @@
README.md

jupyterhub/images/datastack-notebook/Dockerfile (new file, 158 lines)
@@ -0,0 +1,158 @@
# Merge pyspark-notebook into pytorch-notebook:python-3.12
# https://github.com/jupyter/docker-stacks/tree/main/images/pytorch-notebook
# https://github.com/jupyter/docker-stacks/tree/main/images/pyspark-notebook
# https://github.com/jupyter/docker-stacks/blob/main/images/pyspark-notebook/setup_spark.py

FROM quay.io/jupyter/pytorch-notebook:python-3.12

# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lowercase to distinguish them from ENV)
ARG openjdk_version="17"

RUN apt-get update --yes && \
    apt-get install --yes --no-install-recommends \
    bash jq \
    "openjdk-${openjdk_version}-jre-headless" \
    ca-certificates-java && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# If spark_version is not set, latest Spark will be installed
ARG spark_version
ARG hadoop_version="3"
# If scala_version is not set, Spark without Scala will be installed
ARG scala_version
# URL to use for Spark downloads
# You need to use the https://archive.apache.org/dist/spark/ site if you want to download old Spark versions,
# but it seems to be slower, which is why we use the recommended download site here
ARG spark_download_url="https://dlcdn.apache.org/spark/"

ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
ENV JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"
ENV PATH="${PATH}:${SPARK_HOME}/bin:${JAVA_HOME}/bin"

COPY setup_spark.py /opt/setup-scripts/

# Setup Spark
RUN /opt/setup-scripts/setup_spark.py \
    --spark-version="${spark_version}" \
    --hadoop-version="${hadoop_version}" \
    --scala-version="${scala_version}" \
    --spark-download-url="${spark_download_url}"

# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
RUN fix-permissions "/etc/ipython/"

USER ${NB_UID}

# Remove torch to fix `critical libmamba filesystem error` when executing `mamba install`
RUN pip uninstall -y \
    'torch' \
    'torchaudio' \
    'torchvision'

# Install pyarrow and the rest of the data/LLM stack with mamba
# NOTE: It's important to ensure compatibility between Pandas versions.
# The pandas version in this Dockerfile should match the version
# on which the Pandas API for Spark is built.
# To find the right version:
# 1. Check out the Spark branch you are on: <https://github.com/apache/spark>
# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
RUN mamba install --yes \
    'aif360' \
    'airflow' \
    'chromadb' \
    'dalex' \
    'dbt' \
    'dlt' \
    'duckdb' \
    'faiss' \
    'gitpython' \
    'grpcio-status' \
    'grpcio' \
    'keras' \
    'langchain' \
    'langchain-ai21' \
    'langchain-anthropic' \
    'langchain-aws' \
    'langchain-azure-dynamic-sessions' \
    'langchain-chroma' \
    'langchain-community' \
    'langchain-experimental' \
    'langchain-fireworks' \
    'langchain-google-genai' \
    'langchain-groq' \
    'langchain-mistralai' \
    'langchain-mongodb' \
    'langchain-nomic' \
    'langchain-openai' \
    'langchain-prompty' \
    'langchain-qdrant' \
    'langchain-robocorp' \
    'langchain-text-splitters' \
    'langchain-together' \
    'langchain-voyageai' \
    'langgraph' \
    'langgraph-checkpoint' \
    'langgraph-sdk' \
    'langsmith' \
    'litellm' \
    'nest-asyncio' \
    'openai' \
    'openai-agents' \
    'pandas=2.2.2' \
    'pandas-profiling' \
    'pillow' \
    'polars' \
    'pyarrow' \
    'qdrant-client' \
    'rapidfuzz' \
    'tensorflow' \
    'transformers' \
    'unstructured' \
    && \
    mamba clean --all -f -y && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==4.0.0.dev2
# RUN pip install pyspark[connect,ml,mllib,pandas-on-spark,sql]==3.5.4

RUN pip install \
    agno \
    fastembed \
    feature-engine \
    jupyter-ai \
    jupyter-ai-magics[all] \
    kreuzberg \
    langfuse \
    langchain-huggingface \
    langchain-perplexity \
    pydantic-ai \
    ragas \
    smolagents \
    tavily-python \
    tweet-preprocessor

# Install PyTorch with pip (https://pytorch.org/get-started/locally/)
# langchain-openai must be updated to avoid a pydantic v2 error:
# https://github.com/run-llama/llama_index/issues/16540
# hadolint ignore=DL3013
RUN pip install --no-cache-dir --index-url 'https://download.pytorch.org/whl/cpu' \
    'torch' \
    'torchaudio' \
    'torchvision' && \
    pip install --upgrade langchain-openai && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

WORKDIR "${HOME}"
EXPOSE 4040
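
Related to the pandas pinning note in the `mamba install` block above, a quick check one might run in this CPU image to confirm that the installed pandas works with the Pandas API on Spark; this is a sketch and assumes `pyspark` is importable via the spark-config hook.

```python
# Sketch: confirm pandas / pandas-on-Spark interplay in the running image.
import pandas as pd
import pyspark
import pyspark.pandas as ps

print("pyspark:", pyspark.__version__)
print("pandas:", pd.__version__)  # pinned to 2.2.2 in the Dockerfile above

psdf = ps.DataFrame({"x": [1, 2, 3]})  # creates a default Spark session if needed
print(psdf.x.mean())  # executed by Spark, returns 2.0
```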

jupyterhub/images/datastack-notebook/README.md (new file, 5 lines)
@@ -0,0 +1,5 @@
# Jupyter Notebook Image

Custom Jupyter notebook kernel image derived from the official Jupyter Docker Stacks images:

[jupyter/docker-stacks: Ready-to-run Docker images containing Jupyter applications](https://github.com/jupyter/docker-stacks)

jupyterhub/images/datastack-notebook/ipython_kernel_config.py (new file, 13 lines)
@@ -0,0 +1,13 @@
# Configuration file for ipython-kernel.
# See <https://ipython.readthedocs.io/en/stable/config/options/kernel.html>

# With IPython >= 6.0.0, all output to stdout/stderr is captured.
# This is the case for subprocesses and for output from compiled libraries like Spark.
# Those logs then end up both in the notebook server logs and in the notebook outputs.
# Logs are particularly verbose with Spark, which is why we turn capturing off via the flag below.
# <https://github.com/jupyter/docker-stacks/issues/1423>

# Attempt to capture and forward low-level output, e.g. produced by Extension libraries.
# Default: True
# type:ignore
c.IPKernelApp.capture_fd_output = False  # noqa: F821

jupyterhub/images/datastack-notebook/setup_spark.py (new executable file, 131 lines)
@@ -0,0 +1,131 @@
#!/usr/bin/env python3
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

# Requirements:
# - Run as the root user
# - Required env variable: SPARK_HOME

import argparse
import logging
import os
import subprocess
from pathlib import Path

import requests
from bs4 import BeautifulSoup

LOGGER = logging.getLogger(__name__)


def get_all_refs(url: str) -> list[str]:
    """
    Get all the references for a given webpage
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    return [a["href"] for a in soup.find_all("a", href=True)]

def get_latest_spark_version() -> str:
    """
    Returns the latest version of Spark, as listed in the Spark archive
    """
    LOGGER.info("Downloading Spark versions information")
    all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
    versions = [
        ref.removeprefix("spark-").removesuffix("/")
        for ref in all_refs
        if ref.startswith("spark-") and "incubating" not in ref
    ]

    # Compare versions semantically
    def version_array(ver: str) -> tuple[int, int, int, str]:
        # "3.5.3" -> (3, 5, 3, "")
        # "4.0.0-preview2" -> (4, 0, 0, "preview2")
        arr = ver.split(".")
        assert len(arr) == 3, arr
        major, minor = int(arr[0]), int(arr[1])
        patch, _, preview = arr[2].partition("-")
        return (major, minor, int(patch), preview)

    latest_version = max(versions, key=lambda ver: version_array(ver))
    LOGGER.info(f"Latest version: {latest_version}")
    return latest_version

def download_spark(
    spark_version: str,
    hadoop_version: str,
    scala_version: str,
    spark_download_url: Path,
) -> str:
    """
    Downloads and unpacks Spark
    The resulting Spark directory name is returned
    """
    LOGGER.info("Downloading and unpacking Spark")
    spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}"
    if scala_version:
        spark_dir_name += f"-scala{scala_version}"
    LOGGER.info(f"Spark directory name: {spark_dir_name}")
    spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz"

    tmp_file = Path("/tmp/spark.tar.gz")
    subprocess.check_call(
        ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url]
    )
    subprocess.check_call(
        [
            "tar",
            "xzf",
            tmp_file,
            "-C",
            "/usr/local",
            "--owner",
            "root",
            "--group",
            "root",
            "--no-same-owner",
        ]
    )
    tmp_file.unlink()
    return spark_dir_name

def configure_spark(spark_dir_name: str, spark_home: Path) -> None:
    """
    Creates a ${SPARK_HOME} symlink to a versioned spark directory
    Creates a 10spark-config.sh symlink to source PYTHONPATH automatically
    """
    LOGGER.info("Configuring Spark")
    subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home])

    # Add a link in the before_notebook hook in order to source PYTHONPATH automatically
    CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh"
    subprocess.check_call(
        ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT]
    )

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--spark-version", required=True)
    arg_parser.add_argument("--hadoop-version", required=True)
    arg_parser.add_argument("--scala-version", required=True)
    arg_parser.add_argument("--spark-download-url", type=Path, required=True)
    args = arg_parser.parse_args()

    args.spark_version = args.spark_version or get_latest_spark_version()

    spark_dir_name = download_spark(
        spark_version=args.spark_version,
        hadoop_version=args.hadoop_version,
        scala_version=args.scala_version,
        spark_download_url=args.spark_download_url,
    )
    configure_spark(
        spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"])
    )