Files
buun-stack/mlflow/justfile
2025-11-09 21:31:33 +09:00

433 lines
17 KiB
Makefile

# Module-level settings and tunables for the MLflow recipes below.
# `fallback` lets just continue searching parent directories' justfiles when a
# recipe named on the command line is not defined in this file.
set fallback := true
# Every variable below takes its value from the environment variable of the
# same name, falls back to the quoted default, and is exported into the
# environment of each recipe.
# Kubernetes namespace that the MLflow resources are created in.
export MLFLOW_NAMESPACE := env("MLFLOW_NAMESPACE", "mlflow")
# Chart version handed to `helm upgrade --version` by install/upgrade.
export MLFLOW_CHART_VERSION := env("MLFLOW_CHART_VERSION", "1.8.0")
# Public FQDN of MLflow; when empty, install/upgrade prompt for it with gum.
export MLFLOW_HOST := env("MLFLOW_HOST", "")
# Registry and tag used for the custom image: ${IMAGE_REGISTRY}/mlflow:${MLFLOW_IMAGE_TAG}.
export IMAGE_REGISTRY := env("IMAGE_REGISTRY", "localhost:30500")
export MLFLOW_IMAGE_TAG := env("MLFLOW_IMAGE_TAG", "3.6.0-oidc")
# Not referenced by the recipes reviewed here; presumably consumed by
# values.gomplate.yaml -- TODO confirm.
export MLFLOW_IMAGE_PULL_POLICY := env("MLFLOW_IMAGE_PULL_POLICY", "IfNotPresent")
# install re-exports this as "true" before rendering the Helm values; upgrade
# uses whatever value is set here.
export MLFLOW_OIDC_ENABLED := env("MLFLOW_OIDC_ENABLED", "true")
# Namespaces of the services that the recipes probe with kubectl/helm.
export POSTGRES_NAMESPACE := env("POSTGRES_NAMESPACE", "postgres")
export MINIO_NAMESPACE := env("MINIO_NAMESPACE", "minio")
export EXTERNAL_SECRETS_NAMESPACE := env("EXTERNAL_SECRETS_NAMESPACE", "external-secrets")
export K8S_VAULT_NAMESPACE := env("K8S_VAULT_NAMESPACE", "vault")
# When empty and a kube-prometheus-stack release is found, install/upgrade ask
# interactively; without that release the recipes force it to "false".
export MONITORING_ENABLED := env("MONITORING_ENABLED", "")
export PROMETHEUS_NAMESPACE := env("PROMETHEUS_NAMESPACE", "monitoring")
# Keycloak realm passed to the keycloak:: recipes.
export KEYCLOAK_REALM := env("KEYCLOAK_REALM", "buunstack")
# Not referenced by the recipes reviewed here; presumably consumed by the
# gomplate templates -- TODO confirm.
export KEYCLOAK_HOST := env("KEYCLOAK_HOST", "")
# Print the recipes of this module (and its submodules) in declaration order
[private]
default:
    @just --list --list-submodules --unsorted
# Register the community-charts Helm repository and refresh the local chart index
add-helm-repo:
    helm repo add community-charts https://community-charts.github.io/helm-charts
    helm repo update
# Unregister the community-charts Helm repository
remove-helm-repo:
    helm repo remove community-charts
# Build the custom MLflow image (OIDC auth plugin) from the image/ directory
build-image:
    #!/bin/bash
    set -euo pipefail
    # Full image reference: <registry>/mlflow:<tag>
    image_ref=${IMAGE_REGISTRY}/mlflow:${MLFLOW_IMAGE_TAG}
    echo "Building MLflow image with OIDC auth plugin..."
    # The build context is the image/ directory next to this justfile.
    cd image
    docker build -t ${image_ref} .
    echo "Image built: ${image_ref}"
# Push the custom MLflow image to the configured registry
push-image:
    #!/bin/bash
    set -euo pipefail
    # Same reference that build-image tags: <registry>/mlflow:<tag>
    image_ref=${IMAGE_REGISTRY}/mlflow:${MLFLOW_IMAGE_TAG}
    echo "Pushing MLflow image to registry..."
    docker push ${image_ref}
    echo "Image pushed: ${image_ref}"
# Build the custom MLflow image, then push it (runs build-image followed by push-image)
build-and-push-image: build-image push-image
# This is a linewise recipe, so it runs under just's default `sh` (no
# `set shell` is visible in this file). The bash-only `&>` redirect is therefore
# avoided: a POSIX shell such as dash reads `cmd &>/dev/null` as "run cmd in the
# background, then perform an empty redirect", which always succeeds and would
# skip the create. `>/dev/null 2>&1` works in every POSIX shell.
# Create the MLflow namespace if it does not already exist
create-namespace:
    @kubectl get namespace "${MLFLOW_NAMESPACE}" >/dev/null 2>&1 || \
        kubectl create namespace "${MLFLOW_NAMESPACE}"
# Delete the MLflow namespace; succeeds even when it is already gone
delete-namespace:
    @kubectl delete namespace --ignore-not-found ${MLFLOW_NAMESPACE}
# Ensure the 'mlflow' and 'mlflow_auth' databases and the 'mlflow' role exist, then record the credentials in Vault
setup-postgres-db:
    #!/bin/bash
    set -euo pipefail

    # Succeeds when the External Secrets Operator Helm release is installed.
    external_secrets_installed() {
        helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null
    }

    # Generate a fresh password into db_password and apply it to the existing role.
    rotate_mlflow_password() {
        db_password=$(just utils::random-password)
        just postgres::psql -c "ALTER USER mlflow WITH PASSWORD '${db_password}';"
    }

    echo "Setting up PostgreSQL database for MLflow..."

    # Main tracking database.
    if ! just postgres::db-exists mlflow &>/dev/null; then
        echo "Creating new database 'mlflow'..."
        just postgres::create-db mlflow
    else
        echo "Database 'mlflow' already exists."
    fi

    # Role: create it, reuse the password already held in Vault, or rotate it.
    if ! just postgres::user-exists mlflow &>/dev/null; then
        echo "Creating new user 'mlflow'..."
        db_password=$(just utils::random-password)
        just postgres::create-user mlflow "${db_password}"
    else
        echo "User 'mlflow' already exists."
        if ! external_secrets_installed; then
            echo "Generating new password for existing user..."
            rotate_mlflow_password
        elif db_password=$(just vault::get mlflow/postgres password 2>/dev/null); then
            echo "Using existing password from Vault."
        else
            echo "Generating new password and updating Vault..."
            rotate_mlflow_password
        fi
    fi

    echo "Ensuring database permissions..."
    just postgres::grant mlflow mlflow

    # Separate database used for OIDC user management.
    if ! just postgres::db-exists mlflow_auth &>/dev/null; then
        echo "Creating new database 'mlflow_auth' for OIDC authentication..."
        just postgres::create-db mlflow_auth
    else
        echo "Database 'mlflow_auth' already exists."
    fi
    echo "Granting permissions on mlflow_auth to mlflow user..."
    just postgres::grant mlflow_auth mlflow

    # Persist the credentials: always with External Secrets, otherwise only
    # when a Vault release is present.
    if external_secrets_installed; then
        echo "External Secrets available. Storing credentials in Vault..."
        just vault::put mlflow/postgres username=mlflow password="${db_password}"
        echo "PostgreSQL credentials stored in Vault"
    else
        echo "External Secrets not available. Password will be stored in Kubernetes Secret only."
        if helm status vault -n ${K8S_VAULT_NAMESPACE} &>/dev/null; then
            just vault::put mlflow/postgres username=mlflow password="${db_password}"
            echo "PostgreSQL credentials also stored in Vault for backup"
        fi
    fi

    echo "PostgreSQL setup completed"
# Create the mlflow-db-secret holding the PostgreSQL credentials (ExternalSecret when the operator is installed, plain Secret otherwise)
create-db-secret:
    #!/bin/bash
    set -euo pipefail
    just create-namespace
    echo "Creating PostgreSQL credentials secret..."
    if helm status external-secrets -n "${EXTERNAL_SECRETS_NAMESPACE}" &>/dev/null; then
        echo "Creating ExternalSecret for PostgreSQL credentials..."
        # Remove any previous Secret/ExternalSecret so the operator re-creates
        # the Secret from the freshly applied manifest.
        kubectl delete secret mlflow-db-secret -n "${MLFLOW_NAMESPACE}" --ignore-not-found
        kubectl delete externalsecret mlflow-db-external-secret -n "${MLFLOW_NAMESPACE}" --ignore-not-found
        gomplate -f mlflow-db-external-secret.gomplate.yaml | kubectl apply -f -
        echo "Waiting for ExternalSecret to sync..."
        kubectl wait --for=condition=Ready externalsecret/mlflow-db-external-secret \
            -n "${MLFLOW_NAMESPACE}" --timeout=60s
        echo "ExternalSecret synced successfully"
    else
        echo "External Secrets not available. Creating Kubernetes Secret directly..."
        db_username=$(just vault::get mlflow/postgres username 2>/dev/null || echo "mlflow")
        # `|| true` keeps a failed Vault lookup from tripping `set -e` on the
        # assignment, so the explicit error below is reached.
        db_password=$(just vault::get mlflow/postgres password 2>/dev/null || true)
        if [ -z "${db_password}" ]; then
            echo "Error: PostgreSQL password not found in Vault"
            echo "Please run 'just mlflow::setup-postgres-db' first"
            exit 1
        fi
        kubectl delete secret mlflow-db-secret -n "${MLFLOW_NAMESPACE}" --ignore-not-found
        kubectl create secret generic mlflow-db-secret -n "${MLFLOW_NAMESPACE}" \
            --from-literal=username="${db_username}" \
            --from-literal=password="${db_password}"
        echo "Kubernetes Secret created"
    fi
# Remove the PostgreSQL credentials Secret and its ExternalSecret, if present
delete-db-secret:
    @kubectl -n ${MLFLOW_NAMESPACE} delete secret mlflow-db-secret --ignore-not-found
    @kubectl -n ${MLFLOW_NAMESPACE} delete externalsecret mlflow-db-external-secret --ignore-not-found
# Create the mlflow-s3-secret from the MinIO root credentials (ExternalSecret when the operator is installed, plain Secret otherwise)
create-s3-secret:
    #!/bin/bash
    set -euo pipefail

    # Print the decoded value of one data field of the MinIO root secret.
    minio_secret_field() {
        kubectl get secret minio -n ${MINIO_NAMESPACE} \
            -o jsonpath="{.data.$1}" | base64 --decode
    }

    just create-namespace
    echo "Creating MinIO/S3 credentials secret..."

    # Guard: the MinIO root secret must exist before anything is copied.
    if ! kubectl get secret minio -n ${MINIO_NAMESPACE} &>/dev/null; then
        echo "Error: MinIO root credentials not found"
        echo "Please install MinIO first with 'just minio::install'"
        exit 1
    fi

    accesskey=$(minio_secret_field rootUser)
    secretkey=$(minio_secret_field rootPassword)

    if ! helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
        echo "External Secrets not available. Creating Kubernetes Secret directly..."
        kubectl delete secret mlflow-s3-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
        kubectl create secret generic mlflow-s3-secret -n ${MLFLOW_NAMESPACE} \
            --from-literal=AWS_ACCESS_KEY_ID="${accesskey}" \
            --from-literal=AWS_SECRET_ACCESS_KEY="${secretkey}"
        echo "Kubernetes Secret created"
        # Keep a copy in Vault when a Vault release happens to be installed.
        if helm status vault -n ${K8S_VAULT_NAMESPACE} &>/dev/null; then
            just vault::put mlflow/s3 accesskey="${accesskey}" secretkey="${secretkey}"
            echo "MinIO credentials also stored in Vault for backup"
        fi
    else
        echo "Creating ExternalSecret for MinIO credentials..."
        # Write the credentials to Vault first, then replace any stale
        # Secret/ExternalSecret with the rendered manifest.
        just vault::put mlflow/s3 accesskey="${accesskey}" secretkey="${secretkey}"
        kubectl delete secret mlflow-s3-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
        kubectl delete externalsecret mlflow-s3-external-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
        gomplate -f mlflow-s3-external-secret.gomplate.yaml | kubectl apply -f -
        echo "Waiting for ExternalSecret to sync..."
        kubectl wait --for=condition=Ready externalsecret/mlflow-s3-external-secret \
            -n ${MLFLOW_NAMESPACE} --timeout=60s
        echo "ExternalSecret synced successfully"
    fi
# Remove the MinIO/S3 credentials Secret and its ExternalSecret, if present
delete-s3-secret:
    @kubectl -n ${MLFLOW_NAMESPACE} delete secret mlflow-s3-secret --ignore-not-found
    @kubectl -n ${MLFLOW_NAMESPACE} delete externalsecret mlflow-s3-external-secret --ignore-not-found
# End-to-end installation. In order: verify that PostgreSQL and MinIO are
# present, collect the host name and monitoring choice, provision the database,
# secrets and bucket, (re)create the Keycloak client and groups, publish the
# OIDC configuration as a Secret, render the Helm values and Traefik middleware,
# and finally install the chart.
# Install MLflow with Keycloak OIDC authentication (requires PostgreSQL and MinIO)
install:
    #!/bin/bash
    set -euo pipefail
    echo "Installing MLflow..."
    just create-namespace
    # Preconditions: abort early when the PostgreSQL or MinIO service is missing.
    if ! kubectl get service postgres-cluster-rw -n ${POSTGRES_NAMESPACE} &>/dev/null; then
        echo "Error: PostgreSQL cluster not found"
        echo "Please install PostgreSQL first with 'just postgres::install'"
        exit 1
    fi
    if ! kubectl get service minio -n ${MINIO_NAMESPACE} &>/dev/null; then
        echo "Error: MinIO not found"
        echo "Please install MinIO first with 'just minio::install'"
        exit 1
    fi
    # Ask for the host until a non-empty value is entered. MLFLOW_HOST arrives
    # already exported by just, so the new value is still visible to the child
    # processes started below.
    if [ -z "${MLFLOW_HOST}" ]; then
        while [ -z "${MLFLOW_HOST}" ]; do
            MLFLOW_HOST=$(
                gum input --prompt="MLflow host (FQDN): " --width=100 \
                    --placeholder="e.g., mlflow.example.com"
            )
        done
    fi
    # Monitoring is only offered when a kube-prometheus-stack release exists;
    # an explicit MONITORING_ENABLED value suppresses the prompt.
    if helm status kube-prometheus-stack -n ${PROMETHEUS_NAMESPACE} &>/dev/null; then
        if [ -z "${MONITORING_ENABLED}" ]; then
            if gum confirm "Enable Prometheus monitoring (ServiceMonitor)?"; then
                MONITORING_ENABLED="true"
            else
                MONITORING_ENABLED="false"
            fi
        fi
    else
        MONITORING_ENABLED="false"
    fi
    # Database, credentials secrets and artifact bucket.
    just setup-postgres-db
    just create-db-secret
    just create-s3-secret
    if ! just minio::bucket-exists mlflow; then
        echo "Creating 'mlflow' bucket in MinIO..."
        just minio::create-bucket mlflow
    else
        echo "Bucket 'mlflow' already exists"
    fi
    just add-helm-repo
    # Recreate the Keycloak client with a freshly generated secret; the delete
    # is best-effort (`|| true`) so a missing client does not abort the install.
    just keycloak::delete-client "${KEYCLOAK_REALM}" "mlflow" || true
    oidc_client_secret=$(just utils::random-password)
    redirect_urls="https://${MLFLOW_HOST}/callback"
    just keycloak::create-client \
        realm="${KEYCLOAK_REALM}" \
        client_id="mlflow" \
        redirect_url="${redirect_urls}" \
        client_secret="${oidc_client_secret}"
    echo "✓ Keycloak client 'mlflow' created"
    # Ensure a 'groups' client scope exists, then attach it to the client.
    if ! just keycloak::get-client-scope "${KEYCLOAK_REALM}" groups &>/dev/null; then
        just keycloak::create-client-scope "${KEYCLOAK_REALM}" groups "User group memberships"
        just keycloak::add-groups-mapper-to-scope "${KEYCLOAK_REALM}" groups
        echo "✓ Groups client scope created"
    else
        echo "✓ Groups client scope already exists"
    fi
    just keycloak::add-scope-to-client "${KEYCLOAK_REALM}" mlflow groups
    echo "✓ Groups scope added to mlflow client"
    echo "Setting up MLflow groups..."
    # Group creation is best-effort; a failure here does not stop the install.
    just keycloak::create-group mlflow-admins "" "MLflow administrators with full access" || true
    just keycloak::create-group mlflow-users "" "MLflow users with basic access" || true
    echo "✓ MLflow groups configured"
    if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
        echo "External Secrets Operator detected. Storing OIDC config in Vault..."
        # Build the auth database URI from the PostgreSQL credentials in Vault
        db_username=$(just vault::get mlflow/postgres username)
        db_password=$(just vault::get mlflow/postgres password)
        auth_db_uri="postgresql://${db_username}:${db_password}@postgres-cluster-rw.${POSTGRES_NAMESPACE}.svc.cluster.local:5432/mlflow_auth"
        just vault::put "mlflow/oidc" \
            client_id="mlflow" \
            client_secret="${oidc_client_secret}" \
            auth_db_uri="${auth_db_uri}"
        # Drop stale objects before applying the rendered ExternalSecret.
        kubectl delete secret mlflow-oidc-config -n ${MLFLOW_NAMESPACE} --ignore-not-found
        kubectl delete externalsecret mlflow-oidc-external-secret -n ${MLFLOW_NAMESPACE} \
            --ignore-not-found
        # NOTE(review): presumably read by the gomplate template below -- confirm.
        export OIDC_CLIENT_SECRET="${oidc_client_secret}"
        gomplate -f mlflow-oidc-external-secret.gomplate.yaml | kubectl apply -f -
        echo "Waiting for ExternalSecret to sync..."
        kubectl wait --for=condition=Ready externalsecret/mlflow-oidc-external-secret \
            -n ${MLFLOW_NAMESPACE} --timeout=60s
    else
        echo "Creating Kubernetes secret directly..."
        # Build the auth database URI; the username falls back to "mlflow" when
        # the Vault lookup fails, while a failed password lookup aborts the
        # script (set -e).
        db_username=$(just vault::get mlflow/postgres username 2>/dev/null || echo "mlflow")
        db_password=$(just vault::get mlflow/postgres password)
        auth_db_uri="postgresql://${db_username}:${db_password}@postgres-cluster-rw.${POSTGRES_NAMESPACE}.svc.cluster.local:5432/mlflow_auth"
        kubectl delete secret mlflow-oidc-config -n ${MLFLOW_NAMESPACE} --ignore-not-found
        kubectl create secret generic mlflow-oidc-config -n ${MLFLOW_NAMESPACE} \
            --from-literal=OIDC_CLIENT_ID="mlflow" \
            --from-literal=OIDC_CLIENT_SECRET="${oidc_client_secret}" \
            --from-literal=OIDC_USERS_DB_URI="${auth_db_uri}"
        # Also keep a copy in Vault when a Vault release is installed
        if helm status vault -n ${K8S_VAULT_NAMESPACE} &>/dev/null; then
            just vault::put "mlflow/oidc" client_id="mlflow" client_secret="${oidc_client_secret}" \
                auth_db_uri="${auth_db_uri}"
        fi
    fi
    # install always enables OIDC, overriding any value from the environment.
    export MLFLOW_OIDC_ENABLED="true"
    echo "Generating Helm values with OIDC enabled..."
    # Rendered files are written next to the templates in the working directory.
    gomplate -f values.gomplate.yaml -o values.yaml
    echo "Creating Traefik Middleware..."
    gomplate -f mlflow-middleware.gomplate.yaml -o mlflow-middleware.yaml
    kubectl apply -f mlflow-middleware.yaml
    echo "Installing MLflow Helm chart from Community Charts with OIDC..."
    helm upgrade --cleanup-on-fail --install mlflow community-charts/mlflow \
        --version ${MLFLOW_CHART_VERSION} -n ${MLFLOW_NAMESPACE} --wait --timeout=10m -f values.yaml
    # Label the namespace when monitoring was enabled above.
    if [ "${MONITORING_ENABLED}" = "true" ]; then
        echo "Enabling Prometheus monitoring for namespace ${MLFLOW_NAMESPACE}..."
        kubectl label namespace ${MLFLOW_NAMESPACE} buun.channel/enable-monitoring=true --overwrite
        echo "✓ Monitoring enabled"
    fi
    echo ""
    echo "=== MLflow installed with OIDC authentication ==="
    echo "MLflow URL: https://${MLFLOW_HOST}"
    echo ""
    echo "OIDC authentication is enabled using Keycloak"
    echo "Users can sign in with their Keycloak credentials"
# Re-render the Helm values and Traefik middleware, then upgrade the existing
# release. When monitoring is enabled, the namespace receives the same
# buun.channel/enable-monitoring label that install applies, so turning
# monitoring on during an upgrade has the same effect as at install time.
# Upgrade MLflow
upgrade:
    #!/bin/bash
    set -euo pipefail
    # Ask for the host until a non-empty value is entered (no-op when preset).
    while [ -z "${MLFLOW_HOST}" ]; do
        MLFLOW_HOST=$(
            gum input --prompt="MLflow host (FQDN): " --width=100 \
                --placeholder="e.g., mlflow.example.com"
        )
    done
    # Monitoring is only offered when a kube-prometheus-stack release exists;
    # an explicit MONITORING_ENABLED value suppresses the prompt.
    if helm status kube-prometheus-stack -n "${PROMETHEUS_NAMESPACE}" &>/dev/null; then
        if [ -z "${MONITORING_ENABLED}" ]; then
            if gum confirm "Enable Prometheus monitoring (ServiceMonitor)?"; then
                MONITORING_ENABLED="true"
            else
                MONITORING_ENABLED="false"
            fi
        fi
    else
        MONITORING_ENABLED="false"
    fi
    echo "Generating Helm values..."
    gomplate -f values.gomplate.yaml -o values.yaml
    echo "Creating Traefik Middleware..."
    gomplate -f mlflow-middleware.gomplate.yaml -o mlflow-middleware.yaml
    kubectl apply -f mlflow-middleware.yaml
    echo "Upgrading MLflow Helm chart from Community Charts..."
    helm upgrade mlflow community-charts/mlflow \
        --version "${MLFLOW_CHART_VERSION}" -n "${MLFLOW_NAMESPACE}" --wait --timeout=10m -f values.yaml
    # Same namespace label as install; --overwrite makes re-runs harmless.
    if [ "${MONITORING_ENABLED}" = "true" ]; then
        echo "Enabling Prometheus monitoring for namespace ${MLFLOW_NAMESPACE}..."
        kubectl label namespace "${MLFLOW_NAMESPACE}" buun.channel/enable-monitoring=true --overwrite
        echo "✓ Monitoring enabled"
    fi
    echo "MLflow upgraded successfully"
    echo "Access MLflow at: https://${MLFLOW_HOST}"
# Removes the Helm release, the secrets, the Traefik middleware, the namespace
# and the Keycloak client. With delete-db='true' (the default) it also drops
# both databases created by setup-postgres-db ('mlflow' and 'mlflow_auth') and
# then the 'mlflow' role; pass delete-db='false' to keep the PostgreSQL data.
# Uninstall MLflow
uninstall delete-db='true':
    #!/bin/bash
    set -euo pipefail
    echo "Uninstalling MLflow..."
    helm uninstall mlflow -n "${MLFLOW_NAMESPACE}" --ignore-not-found
    just delete-db-secret
    just delete-s3-secret
    kubectl delete secret mlflow-oidc-config -n "${MLFLOW_NAMESPACE}" --ignore-not-found
    kubectl delete externalsecret mlflow-oidc-external-secret -n "${MLFLOW_NAMESPACE}" \
        --ignore-not-found
    kubectl delete middleware mlflow-headers -n "${MLFLOW_NAMESPACE}" --ignore-not-found
    just delete-namespace
    if [ "{{ delete-db }}" = "true" ]; then
        # Best-effort: each step may fail when the object is already gone.
        just postgres::delete-db mlflow || true
        # The OIDC auth database is dropped before the role that holds grants on it.
        just postgres::delete-db mlflow_auth || true
        just postgres::delete-user mlflow || true
    fi
    just keycloak::delete-client "${KEYCLOAK_REALM}" "mlflow" || true
    echo "MLflow uninstalled"
# After an interactive confirmation, drops the 'mlflow' and 'mlflow_auth'
# databases and the 'mlflow' role, deletes the Vault entries under mlflow/, and
# removes the Keycloak client. Every step is best-effort (`|| true`).
# Clean up all MLflow resources
cleanup:
    #!/bin/bash
    set -euo pipefail
    echo "This will delete the MLflow database, user, and all secrets."
    if gum confirm "Are you sure you want to proceed?"; then
        echo "Cleaning up MLflow resources..."
        just postgres::delete-db mlflow || true
        # The OIDC auth database is dropped before the role that holds grants on it.
        just postgres::delete-db mlflow_auth || true
        just postgres::delete-user mlflow || true
        just vault::delete mlflow/postgres || true
        just vault::delete mlflow/s3 || true
        just vault::delete mlflow/oidc || true
        just keycloak::delete-client "${KEYCLOAK_REALM}" "mlflow" || true
        echo "Cleanup completed"
    else
        echo "Cleanup cancelled"
    fi