From 3bd59989298c19f7618be3f0a9b517946a61a0e2 Mon Sep 17 00:00:00 2001
From: Masaki Yatsu
Date: Sun, 9 Nov 2025 11:07:44 +0900
Subject: [PATCH] feat(mlflow): install MLflow

---
 justfile                                      |   1 +
 mlflow/.gitignore                             |   3 +
 mlflow/justfile                               | 293 ++++++++++++++++++
 .../mlflow-db-external-secret.gomplate.yaml   |  23 ++
 .../mlflow-s3-external-secret.gomplate.yaml   |  23 ++
 mlflow/values.gomplate.yaml                   | 108 +++++++
 6 files changed, 451 insertions(+)
 create mode 100644 mlflow/.gitignore
 create mode 100644 mlflow/justfile
 create mode 100644 mlflow/mlflow-db-external-secret.gomplate.yaml
 create mode 100644 mlflow/mlflow-s3-external-secret.gomplate.yaml
 create mode 100644 mlflow/values.gomplate.yaml

diff --git a/justfile b/justfile
index b3dd54c..e56587d 100644
--- a/justfile
+++ b/justfile
@@ -19,6 +19,7 @@ mod k8s
 mod lakekeeper
 mod longhorn
 mod metabase
+mod mlflow
 mod minio
 mod oauth2-proxy
 mod postgres
diff --git a/mlflow/.gitignore b/mlflow/.gitignore
new file mode 100644
index 0000000..81821cf
--- /dev/null
+++ b/mlflow/.gitignore
@@ -0,0 +1,3 @@
+values.yaml
+mlflow-db-external-secret.yaml
+mlflow-s3-external-secret.yaml
diff --git a/mlflow/justfile b/mlflow/justfile
new file mode 100644
index 0000000..1180916
--- /dev/null
+++ b/mlflow/justfile
@@ -0,0 +1,293 @@
+set fallback := true
+
+export MLFLOW_NAMESPACE := env("MLFLOW_NAMESPACE", "mlflow")
+export MLFLOW_CHART_VERSION := env("MLFLOW_CHART_VERSION", "1.8.0")
+export MLFLOW_HOST := env("MLFLOW_HOST", "")
+export POSTGRES_NAMESPACE := env("POSTGRES_NAMESPACE", "postgres")
+export MINIO_NAMESPACE := env("MINIO_NAMESPACE", "minio")
+export EXTERNAL_SECRETS_NAMESPACE := env("EXTERNAL_SECRETS_NAMESPACE", "external-secrets")
+export K8S_VAULT_NAMESPACE := env("K8S_VAULT_NAMESPACE", "vault")
+export PROMETHEUS_NAMESPACE := env("PROMETHEUS_NAMESPACE", "monitoring")
+
+[private]
+default:
+    @just --list --unsorted --list-submodules
+
+# Add the community-charts Helm repository and refresh the repo index
+add-helm-repo:
+    helm repo add community-charts https://community-charts.github.io/helm-charts
+    helm repo update
+
+# Remove the community-charts Helm repository
+remove-helm-repo:
+    helm repo remove community-charts
+
+# Create the MLflow namespace if it does not exist (POSIX redirection, safe under plain sh)
+create-namespace:
+    @kubectl get namespace ${MLFLOW_NAMESPACE} >/dev/null 2>&1 || \
+        kubectl create namespace ${MLFLOW_NAMESPACE}
+
+# Delete the MLflow namespace (no error if it is already gone)
+delete-namespace:
+    @kubectl delete namespace ${MLFLOW_NAMESPACE} --ignore-not-found
+
+# Create the 'mlflow' PostgreSQL database and user, then store the credentials in Vault if available
+setup-postgres-db:
+    #!/bin/bash
+    set -euo pipefail
+    echo "Setting up PostgreSQL database for MLflow..."
+
+    if just postgres::db-exists mlflow &>/dev/null; then
+        echo "Database 'mlflow' already exists."
+    else
+        echo "Creating new database 'mlflow'..."
+        just postgres::create-db mlflow
+    fi
+
+    if just postgres::user-exists mlflow &>/dev/null; then
+        echo "User 'mlflow' already exists."
+        if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
+            if db_password=$(just vault::get mlflow/postgres password 2>/dev/null); then
+                echo "Using existing password from Vault."
+            else
+                echo "Generating new password and updating Vault..."
+                db_password=$(just utils::random-password)
+                just postgres::psql -c "ALTER USER mlflow WITH PASSWORD '${db_password}';"
+            fi
+        else
+            echo "Generating new password for existing user..."
+            db_password=$(just utils::random-password)
+            just postgres::psql -c "ALTER USER mlflow WITH PASSWORD '${db_password}';"
+        fi
+    else
+        echo "Creating new user 'mlflow'..."
+        db_password=$(just utils::random-password)
+        just postgres::create-user mlflow "${db_password}"
+    fi
+
+    echo "Ensuring database permissions..."
+    just postgres::grant mlflow mlflow
+
+    if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
+        echo "External Secrets available. Storing credentials in Vault..."
+        just vault::put mlflow/postgres username=mlflow password="${db_password}"
+        echo "PostgreSQL credentials stored in Vault"
+    else
+        echo "External Secrets not available. Password will be stored in Kubernetes Secret only."
+        if helm status vault -n ${K8S_VAULT_NAMESPACE} &>/dev/null; then
+            just vault::put mlflow/postgres username=mlflow password="${db_password}"
+            echo "PostgreSQL credentials also stored in Vault for backup"
+        fi
+    fi
+    echo "PostgreSQL setup completed"
+
+# Create mlflow-db-secret via an ExternalSecret, or directly from the password stored in Vault
+create-db-secret:
+    #!/bin/bash
+    set -euo pipefail
+    just create-namespace
+    echo "Creating PostgreSQL credentials secret..."
+
+    if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
+        echo "Creating ExternalSecret for PostgreSQL credentials..."
+        kubectl delete secret mlflow-db-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
+        kubectl delete externalsecret mlflow-db-external-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
+
+        gomplate -f mlflow-db-external-secret.gomplate.yaml | kubectl apply -f -
+
+        echo "Waiting for ExternalSecret to sync..."
+        kubectl wait --for=condition=Ready externalsecret/mlflow-db-external-secret \
+            -n ${MLFLOW_NAMESPACE} --timeout=60s
+        echo "ExternalSecret synced successfully"
+    else
+        echo "External Secrets not available. Creating Kubernetes Secret directly..."
+        db_username=$(just vault::get mlflow/postgres username 2>/dev/null || echo "mlflow")
+        db_password=$(just vault::get mlflow/postgres password 2>/dev/null || true)  # let the check below report a missing password instead of 'set -e' aborting here
+
+        if [ -z "${db_password}" ]; then
+            echo "Error: PostgreSQL password not found in Vault"
+            echo "Please run 'just mlflow::setup-postgres-db' first"
+            exit 1
+        fi
+
+        kubectl delete secret mlflow-db-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
+        kubectl create secret generic mlflow-db-secret -n ${MLFLOW_NAMESPACE} \
+            --from-literal=username="${db_username}" \
+            --from-literal=password="${db_password}"
+        echo "Kubernetes Secret created"
+    fi
+
+# Delete mlflow-db-secret and its ExternalSecret
+delete-db-secret:
+    @kubectl delete secret mlflow-db-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
+    @kubectl delete externalsecret mlflow-db-external-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
+
+# Create mlflow-s3-secret from the MinIO root credentials (ExternalSecret when available, else a plain Secret)
+create-s3-secret:
+    #!/bin/bash
+    set -euo pipefail
+    just create-namespace
+    echo "Creating MinIO/S3 credentials secret..."
+
+    if ! kubectl get secret minio -n ${MINIO_NAMESPACE} &>/dev/null; then
+        echo "Error: MinIO root credentials not found"
+        echo "Please install MinIO first with 'just minio::install'"
+        exit 1
+    fi
+
+    accesskey=$(kubectl get secret minio -n ${MINIO_NAMESPACE} \
+        -o jsonpath='{.data.rootUser}' | base64 --decode)
+    secretkey=$(kubectl get secret minio -n ${MINIO_NAMESPACE} \
+        -o jsonpath='{.data.rootPassword}' | base64 --decode)
+
+    if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
+        echo "Creating ExternalSecret for MinIO credentials..."
+        just vault::put mlflow/s3 accesskey="${accesskey}" secretkey="${secretkey}"
+
+        kubectl delete secret mlflow-s3-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
+        kubectl delete externalsecret mlflow-s3-external-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
+
+        gomplate -f mlflow-s3-external-secret.gomplate.yaml | kubectl apply -f -
+
+        echo "Waiting for ExternalSecret to sync..."
+        kubectl wait --for=condition=Ready externalsecret/mlflow-s3-external-secret \
+            -n ${MLFLOW_NAMESPACE} --timeout=60s
+        echo "ExternalSecret synced successfully"
+    else
+        echo "External Secrets not available. Creating Kubernetes Secret directly..."
+        kubectl delete secret mlflow-s3-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
+        kubectl create secret generic mlflow-s3-secret -n ${MLFLOW_NAMESPACE} \
+            --from-literal=AWS_ACCESS_KEY_ID="${accesskey}" \
+            --from-literal=AWS_SECRET_ACCESS_KEY="${secretkey}"
+        echo "Kubernetes Secret created"
+
+        if helm status vault -n ${K8S_VAULT_NAMESPACE} &>/dev/null; then
+            just vault::put mlflow/s3 accesskey="${accesskey}" secretkey="${secretkey}"
+            echo "MinIO credentials also stored in Vault for backup"
+        fi
+    fi
+
+# Delete mlflow-s3-secret and its ExternalSecret
+delete-s3-secret:
+    @kubectl delete secret mlflow-s3-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
+    @kubectl delete externalsecret mlflow-s3-external-secret -n ${MLFLOW_NAMESPACE} --ignore-not-found
+
+# Install MLflow: check PostgreSQL/MinIO, create DB, secrets and bucket, then deploy the Helm chart
+install: check-env
+    #!/bin/bash
+    set -euo pipefail
+    echo "Installing MLflow..."
+    just create-namespace
+
+    if ! kubectl get service postgres-cluster-rw -n ${POSTGRES_NAMESPACE} &>/dev/null; then
+        echo "Error: PostgreSQL cluster not found"
+        echo "Please install PostgreSQL first with 'just postgres::install'"
+        exit 1
+    fi
+
+    if ! kubectl get service minio -n ${MINIO_NAMESPACE} &>/dev/null; then
+        echo "Error: MinIO not found"
+        echo "Please install MinIO first with 'just minio::install'"
+        exit 1
+    fi
+
+    just setup-postgres-db
+    just create-db-secret
+    just create-s3-secret
+
+    # Create mlflow bucket in MinIO if it doesn't exist
+    if ! just minio::bucket-exists mlflow; then
+        echo "Creating 'mlflow' bucket in MinIO..."
+        just minio::create-bucket mlflow
+    else
+        echo "Bucket 'mlflow' already exists"
+    fi
+
+    just add-helm-repo
+
+    echo "Generating Helm values..."
+    gomplate -f values.gomplate.yaml -o values.yaml
+
+    echo "Installing MLflow Helm chart from Community Charts..."
+    helm upgrade --cleanup-on-fail --install mlflow \
+        community-charts/mlflow \
+        --version ${MLFLOW_CHART_VERSION} \
+        -n ${MLFLOW_NAMESPACE} \
+        --wait \
+        --timeout=10m \
+        -f values.yaml
+
+    echo ""
+    echo "=== MLflow installed ==="
+    echo "MLflow URL: https://${MLFLOW_HOST}"
+    echo ""
+    echo "Next steps:"
+    echo " 1. Configure OAuth2 Proxy for authentication (recommended)"
+    echo " 2. Access MLflow UI at https://${MLFLOW_HOST}"
+
+# Upgrade MLflow: regenerate values.yaml and upgrade the existing Helm release
+upgrade: check-env
+    #!/bin/bash
+    set -euo pipefail
+    echo "Upgrading MLflow..."
+
+    echo "Generating Helm values..."
+    gomplate -f values.gomplate.yaml -o values.yaml
+
+    echo "Upgrading MLflow Helm chart from Community Charts..."
+    helm upgrade mlflow \
+        community-charts/mlflow \
+        --version ${MLFLOW_CHART_VERSION} \
+        -n ${MLFLOW_NAMESPACE} \
+        --wait \
+        --timeout=10m \
+        -f values.yaml
+
+    echo "MLflow upgraded successfully"
+    echo "Access MLflow at: https://${MLFLOW_HOST}"
+
+# Uninstall MLflow; also drops the 'mlflow' database and user unless delete-db is not 'true'
+uninstall delete-db='true':
+    #!/bin/bash
+    set -euo pipefail
+    echo "Uninstalling MLflow..."
+    helm uninstall mlflow -n ${MLFLOW_NAMESPACE} --ignore-not-found
+    just delete-db-secret
+    just delete-s3-secret
+    just delete-namespace
+    if [ "{{ delete-db }}" = "true" ]; then
+        just postgres::delete-db mlflow || true
+        just postgres::delete-user mlflow || true
+    fi
+    echo "MLflow uninstalled"
+
+# Delete the MLflow database, user and Vault entries after interactive confirmation
+cleanup:
+    #!/bin/bash
+    set -euo pipefail
+    echo "This will delete the MLflow database, user, and all secrets."
+    if gum confirm "Are you sure you want to proceed?"; then
+        echo "Cleaning up MLflow resources..."
+        just postgres::delete-db mlflow || true
+        just postgres::delete-user mlflow || true
+        just vault::delete mlflow/postgres || true
+        just vault::delete mlflow/s3 || true
+        echo "Cleanup completed"
+    else
+        echo "Cleanup cancelled"
+    fi
+
+# Ensure MLFLOW_HOST is set: prompt until non-empty, then persist it with env::set
+[private]
+check-env:
+    #!/bin/bash
+    set -euo pipefail
+    if [ -z "${MLFLOW_HOST}" ]; then
+        while [ -z "${MLFLOW_HOST}" ]; do
+            MLFLOW_HOST=$(
+                gum input --prompt="MLflow host (FQDN): " --width=100 \
+                    --placeholder="e.g., mlflow.example.com"
+            )
+        done
+        just env::set MLFLOW_HOST="${MLFLOW_HOST}"
+    fi
diff --git a/mlflow/mlflow-db-external-secret.gomplate.yaml b/mlflow/mlflow-db-external-secret.gomplate.yaml
new file mode 100644
index 0000000..d47dae3
--- /dev/null
+++ b/mlflow/mlflow-db-external-secret.gomplate.yaml
@@ -0,0 +1,23 @@
+---
+apiVersion: external-secrets.io/v1
+kind: ExternalSecret
+metadata:
+  name: mlflow-db-external-secret
+  namespace: {{ .Env.MLFLOW_NAMESPACE }}
+spec:
+  refreshInterval: 1h
+  secretStoreRef:
+    name: vault-secret-store
+    kind: ClusterSecretStore
+  target:
+    name: mlflow-db-secret
+    creationPolicy: Owner
+  data:
+    - secretKey: username
+      remoteRef:
+        key: mlflow/postgres
+        property: username
+    - secretKey: password
+      remoteRef:
+        key: mlflow/postgres
+        property: password
diff --git a/mlflow/mlflow-s3-external-secret.gomplate.yaml b/mlflow/mlflow-s3-external-secret.gomplate.yaml
new file mode 100644
index 0000000..44d079a
--- /dev/null
+++ b/mlflow/mlflow-s3-external-secret.gomplate.yaml
@@ -0,0 +1,23 @@
+---
+apiVersion: external-secrets.io/v1
+kind: ExternalSecret
+metadata:
+  name: mlflow-s3-external-secret
+  namespace: {{ .Env.MLFLOW_NAMESPACE }}
+spec:
+  refreshInterval: 1h
+  secretStoreRef:
+    name: vault-secret-store
+    kind: ClusterSecretStore
+  target:
+    name: mlflow-s3-secret
+    creationPolicy: Owner
+  data:
+    - secretKey: AWS_ACCESS_KEY_ID
+      remoteRef:
+        key: mlflow/s3
+        property: accesskey
+    - secretKey: AWS_SECRET_ACCESS_KEY
+      remoteRef:
+        key: mlflow/s3
+        property: secretkey
diff --git a/mlflow/values.gomplate.yaml b/mlflow/values.gomplate.yaml
new file mode 100644
index 0000000..ace96d2
--- /dev/null
+++ b/mlflow/values.gomplate.yaml
@@ -0,0 +1,108 @@
+---
+# Replica count
+replicaCount: 1
+
+# Image configuration (Community Charts uses burakince/mlflow)
+image:
+  repository: burakince/mlflow
+  pullPolicy: IfNotPresent
+  tag: "3.6.0" # MLflow 3.6.0
+
+# Backend store configuration (PostgreSQL)
+backendStore:
+  # Enable database migration
+  databaseMigration: true
+  # Enable database connection check
+  databaseConnectionCheck: true
+
+  postgres:
+    enabled: true
+    host: "postgres-cluster-rw.{{ .Env.POSTGRES_NAMESPACE }}.svc.cluster.local"
+    port: 5432
+    database: "mlflow"
+    driver: ""
+
+  # Use existing Kubernetes secret for database credentials
+  existingDatabaseSecret:
+    name: "mlflow-db-secret"
+    usernameKey: "username"
+    passwordKey: "password"
+
+# Artifact root configuration (MinIO/S3)
+artifactRoot:
+  # Enable proxied artifact storage
+  proxiedArtifactStorage: true
+
+  s3:
+    enabled: true
+    bucket: "mlflow"
+    path: ""
+
+    # Use existing Kubernetes secret for S3 credentials
+    existingSecret:
+      name: "mlflow-s3-secret"
+      keyOfAccessKeyId: "AWS_ACCESS_KEY_ID"
+      keyOfSecretAccessKey: "AWS_SECRET_ACCESS_KEY"
+
+# Extra environment variables for S3/MinIO configuration
+extraEnvVars:
+  MLFLOW_S3_ENDPOINT_URL: "http://minio.{{ .Env.MINIO_NAMESPACE }}.svc.cluster.local:9000"
+  MLFLOW_S3_IGNORE_TLS: "true"
+  # Disable security middleware when using Gunicorn (env var approach)
+  MLFLOW_SERVER_DISABLE_SECURITY_MIDDLEWARE: "true"
+
+# Service configuration
+service:
+  type: ClusterIP
+  port: 5000
+
+# Ingress configuration (rendering fails if MLFLOW_HOST is empty, so no host-less Ingress is deployed)
+ingress:
+  enabled: true
+  className: "traefik"
+  annotations:
+    traefik.ingress.kubernetes.io/router.entrypoints: websecure
+  hosts:
+    - host: {{ .Env.MLFLOW_HOST | required "MLFLOW_HOST must be set" }}
+      paths:
+        - path: /
+          pathType: Prefix
+  tls:
+    - hosts:
+        - {{ .Env.MLFLOW_HOST | required "MLFLOW_HOST must be set" }}
+
+# ServiceMonitor for Prometheus
+serviceMonitor:
+  enabled: true
+  useServicePort: false
+  namespace: "{{ .Env.PROMETHEUS_NAMESPACE }}"
+  interval: 30s
+  telemetryPath: /metrics
+  labels:
+    prometheus: kube-prometheus
+  timeout: 10s
+
+# Resource limits
+resources:
+  limits:
+    cpu: 1000m
+    memory: 2Gi
+  requests:
+    cpu: 100m
+    memory: 512Mi
+
+# Security context
+podSecurityContext:
+  fsGroup: 1001
+  fsGroupChangePolicy: "OnRootMismatch"
+
+securityContext:
+  allowPrivilegeEscalation: false
+  capabilities:
+    drop:
+      - ALL
+  readOnlyRootFilesystem: false
+  runAsNonRoot: true
+  privileged: false
+  runAsUser: 1001
+  runAsGroup: 1001