fix(mlflow): Fix auth and service monitor

This commit is contained in:
Masaki Yatsu
2025-11-09 21:31:33 +09:00
parent f429bcb3f8
commit d3a5aa289d
5 changed files with 143 additions and 11 deletions

1
mlflow/.gitignore vendored
View File

@@ -2,4 +2,5 @@ values.yaml
mlflow-db-external-secret.yaml mlflow-db-external-secret.yaml
mlflow-s3-external-secret.yaml mlflow-s3-external-secret.yaml
mlflow-oidc-config.yaml mlflow-oidc-config.yaml
mlflow-middleware.yaml
image/.buildx-cache image/.buildx-cache

View File

@@ -156,17 +156,115 @@ with mlflow.start_run():
#### Authentication for API Access #### Authentication for API Access
For programmatic access, create an access token: For programmatic access (Python scripts, notebooks, CI/CD), you need to create an access key.
1. Log in to MLflow UI **Step 1: Create Access Key via Web UI**
2. Navigate to Permissions UI → Create access token
3. Use token in your code: 1. Navigate to `https://your-mlflow-host/` and log in via Keycloak
2. You will be redirected to the MLflow Permission Manager UI
3. Click the **"Create access key"** button at the top of the page
4. In the dialog that appears:
- Select an expiration date (maximum 1 year from today)
- Click **"Request Token"**
5. Copy the generated access key from the "Access Key" field
6. Store it securely (you won't be able to retrieve it again)
**Step 2: Use Access Key in Python**
Set the access key as an environment variable or in your Python code:
```python ```python
import os import os
os.environ["MLFLOW_TRACKING_TOKEN"] = "your-token" import mlflow
# Method 1: Set environment variable (recommended)
os.environ["MLFLOW_TRACKING_TOKEN"] = "your-access-key-here"
os.environ["MLFLOW_TRACKING_URI"] = "https://mlflow.example.com"
# Method 2: Set the tracking URI in code instead of via MLFLOW_TRACKING_URI (the access key is still read from MLFLOW_TRACKING_TOKEN)
mlflow.set_tracking_uri("https://mlflow.example.com")
# Now you can use MLflow client
mlflow.set_experiment("my-experiment")
with mlflow.start_run():
mlflow.log_param("alpha", 0.5)
mlflow.log_metric("rmse", 0.786)
``` ```
**Complete Example**
```python
import os
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Configure MLflow
os.environ["MLFLOW_TRACKING_TOKEN"] = "your-access-key-here"
mlflow.set_tracking_uri("https://mlflow.example.com")
mlflow.set_experiment("iris-classification")
# Load data
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Train and log model
with mlflow.start_run():
# Log parameters
n_estimators = 100
max_depth = 5
mlflow.log_param("n_estimators", n_estimators)
mlflow.log_param("max_depth", max_depth)
# Train model
clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
clf.fit(X_train, y_train)
# Log metrics
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
mlflow.log_metric("accuracy", accuracy)
# Log model
mlflow.sklearn.log_model(clf, "model")
print(f"Model logged with accuracy: {accuracy}")
```
**Using .env File (Recommended)**
Create a `.env` file in your project:
```bash
MLFLOW_TRACKING_URI=https://mlflow.example.com
MLFLOW_TRACKING_TOKEN=your-access-key-here
```
Load it in your Python code (requires the `python-dotenv` package, e.g. `pip install python-dotenv`):
```python
from dotenv import load_dotenv
import mlflow
load_dotenv() # Loads MLFLOW_TRACKING_URI and MLFLOW_TRACKING_TOKEN
mlflow.set_experiment("my-experiment")
with mlflow.start_run():
mlflow.log_param("param1", 5)
```
**Important Notes**
- Access keys have an expiration date (max 1 year)
- Store access keys securely (use environment variables or secret management)
- Never commit access keys to version control
- Each user should create their own access key
- Expired keys need to be regenerated via the Web UI
### Model Registry ### Model Registry
Register and manage models: Register and manage models:

View File

@@ -334,10 +334,20 @@ install:
echo "Generating Helm values with OIDC enabled..." echo "Generating Helm values with OIDC enabled..."
gomplate -f values.gomplate.yaml -o values.yaml gomplate -f values.gomplate.yaml -o values.yaml
echo "Creating Traefik Middleware..."
gomplate -f mlflow-middleware.gomplate.yaml -o mlflow-middleware.yaml
kubectl apply -f mlflow-middleware.yaml
echo "Installing MLflow Helm chart from Community Charts with OIDC..." echo "Installing MLflow Helm chart from Community Charts with OIDC..."
helm upgrade --cleanup-on-fail --install mlflow community-charts/mlflow \ helm upgrade --cleanup-on-fail --install mlflow community-charts/mlflow \
--version ${MLFLOW_CHART_VERSION} -n ${MLFLOW_NAMESPACE} --wait --timeout=10m -f values.yaml --version ${MLFLOW_CHART_VERSION} -n ${MLFLOW_NAMESPACE} --wait --timeout=10m -f values.yaml
if [ "${MONITORING_ENABLED}" = "true" ]; then
echo "Enabling Prometheus monitoring for namespace ${MLFLOW_NAMESPACE}..."
kubectl label namespace ${MLFLOW_NAMESPACE} buun.channel/enable-monitoring=true --overwrite
echo "✓ Monitoring enabled"
fi
echo "" echo ""
echo "=== MLflow installed with OIDC authentication ===" echo "=== MLflow installed with OIDC authentication ==="
echo "MLflow URL: https://${MLFLOW_HOST}" echo "MLflow URL: https://${MLFLOW_HOST}"
@@ -372,6 +382,10 @@ upgrade:
echo "Generating Helm values..." echo "Generating Helm values..."
gomplate -f values.gomplate.yaml -o values.yaml gomplate -f values.gomplate.yaml -o values.yaml
echo "Creating Traefik Middleware..."
gomplate -f mlflow-middleware.gomplate.yaml -o mlflow-middleware.yaml
kubectl apply -f mlflow-middleware.yaml
echo "Upgrading MLflow Helm chart from Community Charts..." echo "Upgrading MLflow Helm chart from Community Charts..."
helm upgrade mlflow community-charts/mlflow \ helm upgrade mlflow community-charts/mlflow \
--version ${MLFLOW_CHART_VERSION} -n ${MLFLOW_NAMESPACE} --wait --timeout=10m -f values.yaml --version ${MLFLOW_CHART_VERSION} -n ${MLFLOW_NAMESPACE} --wait --timeout=10m -f values.yaml
@@ -390,6 +404,7 @@ uninstall delete-db='true':
kubectl delete secret mlflow-oidc-config -n ${MLFLOW_NAMESPACE} --ignore-not-found kubectl delete secret mlflow-oidc-config -n ${MLFLOW_NAMESPACE} --ignore-not-found
kubectl delete externalsecret mlflow-oidc-external-secret -n ${MLFLOW_NAMESPACE} \ kubectl delete externalsecret mlflow-oidc-external-secret -n ${MLFLOW_NAMESPACE} \
--ignore-not-found --ignore-not-found
kubectl delete middleware mlflow-headers -n ${MLFLOW_NAMESPACE} --ignore-not-found
just delete-namespace just delete-namespace
if [ "{{ delete-db }}" = "true" ]; then if [ "{{ delete-db }}" = "true" ]; then
just postgres::delete-db mlflow || true just postgres::delete-db mlflow || true

View File

@@ -0,0 +1,11 @@
# Traefik Middleware (gomplate template) that sets fixed X-Forwarded-* request headers.
# The namespace and host values are filled in from the MLFLOW_NAMESPACE and
# MLFLOW_HOST environment variables when the template is rendered.
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
name: mlflow-headers
namespace: {{ .Env.MLFLOW_NAMESPACE }}
spec:
headers:
customRequestHeaders:
# NOTE(review): presumably set so the backend sees the external HTTPS scheme,
# host, and port rather than the in-cluster ones — confirm against the MLflow host checks.
X-Forwarded-Proto: "https"
X-Forwarded-Host: "{{ .Env.MLFLOW_HOST }}"
X-Forwarded-Port: "443"

View File

@@ -60,8 +60,8 @@ log:
# Use oidc-auth-fastapi for FastAPI/ASGI compatibility with Uvicorn # Use oidc-auth-fastapi for FastAPI/ASGI compatibility with Uvicorn
extraArgs: extraArgs:
appName: "oidc-auth-fastapi" appName: "oidc-auth-fastapi"
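# Allow connections from external hostname (with and without port) # Allow connections from external hostname and Kubernetes internal access (NOTE: the trailing "*" entry appears to match any host, which would make the explicit entries redundant — confirm this is intended)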
allowedHosts: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443" allowedHosts: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443,mlflow.{{ .Env.MLFLOW_NAMESPACE }}.svc.cluster.local,mlflow.{{ .Env.MLFLOW_NAMESPACE }}.svc.cluster.local:5000,*"
# Extra secrets for OIDC configuration # Extra secrets for OIDC configuration
extraSecretNamesForEnvFrom: extraSecretNamesForEnvFrom:
@@ -86,13 +86,19 @@ extraEnvVars:
# Session configuration - use cachelib with filesystem backend # Session configuration - use cachelib with filesystem backend
SESSION_TYPE: "cachelib" SESSION_TYPE: "cachelib"
SESSION_CACHE_DIR: "/tmp/session" SESSION_CACHE_DIR: "/tmp/session"
# Security configuration - allow same-origin CORS and configured host
MLFLOW_SERVER_CORS_ALLOWED_ORIGINS: "https://{{ .Env.MLFLOW_HOST }}"
MLFLOW_SERVER_ALLOWED_HOSTS: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443"
MLFLOW_SERVER_X_FRAME_OPTIONS: "SAMEORIGIN"
{{- else }} {{- else }}
# Extra environment variables for S3/MinIO configuration # Extra environment variables for S3/MinIO configuration (OIDC disabled)
extraEnvVars: extraEnvVars:
MLFLOW_S3_ENDPOINT_URL: "http://minio.{{ .Env.MINIO_NAMESPACE }}.svc.cluster.local:9000" MLFLOW_S3_ENDPOINT_URL: "http://minio.{{ .Env.MINIO_NAMESPACE }}.svc.cluster.local:9000"
MLFLOW_S3_IGNORE_TLS: "true" MLFLOW_S3_IGNORE_TLS: "true"
# Disable security middleware when using Gunicorn (env var approach) # Security configuration - allow same-origin CORS and configured host
MLFLOW_SERVER_DISABLE_SECURITY_MIDDLEWARE: "true" MLFLOW_SERVER_CORS_ALLOWED_ORIGINS: "https://{{ .Env.MLFLOW_HOST }}"
MLFLOW_SERVER_ALLOWED_HOSTS: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443"
MLFLOW_SERVER_X_FRAME_OPTIONS: "SAMEORIGIN"
{{- end }} {{- end }}
# Service configuration # Service configuration
@@ -106,6 +112,7 @@ ingress:
className: "traefik" className: "traefik"
annotations: annotations:
traefik.ingress.kubernetes.io/router.entrypoints: websecure traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.middlewares: {{ .Env.MLFLOW_NAMESPACE }}-mlflow-headers@kubernetescrd
hosts: hosts:
- host: {{ .Env.MLFLOW_HOST }} - host: {{ .Env.MLFLOW_HOST }}
paths: paths:
@@ -123,7 +130,7 @@ serviceMonitor:
interval: 30s interval: 30s
telemetryPath: /metrics telemetryPath: /metrics
labels: labels:
prometheus: kube-prometheus release: kube-prometheus-stack
timeout: 10s timeout: 10s
# Resource limits # Resource limits