fix(mlflow): Fix auth and service monitor
This commit is contained in:
1
mlflow/.gitignore
vendored
1
mlflow/.gitignore
vendored
@@ -2,4 +2,5 @@ values.yaml
|
||||
mlflow-db-external-secret.yaml
|
||||
mlflow-s3-external-secret.yaml
|
||||
mlflow-oidc-config.yaml
|
||||
mlflow-middleware.yaml
|
||||
image/.buildx-cache
|
||||
|
||||
108
mlflow/README.md
108
mlflow/README.md
@@ -156,17 +156,115 @@ with mlflow.start_run():
|
||||
|
||||
#### Authentication for API Access
|
||||
|
||||
For programmatic access, create an access token:
|
||||
For programmatic access (Python scripts, notebooks, CI/CD), you need to create an access key.
|
||||
|
||||
1. Log in to MLflow UI
|
||||
2. Navigate to Permissions UI → Create access token
|
||||
3. Use token in your code:
|
||||
**Step 1: Create Access Key via Web UI**
|
||||
|
||||
1. Navigate to `https://your-mlflow-host/` and log in via Keycloak
|
||||
2. You will be redirected to the MLflow Permission Manager UI
|
||||
3. Click the **"Create access key"** button at the top of the page
|
||||
4. In the dialog that appears:
|
||||
- Select an expiration date (maximum 1 year from today)
|
||||
- Click **"Request Token"**
|
||||
5. Copy the generated access key from the "Access Key" field
|
||||
6. Store it securely (you won't be able to retrieve it again)
|
||||
|
||||
**Step 2: Use Access Key in Python**
|
||||
|
||||
Set the access key as an environment variable or in your Python code:
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ["MLFLOW_TRACKING_TOKEN"] = "your-token"
|
||||
import mlflow
|
||||
|
||||
# Method 1: Set environment variable (recommended)
|
||||
os.environ["MLFLOW_TRACKING_TOKEN"] = "your-access-key-here"
|
||||
os.environ["MLFLOW_TRACKING_URI"] = "https://mlflow.example.com"
|
||||
|
||||
# Method 2: Set tracking URI directly (the access key is still read from MLFLOW_TRACKING_TOKEN)
|
||||
mlflow.set_tracking_uri("https://mlflow.example.com")
|
||||
|
||||
# Now you can use MLflow client
|
||||
mlflow.set_experiment("my-experiment")
|
||||
|
||||
with mlflow.start_run():
|
||||
mlflow.log_param("alpha", 0.5)
|
||||
mlflow.log_metric("rmse", 0.786)
|
||||
```
|
||||
|
||||
**Complete Example**
|
||||
|
||||
```python
|
||||
import os
|
||||
import mlflow
|
||||
import mlflow.sklearn
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
# Configure MLflow
|
||||
os.environ["MLFLOW_TRACKING_TOKEN"] = "your-access-key-here"
|
||||
mlflow.set_tracking_uri("https://mlflow.example.com")
|
||||
mlflow.set_experiment("iris-classification")
|
||||
|
||||
# Load data
|
||||
X, y = load_iris(return_X_y=True)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
|
||||
|
||||
# Train and log model
|
||||
with mlflow.start_run():
|
||||
# Log parameters
|
||||
n_estimators = 100
|
||||
max_depth = 5
|
||||
mlflow.log_param("n_estimators", n_estimators)
|
||||
mlflow.log_param("max_depth", max_depth)
|
||||
|
||||
# Train model
|
||||
clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
|
||||
clf.fit(X_train, y_train)
|
||||
|
||||
# Log metrics
|
||||
y_pred = clf.predict(X_test)
|
||||
accuracy = accuracy_score(y_test, y_pred)
|
||||
mlflow.log_metric("accuracy", accuracy)
|
||||
|
||||
# Log model
|
||||
mlflow.sklearn.log_model(clf, "model")
|
||||
|
||||
print(f"Model logged with accuracy: {accuracy}")
|
||||
```
|
||||
|
||||
**Using .env File (Recommended)**
|
||||
|
||||
Create a `.env` file in your project (keep it out of version control, e.g. by adding it to `.gitignore`):
|
||||
|
||||
```bash
|
||||
MLFLOW_TRACKING_URI=https://mlflow.example.com
|
||||
MLFLOW_TRACKING_TOKEN=your-access-key-here
|
||||
```
|
||||
|
||||
Load it in your Python code:
|
||||
|
||||
```python
|
||||
from dotenv import load_dotenv
|
||||
import mlflow
|
||||
|
||||
load_dotenv() # Loads MLFLOW_TRACKING_URI and MLFLOW_TRACKING_TOKEN
|
||||
|
||||
mlflow.set_experiment("my-experiment")
|
||||
with mlflow.start_run():
|
||||
mlflow.log_param("param1", 5)
|
||||
```
|
||||
|
||||
**Important Notes**
|
||||
|
||||
- Access keys have an expiration date (max 1 year)
|
||||
- Store access keys securely (use environment variables or secret management)
|
||||
- Never commit access keys to version control
|
||||
- Each user should create their own access key
|
||||
- Expired keys need to be regenerated via the Web UI
|
||||
|
||||
### Model Registry
|
||||
|
||||
Register and manage models:
|
||||
|
||||
@@ -334,10 +334,20 @@ install:
|
||||
echo "Generating Helm values with OIDC enabled..."
|
||||
gomplate -f values.gomplate.yaml -o values.yaml
|
||||
|
||||
echo "Creating Traefik Middleware..."
|
||||
gomplate -f mlflow-middleware.gomplate.yaml -o mlflow-middleware.yaml
|
||||
kubectl apply -f mlflow-middleware.yaml
|
||||
|
||||
echo "Installing MLflow Helm chart from Community Charts with OIDC..."
|
||||
helm upgrade --cleanup-on-fail --install mlflow community-charts/mlflow \
|
||||
--version ${MLFLOW_CHART_VERSION} -n ${MLFLOW_NAMESPACE} --wait --timeout=10m -f values.yaml
|
||||
|
||||
if [ "${MONITORING_ENABLED}" = "true" ]; then
|
||||
echo "Enabling Prometheus monitoring for namespace ${MLFLOW_NAMESPACE}..."
|
||||
kubectl label namespace ${MLFLOW_NAMESPACE} buun.channel/enable-monitoring=true --overwrite
|
||||
echo "✓ Monitoring enabled"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== MLflow installed with OIDC authentication ==="
|
||||
echo "MLflow URL: https://${MLFLOW_HOST}"
|
||||
@@ -372,6 +382,10 @@ upgrade:
|
||||
echo "Generating Helm values..."
|
||||
gomplate -f values.gomplate.yaml -o values.yaml
|
||||
|
||||
echo "Creating Traefik Middleware..."
|
||||
gomplate -f mlflow-middleware.gomplate.yaml -o mlflow-middleware.yaml
|
||||
kubectl apply -f mlflow-middleware.yaml
|
||||
|
||||
echo "Upgrading MLflow Helm chart from Community Charts..."
|
||||
helm upgrade mlflow community-charts/mlflow \
|
||||
--version ${MLFLOW_CHART_VERSION} -n ${MLFLOW_NAMESPACE} --wait --timeout=10m -f values.yaml
|
||||
@@ -390,6 +404,7 @@ uninstall delete-db='true':
|
||||
kubectl delete secret mlflow-oidc-config -n ${MLFLOW_NAMESPACE} --ignore-not-found
|
||||
kubectl delete externalsecret mlflow-oidc-external-secret -n ${MLFLOW_NAMESPACE} \
|
||||
--ignore-not-found
|
||||
kubectl delete middleware mlflow-headers -n ${MLFLOW_NAMESPACE} --ignore-not-found
|
||||
just delete-namespace
|
||||
if [ "{{ delete-db }}" = "true" ]; then
|
||||
just postgres::delete-db mlflow || true
|
||||
|
||||
11
mlflow/mlflow-middleware.gomplate.yaml
Normal file
11
mlflow/mlflow-middleware.gomplate.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: Middleware
|
||||
metadata:
|
||||
name: mlflow-headers
|
||||
namespace: {{ .Env.MLFLOW_NAMESPACE }}
|
||||
spec:
|
||||
headers:
|
||||
customRequestHeaders:
|
||||
X-Forwarded-Proto: "https"
|
||||
X-Forwarded-Host: "{{ .Env.MLFLOW_HOST }}"
|
||||
X-Forwarded-Port: "443"
|
||||
@@ -60,8 +60,8 @@ log:
|
||||
# Use oidc-auth-fastapi for FastAPI/ASGI compatibility with Uvicorn
|
||||
extraArgs:
|
||||
appName: "oidc-auth-fastapi"
|
||||
# Allow connections from external hostname (with and without port)
|
||||
allowedHosts: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443"
|
||||
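# Allow connections from external hostname and Kubernetes internal access. NOTE(review): the trailing "*" entry appears to accept any Host header, which would make the explicit entries redundant - confirm this is intended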
|
||||
allowedHosts: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443,mlflow.{{ .Env.MLFLOW_NAMESPACE }}.svc.cluster.local,mlflow.{{ .Env.MLFLOW_NAMESPACE }}.svc.cluster.local:5000,*"
|
||||
|
||||
# Extra secrets for OIDC configuration
|
||||
extraSecretNamesForEnvFrom:
|
||||
@@ -86,13 +86,19 @@ extraEnvVars:
|
||||
# Session configuration - use cachelib with filesystem backend
|
||||
SESSION_TYPE: "cachelib"
|
||||
SESSION_CACHE_DIR: "/tmp/session"
|
||||
# Security configuration - allow same-origin CORS and configured host
|
||||
MLFLOW_SERVER_CORS_ALLOWED_ORIGINS: "https://{{ .Env.MLFLOW_HOST }}"
|
||||
MLFLOW_SERVER_ALLOWED_HOSTS: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443"
|
||||
MLFLOW_SERVER_X_FRAME_OPTIONS: "SAMEORIGIN"
|
||||
{{- else }}
|
||||
# Extra environment variables for S3/MinIO configuration
|
||||
# Extra environment variables for S3/MinIO configuration (OIDC disabled)
|
||||
extraEnvVars:
|
||||
MLFLOW_S3_ENDPOINT_URL: "http://minio.{{ .Env.MINIO_NAMESPACE }}.svc.cluster.local:9000"
|
||||
MLFLOW_S3_IGNORE_TLS: "true"
|
||||
# Disable security middleware when using Gunicorn (env var approach)
|
||||
MLFLOW_SERVER_DISABLE_SECURITY_MIDDLEWARE: "true"
|
||||
# Security configuration - allow same-origin CORS and configured host
|
||||
MLFLOW_SERVER_CORS_ALLOWED_ORIGINS: "https://{{ .Env.MLFLOW_HOST }}"
|
||||
MLFLOW_SERVER_ALLOWED_HOSTS: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443"
|
||||
MLFLOW_SERVER_X_FRAME_OPTIONS: "SAMEORIGIN"
|
||||
{{- end }}
|
||||
|
||||
# Service configuration
|
||||
@@ -106,6 +112,7 @@ ingress:
|
||||
className: "traefik"
|
||||
annotations:
|
||||
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
||||
traefik.ingress.kubernetes.io/router.middlewares: {{ .Env.MLFLOW_NAMESPACE }}-mlflow-headers@kubernetescrd
|
||||
hosts:
|
||||
- host: {{ .Env.MLFLOW_HOST }}
|
||||
paths:
|
||||
@@ -123,7 +130,7 @@ serviceMonitor:
|
||||
interval: 30s
|
||||
telemetryPath: /metrics
|
||||
labels:
|
||||
prometheus: kube-prometheus
|
||||
release: kube-prometheus-stack
|
||||
timeout: 10s
|
||||
|
||||
# Resource limits
|
||||
|
||||
Reference in New Issue
Block a user