fix(mlflow): Fix auth and service monitor
This commit is contained in:
1
mlflow/.gitignore
vendored
1
mlflow/.gitignore
vendored
@@ -2,4 +2,5 @@ values.yaml
|
|||||||
mlflow-db-external-secret.yaml
|
mlflow-db-external-secret.yaml
|
||||||
mlflow-s3-external-secret.yaml
|
mlflow-s3-external-secret.yaml
|
||||||
mlflow-oidc-config.yaml
|
mlflow-oidc-config.yaml
|
||||||
|
mlflow-middleware.yaml
|
||||||
image/.buildx-cache
|
image/.buildx-cache
|
||||||
|
|||||||
108
mlflow/README.md
108
mlflow/README.md
@@ -156,17 +156,115 @@ with mlflow.start_run():
|
|||||||
|
|
||||||
#### Authentication for API Access
|
#### Authentication for API Access
|
||||||
|
|
||||||
For programmatic access, create an access token:
|
For programmatic access (Python scripts, notebooks, CI/CD), you need to create an access key.
|
||||||
|
|
||||||
1. Log in to MLflow UI
|
**Step 1: Create Access Key via Web UI**
|
||||||
2. Navigate to Permissions UI → Create access token
|
|
||||||
3. Use token in your code:
|
1. Navigate to `https://your-mlflow-host/` and log in via Keycloak
|
||||||
|
2. You will be redirected to the MLflow Permission Manager UI
|
||||||
|
3. Click the **"Create access key"** button at the top of the page
|
||||||
|
4. In the dialog that appears:
|
||||||
|
- Select an expiration date (maximum 1 year from today)
|
||||||
|
- Click **"Request Token"**
|
||||||
|
5. Copy the generated access key from the "Access Key" field
|
||||||
|
6. Store it securely (you won't be able to retrieve it again)
|
||||||
|
|
||||||
|
**Step 2: Use Access Key in Python**
|
||||||
|
|
||||||
|
Set the access key as an environment variable or in your Python code:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import os
|
import os
|
||||||
os.environ["MLFLOW_TRACKING_TOKEN"] = "your-token"
|
import mlflow
|
||||||
|
|
||||||
|
# Method 1: Set environment variable (recommended)
|
||||||
|
os.environ["MLFLOW_TRACKING_TOKEN"] = "your-access-key-here"
|
||||||
|
os.environ["MLFLOW_TRACKING_URI"] = "https://mlflow.example.com"
|
||||||
|
|
||||||
|
# Method 2: Set the tracking URI in code (the access key is still read from MLFLOW_TRACKING_TOKEN)
|
||||||
|
mlflow.set_tracking_uri("https://mlflow.example.com")
|
||||||
|
|
||||||
|
# Now you can use MLflow client
|
||||||
|
mlflow.set_experiment("my-experiment")
|
||||||
|
|
||||||
|
with mlflow.start_run():
|
||||||
|
mlflow.log_param("alpha", 0.5)
|
||||||
|
mlflow.log_metric("rmse", 0.786)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Complete Example**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
import mlflow
|
||||||
|
import mlflow.sklearn
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
from sklearn.datasets import load_iris
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.metrics import accuracy_score
|
||||||
|
|
||||||
|
# Configure MLflow (placeholder key shown; in real code, load the access key from the environment or a secret store)
|
||||||
|
os.environ["MLFLOW_TRACKING_TOKEN"] = "your-access-key-here"
|
||||||
|
mlflow.set_tracking_uri("https://mlflow.example.com")
|
||||||
|
mlflow.set_experiment("iris-classification")
|
||||||
|
|
||||||
|
# Load data
|
||||||
|
X, y = load_iris(return_X_y=True)
|
||||||
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
|
||||||
|
|
||||||
|
# Train and log model
|
||||||
|
with mlflow.start_run():
|
||||||
|
# Log parameters
|
||||||
|
n_estimators = 100
|
||||||
|
max_depth = 5
|
||||||
|
mlflow.log_param("n_estimators", n_estimators)
|
||||||
|
mlflow.log_param("max_depth", max_depth)
|
||||||
|
|
||||||
|
# Train model
|
||||||
|
clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
|
||||||
|
clf.fit(X_train, y_train)
|
||||||
|
|
||||||
|
# Log metrics
|
||||||
|
y_pred = clf.predict(X_test)
|
||||||
|
accuracy = accuracy_score(y_test, y_pred)
|
||||||
|
mlflow.log_metric("accuracy", accuracy)
|
||||||
|
|
||||||
|
# Log model
|
||||||
|
mlflow.sklearn.log_model(clf, "model")
|
||||||
|
|
||||||
|
print(f"Model logged with accuracy: {accuracy}")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Using a .env File (Recommended)**
|
||||||
|
|
||||||
|
Create a `.env` file in your project:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
MLFLOW_TRACKING_URI=https://mlflow.example.com
|
||||||
|
MLFLOW_TRACKING_TOKEN=your-access-key-here
|
||||||
|
```
|
||||||
|
|
||||||
|
Load it in your Python code (requires the `python-dotenv` package):
|
||||||
|
|
||||||
|
```python
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import mlflow
|
||||||
|
|
||||||
|
load_dotenv() # Loads MLFLOW_TRACKING_URI and MLFLOW_TRACKING_TOKEN
|
||||||
|
|
||||||
|
mlflow.set_experiment("my-experiment")
|
||||||
|
with mlflow.start_run():
|
||||||
|
mlflow.log_param("param1", 5)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Important Notes**
|
||||||
|
|
||||||
|
- Access keys have an expiration date (max 1 year)
|
||||||
|
- Store access keys securely (use environment variables or secret management)
|
||||||
|
- Never commit access keys to version control
|
||||||
|
- Each user should create their own access key
|
||||||
|
- Expired keys need to be regenerated via the Web UI
|
||||||
|
|
||||||
### Model Registry
|
### Model Registry
|
||||||
|
|
||||||
Register and manage models:
|
Register and manage models:
|
||||||
|
|||||||
@@ -334,10 +334,20 @@ install:
|
|||||||
echo "Generating Helm values with OIDC enabled..."
|
echo "Generating Helm values with OIDC enabled..."
|
||||||
gomplate -f values.gomplate.yaml -o values.yaml
|
gomplate -f values.gomplate.yaml -o values.yaml
|
||||||
|
|
||||||
|
echo "Creating Traefik Middleware..."
|
||||||
|
gomplate -f mlflow-middleware.gomplate.yaml -o mlflow-middleware.yaml
|
||||||
|
kubectl apply -f mlflow-middleware.yaml
|
||||||
|
|
||||||
echo "Installing MLflow Helm chart from Community Charts with OIDC..."
|
echo "Installing MLflow Helm chart from Community Charts with OIDC..."
|
||||||
helm upgrade --cleanup-on-fail --install mlflow community-charts/mlflow \
|
helm upgrade --cleanup-on-fail --install mlflow community-charts/mlflow \
|
||||||
--version ${MLFLOW_CHART_VERSION} -n ${MLFLOW_NAMESPACE} --wait --timeout=10m -f values.yaml
|
--version ${MLFLOW_CHART_VERSION} -n ${MLFLOW_NAMESPACE} --wait --timeout=10m -f values.yaml
|
||||||
|
|
||||||
|
if [ "${MONITORING_ENABLED}" = "true" ]; then
|
||||||
|
echo "Enabling Prometheus monitoring for namespace ${MLFLOW_NAMESPACE}..."
|
||||||
|
kubectl label namespace ${MLFLOW_NAMESPACE} buun.channel/enable-monitoring=true --overwrite
|
||||||
|
echo "✓ Monitoring enabled"
|
||||||
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== MLflow installed with OIDC authentication ==="
|
echo "=== MLflow installed with OIDC authentication ==="
|
||||||
echo "MLflow URL: https://${MLFLOW_HOST}"
|
echo "MLflow URL: https://${MLFLOW_HOST}"
|
||||||
@@ -372,6 +382,10 @@ upgrade:
|
|||||||
echo "Generating Helm values..."
|
echo "Generating Helm values..."
|
||||||
gomplate -f values.gomplate.yaml -o values.yaml
|
gomplate -f values.gomplate.yaml -o values.yaml
|
||||||
|
|
||||||
|
echo "Creating Traefik Middleware..."
|
||||||
|
gomplate -f mlflow-middleware.gomplate.yaml -o mlflow-middleware.yaml
|
||||||
|
kubectl apply -f mlflow-middleware.yaml
|
||||||
|
|
||||||
echo "Upgrading MLflow Helm chart from Community Charts..."
|
echo "Upgrading MLflow Helm chart from Community Charts..."
|
||||||
helm upgrade mlflow community-charts/mlflow \
|
helm upgrade mlflow community-charts/mlflow \
|
||||||
--version ${MLFLOW_CHART_VERSION} -n ${MLFLOW_NAMESPACE} --wait --timeout=10m -f values.yaml
|
--version ${MLFLOW_CHART_VERSION} -n ${MLFLOW_NAMESPACE} --wait --timeout=10m -f values.yaml
|
||||||
@@ -390,6 +404,7 @@ uninstall delete-db='true':
|
|||||||
kubectl delete secret mlflow-oidc-config -n ${MLFLOW_NAMESPACE} --ignore-not-found
|
kubectl delete secret mlflow-oidc-config -n ${MLFLOW_NAMESPACE} --ignore-not-found
|
||||||
kubectl delete externalsecret mlflow-oidc-external-secret -n ${MLFLOW_NAMESPACE} \
|
kubectl delete externalsecret mlflow-oidc-external-secret -n ${MLFLOW_NAMESPACE} \
|
||||||
--ignore-not-found
|
--ignore-not-found
|
||||||
|
kubectl delete middleware mlflow-headers -n ${MLFLOW_NAMESPACE} --ignore-not-found
|
||||||
just delete-namespace
|
just delete-namespace
|
||||||
if [ "{{ delete-db }}" = "true" ]; then
|
if [ "{{ delete-db }}" = "true" ]; then
|
||||||
just postgres::delete-db mlflow || true
|
just postgres::delete-db mlflow || true
|
||||||
|
|||||||
11
mlflow/mlflow-middleware.gomplate.yaml
Normal file
11
mlflow/mlflow-middleware.gomplate.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
apiVersion: traefik.io/v1alpha1
|
||||||
|
kind: Middleware
|
||||||
|
metadata:
|
||||||
|
name: mlflow-headers
|
||||||
|
namespace: {{ .Env.MLFLOW_NAMESPACE }}
|
||||||
|
spec:
|
||||||
|
headers:
|
||||||
|
customRequestHeaders:
|
||||||
|
X-Forwarded-Proto: "https"
|
||||||
|
X-Forwarded-Host: "{{ .Env.MLFLOW_HOST }}"
|
||||||
|
X-Forwarded-Port: "443"
|
||||||
@@ -60,8 +60,8 @@ log:
|
|||||||
# Use oidc-auth-fastapi for FastAPI/ASGI compatibility with Uvicorn
|
# Use oidc-auth-fastapi for FastAPI/ASGI compatibility with Uvicorn
|
||||||
extraArgs:
|
extraArgs:
|
||||||
appName: "oidc-auth-fastapi"
|
appName: "oidc-auth-fastapi"
|
||||||
# Allow connections from external hostname (with and without port)
|
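# Allow connections from the external hostname and the in-cluster service name. NOTE(review): the trailing "*" entry appears to accept any Host header, which would make the explicit entries redundant - confirm this is intended.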
|
||||||
allowedHosts: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443"
|
allowedHosts: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443,mlflow.{{ .Env.MLFLOW_NAMESPACE }}.svc.cluster.local,mlflow.{{ .Env.MLFLOW_NAMESPACE }}.svc.cluster.local:5000,*"
|
||||||
|
|
||||||
# Extra secrets for OIDC configuration
|
# Extra secrets for OIDC configuration
|
||||||
extraSecretNamesForEnvFrom:
|
extraSecretNamesForEnvFrom:
|
||||||
@@ -86,13 +86,19 @@ extraEnvVars:
|
|||||||
# Session configuration - use cachelib with filesystem backend
|
# Session configuration - use cachelib with filesystem backend
|
||||||
SESSION_TYPE: "cachelib"
|
SESSION_TYPE: "cachelib"
|
||||||
SESSION_CACHE_DIR: "/tmp/session"
|
SESSION_CACHE_DIR: "/tmp/session"
|
||||||
|
# Security configuration - allow same-origin CORS and configured host
|
||||||
|
MLFLOW_SERVER_CORS_ALLOWED_ORIGINS: "https://{{ .Env.MLFLOW_HOST }}"
|
||||||
|
MLFLOW_SERVER_ALLOWED_HOSTS: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443"
|
||||||
|
MLFLOW_SERVER_X_FRAME_OPTIONS: "SAMEORIGIN"
|
||||||
{{- else }}
|
{{- else }}
|
||||||
# Extra environment variables for S3/MinIO configuration
|
# Extra environment variables for S3/MinIO configuration (OIDC disabled)
|
||||||
extraEnvVars:
|
extraEnvVars:
|
||||||
MLFLOW_S3_ENDPOINT_URL: "http://minio.{{ .Env.MINIO_NAMESPACE }}.svc.cluster.local:9000"
|
MLFLOW_S3_ENDPOINT_URL: "http://minio.{{ .Env.MINIO_NAMESPACE }}.svc.cluster.local:9000"
|
||||||
MLFLOW_S3_IGNORE_TLS: "true"
|
MLFLOW_S3_IGNORE_TLS: "true"
|
||||||
# Disable security middleware when using Gunicorn (env var approach)
|
# Security configuration - allow same-origin CORS and configured host
|
||||||
MLFLOW_SERVER_DISABLE_SECURITY_MIDDLEWARE: "true"
|
MLFLOW_SERVER_CORS_ALLOWED_ORIGINS: "https://{{ .Env.MLFLOW_HOST }}"
|
||||||
|
MLFLOW_SERVER_ALLOWED_HOSTS: "{{ .Env.MLFLOW_HOST }},{{ .Env.MLFLOW_HOST }}:443"
|
||||||
|
MLFLOW_SERVER_X_FRAME_OPTIONS: "SAMEORIGIN"
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
# Service configuration
|
# Service configuration
|
||||||
@@ -106,6 +112,7 @@ ingress:
|
|||||||
className: "traefik"
|
className: "traefik"
|
||||||
annotations:
|
annotations:
|
||||||
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
||||||
|
traefik.ingress.kubernetes.io/router.middlewares: {{ .Env.MLFLOW_NAMESPACE }}-mlflow-headers@kubernetescrd
|
||||||
hosts:
|
hosts:
|
||||||
- host: {{ .Env.MLFLOW_HOST }}
|
- host: {{ .Env.MLFLOW_HOST }}
|
||||||
paths:
|
paths:
|
||||||
@@ -123,7 +130,7 @@ serviceMonitor:
|
|||||||
interval: 30s
|
interval: 30s
|
||||||
telemetryPath: /metrics
|
telemetryPath: /metrics
|
||||||
labels:
|
labels:
|
||||||
prometheus: kube-prometheus
|
release: kube-prometheus-stack
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
|
|
||||||
# Resource limits
|
# Resource limits
|
||||||
|
|||||||
Reference in New Issue
Block a user