fix(prometheus): fix Grafana auth and set pod security standards

This commit is contained in:
Masaki Yatsu
2025-11-23 15:02:04 +09:00
parent fa8e2bd8c7
commit 44ba48ee2f
4 changed files with 204 additions and 4 deletions

View File

@@ -72,6 +72,57 @@ kubectl port-forward -n monitoring svc/kube-prometheus-stack-alertmanager 9093:9
Then access at `http://localhost:9093`
## Pod Security Standards
The monitoring namespace enforces the **privileged** Pod Security Standard through the following namespace label:
```bash
pod-security.kubernetes.io/enforce=privileged
```
#### Why Privileged Instead of Baseline or Restricted?
The `prometheus-node-exporter` component requires the following privileged access to collect hardware and OS-level metrics:
- `hostNetwork: true` - Access to host network namespace
- `hostPID: true` - Access to host process IDs
- `hostPath` volumes - Access to host filesystem paths (`/`, `/sys`, `/proc`)
- `hostPort: 9100` - Expose metrics on host port
These requirements are incompatible with both `baseline` and `restricted` Pod Security Standards:
- **baseline** prohibits: `hostNetwork`, `hostPID`, `hostPath`, `hostPort`
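- **restricted** keeps every baseline prohibition and adds further requirements (non-root user, all capabilities dropped, seccomp profile, no privilege escalation)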
While these settings may seem permissive, they are necessary for node-exporter to collect system-level metrics from the host.
#### Security Measures
Although the namespace itself enforces only the privileged level, every component other than node-exporter is configured with a restricted-level security context:
- **Grafana**: Non-root user (472), dropped capabilities, seccomp profile
- **Prometheus**: Non-root user (1000), read-only root filesystem, dropped capabilities
- **Alertmanager**: Non-root user (1000), read-only root filesystem, dropped capabilities
- **Prometheus Operator**: Non-root user (65534), read-only root filesystem, dropped capabilities
- **kube-state-metrics**: Non-root user (65534), read-only root filesystem, dropped capabilities
#### Alternative: Restricted Mode Without Node Metrics
To use `restricted` Pod Security Standard, disable node-exporter:
1. Add to `values.gomplate.yaml`:
```yaml
nodeExporter:
  enabled: false
```
2. Update the `install` recipe in the justfile so that it labels the namespace with `restricted` instead of `privileged`:
```bash
kubectl label namespace ${PROMETHEUS_NAMESPACE} \
pod-security.kubernetes.io/enforce=restricted --overwrite
```
**Trade-off**: You will lose node-level metrics (CPU, memory, disk, network at the host level), though pod-level metrics remain available.
## Configuration
Environment variables (set in `.env.local` or override):

View File

@@ -0,0 +1,21 @@
---
# ExternalSecret that syncs the Grafana OIDC client secret from the Vault-backed
# ClusterSecretStore into the Kubernetes Secret "grafana-oidc-credentials".
# This file is a gomplate template: render it (PROMETHEUS_NAMESPACE must be set)
# before piping the result to kubectl.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: grafana-oidc-credentials
  # Quoted so the rendered namespace is always parsed as a string, even when
  # the value looks like a number or a boolean.
  namespace: "{{ .Env.PROMETHEUS_NAMESPACE }}"
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: vault-secret-store
    kind: ClusterSecretStore
  target:
    name: grafana-oidc-credentials
    creationPolicy: Owner
    template:
      data:
        # The backtick-quoted inner expression is emitted verbatim by gomplate,
        # leaving a template for External Secrets to fill in at sync time.
        # The resulting Secret key is "client-secret".
        client-secret: "{{ `{{ .client_secret }}` }}"
  data:
    # Reads property "client_secret" from the remote key "grafana/oidc" and
    # exposes it to the target template as "client_secret".
    - secretKey: client_secret
      remoteRef:
        key: grafana/oidc
        property: client_secret

View File

@@ -101,6 +101,14 @@ install: check-env
#!/bin/bash
set -euo pipefail
just create-namespace
# Using 'privileged' because prometheus-node-exporter requires:
# - hostNetwork, hostPID (not allowed in baseline/restricted)
# - hostPath volumes (not allowed in baseline/restricted)
# - hostPort (not allowed in baseline/restricted)
kubectl label namespace ${PROMETHEUS_NAMESPACE} \
pod-security.kubernetes.io/enforce=privileged --overwrite
just add-helm-repo
# Create credentials if not exists
@@ -182,6 +190,31 @@ setup-oidc:
# Create admin group if it doesn't exist
just keycloak::create-group "grafana-admins" "" "Grafana administrators group" || true
# Store OIDC client secret in Vault and create ExternalSecret
if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
echo "External Secrets Operator detected. Creating ExternalSecret..."
just vault::put grafana/oidc client_secret="${oidc_client_secret}"
kubectl delete secret grafana-oidc-credentials -n ${PROMETHEUS_NAMESPACE} --ignore-not-found
kubectl delete externalsecret grafana-oidc-credentials -n ${PROMETHEUS_NAMESPACE} --ignore-not-found
gomplate -f grafana-oidc-external-secret.gomplate.yaml | kubectl apply -f -
echo "Waiting for ExternalSecret to sync..."
kubectl wait --for=condition=Ready externalsecret/grafana-oidc-credentials \
-n ${PROMETHEUS_NAMESPACE} --timeout=60s
else
echo "External Secrets Operator not found. Creating secret directly..."
kubectl delete secret grafana-oidc-credentials -n ${PROMETHEUS_NAMESPACE} --ignore-not-found
kubectl create secret generic grafana-oidc-credentials -n ${PROMETHEUS_NAMESPACE} \
--from-literal=client-secret="${oidc_client_secret}"
if helm status vault -n ${K8S_VAULT_NAMESPACE} &>/dev/null; then
just vault::put grafana/oidc client_secret="${oidc_client_secret}"
fi
fi
# Update Helm values with OIDC configuration
export GRAFANA_OIDC_CLIENT_SECRET="${oidc_client_secret}"
export GRAFANA_OIDC_ENABLED="true"
@@ -210,8 +243,12 @@ disable-oidc:
set -euo pipefail
echo "Disabling Keycloak OIDC authentication for Grafana..."
# Clean up OIDC secrets
kubectl delete secret grafana-oidc-credentials -n ${PROMETHEUS_NAMESPACE} --ignore-not-found
kubectl delete externalsecret grafana-oidc-credentials -n ${PROMETHEUS_NAMESPACE} --ignore-not-found
# Update Helm values to disable OIDC
export GRAFANA_OIDC_ENABLED="false"
export GRAFANA_OIDC_ENABLED=""
export GRAFANA_OIDC_CLIENT_SECRET=""
gomplate -f values.gomplate.yaml -o values.yaml

View File

@@ -5,11 +5,40 @@
grafana:
enabled: true
securityContext:
runAsNonRoot: true
runAsUser: 472
runAsGroup: 472
fsGroup: 472
seccompProfile:
type: RuntimeDefault
containerSecurityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: false
runAsNonRoot: true
runAsUser: 472
runAsGroup: 472
seccompProfile:
type: RuntimeDefault
admin:
existingSecret: grafana-admin-credentials
userKey: admin-user
passwordKey: admin-password
{{- if .Env.GRAFANA_OIDC_ENABLED }}
# Reference OIDC client secret from Kubernetes Secret
envValueFrom:
GRAFANA_OIDC_CLIENT_SECRET:
secretKeyRef:
name: grafana-oidc-credentials
key: client-secret
{{- end }}
ingress:
enabled: true
ingressClassName: traefik
@@ -25,14 +54,14 @@ grafana:
grafana.ini:
server:
root_url: https://{{ .Env.GRAFANA_HOST }}
{{- if eq (.Env.GRAFANA_OIDC_ENABLED | default "false") "true" }}
{{- if .Env.GRAFANA_OIDC_ENABLED }}
auth.generic_oauth:
enabled: true
name: Keycloak
allow_sign_up: true
client_id: grafana
client_secret: {{ .Env.GRAFANA_OIDC_CLIENT_SECRET }}
scopes: openid profile email groups
client_secret: $__env{GRAFANA_OIDC_CLIENT_SECRET}
scopes: openid profile email
auth_url: https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/auth
token_url: https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/token
api_url: https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/protocol/openid-connect/userinfo
@@ -67,6 +96,22 @@ grafana:
# Prometheus Configuration
prometheus:
prometheusSpec:
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 2000
fsGroup: 2000
seccompProfile:
type: RuntimeDefault
containers:
- name: prometheus
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
# Retention settings
retention: 30d
retentionSize: "50GB"
@@ -112,6 +157,22 @@ prometheus:
# Alertmanager Configuration
alertmanager:
alertmanagerSpec:
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 2000
fsGroup: 2000
seccompProfile:
type: RuntimeDefault
containers:
- name: alertmanager
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
# Storage
storage:
volumeClaimTemplate:
@@ -170,6 +231,21 @@ kubeStateMetrics:
# kube-state-metrics subchart configuration
kube-state-metrics:
securityContext:
runAsNonRoot: true
runAsUser: 65534
runAsGroup: 65534
fsGroup: 65534
seccompProfile:
type: RuntimeDefault
containerSecurityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
# Resource configuration based on Goldilocks/VPA recommendations
resources:
requests:
@@ -196,6 +272,21 @@ prometheus-node-exporter:
# Prometheus Operator Configuration
# Resource configuration based on Goldilocks/VPA recommendations
prometheusOperator:
securityContext:
runAsNonRoot: true
runAsUser: 65534
runAsGroup: 65534
fsGroup: 65534
seccompProfile:
type: RuntimeDefault
containerSecurityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
resources:
requests:
cpu: 15m