diff --git a/CLAUDE.md b/CLAUDE.md index 588b0a3..5505240 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -98,12 +98,29 @@ kubectl --context -oidc get nodes # Test OIDC auth - **Templates**: `*.gomplate.yaml` files use environment variables from `.env.local` - **Custom Extensions**: `custom.just` can be created for additional workflows +### Resource Management + +All components should have appropriate resource requests and limits configured. See [docs/resource-management.md](docs/resource-management.md) for: + +- QoS class selection (Guaranteed vs Burstable) +- Using Goldilocks/VPA for recommendations +- Configuration guidelines and examples +- **Important**: Never set resources below Goldilocks recommendations; always round up to clean values + ### Gomplate Template Pattern **Environment Variable Management:** -- Justfile manages environment variables and their default values +- Justfile manages environment variables and their default values at the top using `export VAR := env("VAR", "default")` - Gomplate templates access variables using `{{ .Env.VAR }}` +- **IMPORTANT**: Variables exported at the top of justfile are automatically available to all recipes - do NOT use `export` again inside recipes + +**Conditional Rendering Rules:** + +- For boolean flags (enabled/disabled features), use simple truthiness check: `{{- if .Env.VAR }}` +- The justfile should set the variable to "true" (or any non-empty value) to enable, or empty string to disable +- **DO NOT use**: `{{- if eq (.Env.VAR | default "false") "true" }}` - this is redundant +- **CORRECT**: `{{- if .Env.VAR }}` - simple and clean **Example justfile pattern:** @@ -111,12 +128,15 @@ kubectl --context -oidc get nodes # Test OIDC auth # At the top of justfile - define variables with defaults export PROMETHEUS_NAMESPACE := env("PROMETHEUS_NAMESPACE", "monitoring") export GRAFANA_HOST := env("GRAFANA_HOST", "") +export MONITORING_ENABLED := env("MONITORING_ENABLED", "") -# In recipes - export variables for gomplate +# 
In recipes - use variables directly (already exported at top) install: #!/bin/bash set -euo pipefail - export GRAFANA_OIDC_ENABLED="${GRAFANA_OIDC_ENABLED:-false}" + if gum confirm "Enable monitoring?"; then + MONITORING_ENABLED="true" + fi gomplate -f values.gomplate.yaml -o values.yaml ``` @@ -128,8 +148,8 @@ namespace: {{ .Env.PROMETHEUS_NAMESPACE }} ingress: hosts: - {{ .Env.GRAFANA_HOST }} -{{- if eq .Env.GRAFANA_OIDC_ENABLED "true" }} - oidc: +{{- if .Env.MONITORING_ENABLED }} + monitoring: enabled: true {{- end }} ``` @@ -145,10 +165,10 @@ install: if [ -z "${MONITORING_ENABLED}" ]; then if gum confirm "Enable Prometheus monitoring?"; then MONITORING_ENABLED="true" + else + MONITORING_ENABLED="" fi fi - else - MONITORING_ENABLED="false" fi # ... helm install @@ -161,7 +181,7 @@ install: ServiceMonitor template (`servicemonitor.gomplate.yaml`): ```yaml -{{- if eq .Env.MONITORING_ENABLED "true" }} +{{- if .Env.MONITORING_ENABLED }} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: diff --git a/docs/resource-management.md b/docs/resource-management.md index e2dff28..a0b127f 100644 --- a/docs/resource-management.md +++ b/docs/resource-management.md @@ -190,13 +190,25 @@ Given that VPA already includes: - **Critical services**: Use VPA + 1.5-2x for extra safety margin, or use Guaranteed QoS - **New services**: Start with VPA + 1.5x, monitor, adjust after 1-2 weeks +**IMPORTANT:** Never configure resources **below** Goldilocks recommendations. Setting values lower than recommended will: +- Cause the Goldilocks dashboard to flag the workload as under-resourced +- Potentially lead to performance issues or OOMKilled events +- Defeat the purpose of using VPA-based recommendations + +When rounding values, always round **up** to the next clean value, not down. 
+ **Example:** Goldilocks recommendation: 50m CPU, 128Mi Memory -- Standard service: 50m CPU, 128Mi Memory (use as-is, rounded) +- Standard service: 50m CPU, 128Mi Memory (use as-is, rounded up if needed) - Critical service: 100m CPU, 256Mi Memory (2x for extra safety) +Goldilocks recommendation: 15m CPU, 105M Memory + +- Correct: 25m CPU, 128Mi Memory (rounded up to clean values) +- Incorrect: 10m CPU, 100Mi Memory (below recommendations, will be flagged) + #### For CRDs and Unsupported Workloads Use Grafana to check actual resource usage: