diff --git a/cert-manager/cert-manager-values.yaml b/cert-manager/cert-manager-values.yaml new file mode 100644 index 0000000..9d6e7d5 --- /dev/null +++ b/cert-manager/cert-manager-values.yaml @@ -0,0 +1,31 @@ +# cert-manager resource configuration +# Based on Goldilocks recommendations (Burstable QoS) + +# cert-manager controller +resources: + requests: + cpu: 15m + memory: 128Mi + limits: + cpu: 50m + memory: 192Mi + +# CA injector +cainjector: + resources: + requests: + cpu: 15m + memory: 192Mi + limits: + cpu: 50m + memory: 256Mi + +# Webhook +webhook: + resources: + requests: + cpu: 15m + memory: 128Mi + limits: + cpu: 50m + memory: 128Mi diff --git a/cert-manager/justfile b/cert-manager/justfile index 7e7d042..9531a02 100644 --- a/cert-manager/justfile +++ b/cert-manager/justfile @@ -26,7 +26,8 @@ install: echo "Installing cert-manager from OCI registry..." helm upgrade --cleanup-on-fail --install cert-manager \ oci://quay.io/jetstack/charts/cert-manager --version ${CERT_MANAGER_CHART_VERSION} \ - -n ${CERT_MANAGER_NAMESPACE} --set crds.enabled=true --wait --timeout=5m + -n ${CERT_MANAGER_NAMESPACE} --set crds.enabled=true --wait --timeout=5m \ + -f cert-manager-values.yaml echo "Waiting for cert-manager webhook to be ready..." kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=webhook \ @@ -51,7 +52,8 @@ upgrade: echo "Upgrading cert-manager from OCI registry..." helm upgrade cert-manager oci://quay.io/jetstack/charts/cert-manager \ --version ${CERT_MANAGER_CHART_VERSION} -n ${CERT_MANAGER_NAMESPACE} \ - --set crds.enabled=true --wait --timeout=5m + --set crds.enabled=true --wait --timeout=5m \ + -f cert-manager-values.yaml echo "cert-manager upgraded successfully" diff --git a/docs/resource-management.md b/docs/resource-management.md index 9eebf0a..e2dff28 100644 --- a/docs/resource-management.md +++ b/docs/resource-management.md @@ -140,26 +140,62 @@ For CRDs, use alternative methods: ### Working with Recommendations +#### Understanding Goldilocks Recommendations + +**How Goldilocks works:** + +- Goldilocks displays recommendations directly from **Vertical Pod Autoscaler (VPA)** resources +- VPA analyzes actual resource usage and calculates recommendations with **built-in headroom** +- Goldilocks shows VPA's `target` values for Guaranteed QoS and `lowerBound` values for Burstable QoS + +**Important**: VPA recommendations **already include significant headroom** (typically 5-15x for CPU, 2-3x for memory compared to observed usage). 
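+
+A quick sanity check is to read the recommendation straight from the VPA object that backs the dashboard entry (a minimal sketch; the VPA name assumes Goldilocks' usual `goldilocks-<workload-name>` convention, using external-secrets as the example):
+
+```bash
+# Print the raw VPA recommendation that Goldilocks displays
+# (the VPA name is an assumption based on Goldilocks' naming convention)
+kubectl get vpa goldilocks-external-secrets -n external-secrets \
+  -o jsonpath='{.status.recommendation.containerRecommendations}' | jq
+```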
+ +**How VPA calculates recommendations:** + +- **Percentile-based**: 90th percentile for target, 50th for lower bound, 95th for upper bound +- **Safety margin**: 15% added to base calculation (configurable via `--recommendation-margin-fraction`) +- **Confidence multiplier**: Additional buffer when historical data is limited (decreases as data accumulates) +- **Minimum thresholds**: CPU 25m, Memory 250Mi +- **Data collection**: 8-day rolling window with weight decay (newer samples weighted higher) + +Example from external-secrets: + +- Actual usage: CPU 1m, Memory 77Mi +- VPA target recommendation: CPU 15m, Memory 164M (displayed as Goldilocks "Guaranteed QoS") +- VPA lowerBound recommendation: CPU 15m, Memory 105M (displayed as Goldilocks "Burstable QoS" request) +- Built-in headroom: 15x CPU, 2x Memory (includes percentile + safety margin + confidence multiplier) + #### For Standard Workloads (Supported by Goldilocks) -Review Goldilocks recommendations in the dashboard, then configure resources based on your testing status: +Review Goldilocks recommendations in the dashboard, then configure resources: -**With load testing:** +**Recommendation: Use VPA values as-is in most cases** -- Use Goldilocks recommended values with minimal headroom (1.5-2x) -- Round to clean values (50m, 100m, 200m, 512Mi, 1Gi, etc.) +Given that VPA already includes: +- 90th percentile (covers 90% of usage patterns) +- 15% safety margin +- Confidence multiplier for recent workloads +- Minimum thresholds -**Without load testing:** +**Additional headroom is typically NOT needed unless:** -- Add more headroom to handle unexpected load (3-5x) -- Round to clean values +1. **Unpredictable workload**: Traffic patterns significantly vary or are not captured in 8-day window +2. **Critical services**: Data stores (PostgreSQL, Vault) where stability is paramount +3. **Insufficient history**: Newly deployed services with < 8 days of metrics +4. **Known growth**: Expecting significant traffic increase in near future + +**Recommended approach:** + +- **Standard services (operators, auxiliary)**: Use VPA recommendations as-is, round to clean values +- **Critical services**: Use VPA + 1.5-2x for extra safety margin, or use Guaranteed QoS +- **New services**: Start with VPA + 1.5x, monitor, adjust after 1-2 weeks **Example:** Goldilocks recommendation: 50m CPU, 128Mi Memory -- With load testing: 100m CPU, 256Mi Memory (2x, rounded) -- Without load testing: 200m CPU, 512Mi Memory (4x, rounded) +- Standard service: 50m CPU, 128Mi Memory (use as-is, rounded) +- Critical service: 100m CPU, 256Mi Memory (2x for extra safety) #### For CRDs and Unsupported Workloads @@ -167,24 +203,31 @@ Use Grafana to check actual resource usage: 1. **Navigate to Grafana dashboard**: `Kubernetes / Compute Resources / Pod` 2. **Select namespace and pod** -3. **Review usage over 24+ hours** to identify peak values +3. 
**Review usage over 7+ days** to identify peak values and usage patterns -Then apply the same approach: +**Apply headroom manually (since VPA is not available):** -**With load testing:** +Since you're working from raw metrics without VPA's automatic calculation, manually apply similar buffers: -- Use observed peak values with minimal headroom (1.5-2x) +- **Base calculation**: Use 90th percentile or observed peak values +- **Safety margin**: Add 15-20% +- **Confidence buffer**: Add 20-50% for services with < 1 week of data +- **Minimum thresholds**: CPU 25-50m, Memory 256Mi -**Without load testing:** +**Recommended multipliers:** -- Add significant headroom (3-5x) for safety +- **Standard services**: 2-3x observed peak (approximates VPA calculation) +- **Critical services**: 3-5x observed peak (extra safety for data stores) +- **New services**: 5x observed peak, re-evaluate after 1-2 weeks **Example:** -Grafana shows peak: 40m CPU, 207Mi Memory +Grafana shows peak: 40m CPU, 200Mi Memory over 7 days -- With load testing: 100m CPU, 512Mi Memory (2.5x/2.5x, rounded) -- Without load testing: 200m CPU, 1Gi Memory (5x/5x, rounded, Guaranteed QoS) +- Standard service: 100m CPU, 512Mi Memory (2.5x, rounded) +- Critical service: 200m CPU, 1Gi Memory (5x, rounded, Guaranteed QoS recommended) + +**Note**: For CRDs, you're working from raw usage data and must manually apply the same statistical buffers that VPA provides automatically. Larger multipliers compensate for lack of percentile analysis and safety margins. ## Configuring Resources diff --git a/external-secrets/external-secrets-values.yaml b/external-secrets/external-secrets-values.yaml new file mode 100644 index 0000000..0d74123 --- /dev/null +++ b/external-secrets/external-secrets-values.yaml @@ -0,0 +1,31 @@ +# External Secrets Operator resource configuration +# Based on Goldilocks recommendations (Burstable QoS) + +# Main controller +resources: + requests: + cpu: 15m + memory: 192Mi + limits: + cpu: 50m + memory: 256Mi + +# Cert controller +certController: + resources: + requests: + cpu: 15m + memory: 192Mi + limits: + cpu: 50m + memory: 256Mi + +# Webhook +webhook: + resources: + requests: + cpu: 15m + memory: 128Mi + limits: + cpu: 50m + memory: 256Mi diff --git a/external-secrets/justfile b/external-secrets/justfile index 6a240fe..80cfd8e 100644 --- a/external-secrets/justfile +++ b/external-secrets/justfile @@ -26,7 +26,8 @@ install: helm upgrade --cleanup-on-fail \ --install external-secrets external-secrets/external-secrets \ --version ${EXTERNAL_SECRETS_CHART_VERSION} -n ${EXTERNAL_SECRETS_NAMESPACE} \ - --create-namespace --wait + --create-namespace --wait \ + -f external-secrets-values.yaml just create-external-secrets-role just create-vault-secret-store diff --git a/keycloak/justfile b/keycloak/justfile index dccf5fe..5877de1 100644 --- a/keycloak/justfile +++ b/keycloak/justfile @@ -115,6 +115,13 @@ install-operator: kubectl apply -n ${KEYCLOAK_NAMESPACE} -f https://raw.githubusercontent.com/keycloak/keycloak-k8s-resources/${KEYCLOAK_OPERATOR_VERSION}/kubernetes/kubernetes.yml kubectl wait --for=condition=available deployment/keycloak-operator -n ${KEYCLOAK_NAMESPACE} --timeout=300s + echo "Applying resource configuration based on Goldilocks/VPA recommendations..." 
+ kubectl patch deployment keycloak-operator -n ${KEYCLOAK_NAMESPACE} --type='json' -p='[ + {"op": "replace", "path": "/spec/template/spec/containers/0/resources/requests/memory", "value": "704Mi"}, + {"op": "replace", "path": "/spec/template/spec/containers/0/resources/limits/memory", "value": "1Gi"} + ]' + kubectl wait --for=condition=available deployment/keycloak-operator -n ${KEYCLOAK_NAMESPACE} --timeout=300s + # Install Keycloak instance install: #!/bin/bash diff --git a/lakekeeper/lakekeeper-values.gomplate.yaml b/lakekeeper/lakekeeper-values.gomplate.yaml index f14d976..3378229 100644 --- a/lakekeeper/lakekeeper-values.gomplate.yaml +++ b/lakekeeper/lakekeeper-values.gomplate.yaml @@ -39,14 +39,14 @@ catalog: initialDelaySeconds: 5 periodSeconds: 5 - # Resource limits + # Resource limits (based on Goldilocks/VPA recommendations, rounded to clean values) resources: - limits: - cpu: 500m - memory: 512Mi requests: - cpu: 100m + cpu: 50m memory: 128Mi + limits: + cpu: 100m + memory: 256Mi # Database migration configuration diff --git a/langfuse/langfuse-values.gomplate.yaml b/langfuse/langfuse-values.gomplate.yaml index 9113677..67305ec 100644 --- a/langfuse/langfuse-values.gomplate.yaml +++ b/langfuse/langfuse-values.gomplate.yaml @@ -52,6 +52,40 @@ langfuse: tls: enabled: true + # Resource configuration based on Goldilocks/VPA recommendations + # CPU limits increased to handle startup spikes + web: + resources: + requests: + cpu: 15m + memory: 704Mi + limits: + cpu: 100m + memory: 1.5Gi + # Probe configuration adjusted for slow startup and response time + livenessProbe: + initialDelaySeconds: 60 + timeoutSeconds: 30 + failureThreshold: 5 + readinessProbe: + initialDelaySeconds: 60 + timeoutSeconds: 30 + failureThreshold: 5 + + worker: + resources: + requests: + cpu: 15m + memory: 512Mi + limits: + cpu: 100m + memory: 1Gi + # Probe configuration adjusted for slow startup + livenessProbe: + initialDelaySeconds: 60 + timeoutSeconds: 30 + failureThreshold: 5 + postgresql: deploy: false diff --git a/longhorn/README.md b/longhorn/README.md new file mode 100644 index 0000000..bcb9e20 --- /dev/null +++ b/longhorn/README.md @@ -0,0 +1,166 @@ +# Longhorn + +Longhorn is a lightweight, reliable, and powerful distributed block storage system for Kubernetes. + +## Table of Contents + +- [Installation](#installation) +- [Resource Configuration](#resource-configuration) +- [OAuth2-Proxy Integration](#oauth2-proxy-integration) +- [References](#references) + +## Installation + +### Prerequisites + +- Kubernetes cluster with sufficient resources +- Storage class support +- Open-iSCSI installed on nodes + +### Install Longhorn + +```bash +just longhorn::install +``` + +This command will: + +1. Add Longhorn Helm repository +2. Install Longhorn via Helm with custom values +3. Configure storage class with single replica +4. Apply resource limits to all Longhorn components (via `patch-resources` recipe) + +### Verify Installation + +```bash +# Check Longhorn pods +kubectl get pods -n longhorn + +# Check storage class +kubectl get storageclass +``` + +## Resource Configuration + +### Why We Use Kubernetes Patch Instead of Helm Values + +Longhorn Helm chart **does not support** configuring resource requests/limits for most components through `values.yaml`. 
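+
+A quick way to see this on a default install (before `patch-resources` has run) is to print the resources block of any Longhorn workload; this is only a spot check, not part of the install flow:
+
+```bash
+# On an unpatched install this typically prints an empty object {},
+# regardless of what was set under `resources` in values.yaml
+kubectl get deployment longhorn-ui -n longhorn \
+  -o jsonpath='{.spec.template.spec.containers[0].resources}'
+```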
+ +**Known Issues:** + +- The `resources: {}` field exists in `values.yaml` but is **not used** in chart templates +- GitHub Issue: [#1502 - Add resource requests/limits to default deployment/controller rollouts](https://github.com/longhorn/longhorn/issues/1502) +- Related Issues: + - [#3186 - Resources limits in chart values.yaml not work](https://github.com/longhorn/longhorn/issues/3186) + - [Discussion #4446 - Resources section in helm chart values file isn't used?](https://github.com/longhorn/longhorn/discussions/4446) + - [Discussion #8282 - How to adjust longhorn ui and other components minimum cpu and memory request with helm](https://github.com/longhorn/longhorn/discussions/8282) + +**Pull Request Status:** + +- PR [#10187 - Allow setting requests and limits for LonghornUI, LonghornDriver and LonghornManager](https://github.com/longhorn/longhorn/pull/10187) was opened in January 2025 but **closed without merging** in April 2025. + +### Our Approach: Post-Install Patching + +Since Helm values don't work, we apply resource configurations **after installation** using `kubectl patch`: + +```bash +just longhorn::patch-resources +``` + +This recipe is automatically called by `just longhorn::install`. + +### Resource Values + +All resource values are based on **Goldilocks/VPA recommendations** and rounded to clean values following [resource management best practices](../docs/resource-management.md). + +The `patch-resources` recipe configures resources for the following components: + +- **CSI Components** (csi-attacher, csi-provisioner, csi-resizer, csi-snapshotter): Guaranteed QoS for stable CSI operations +- **Engine Image DaemonSet** (engine-image-ei-*): Guaranteed QoS +- **CSI Plugin DaemonSet** (longhorn-csi-plugin): 3 containers, Guaranteed QoS for critical CSI plugin +- **Driver Deployer** (longhorn-driver-deployer): Guaranteed QoS +- **Longhorn Manager DaemonSet** (longhorn-manager): Core component with Burstable QoS to allow CPU bursts during intensive storage operations. Includes 2 containers: main manager and pre-pull-share-manager-image +- **Longhorn UI** (longhorn-ui): Guaranteed QoS + +For specific resource values, refer to the `patch-resources` recipe in [longhorn/justfile](justfile). + +### Manual Resource Updates + +If you need to update resource configurations: + +1. **Edit the justfile:** + + ```bash + vim longhorn/justfile + # Modify the patch-resources recipe + ``` + +2. **Apply changes:** + + ```bash + just longhorn::patch-resources + ``` + +3. **Verify:** + + ```bash + kubectl get deployment -n longhorn -o jsonpath='{.spec.template.spec.containers[0].resources}' | jq + ``` + +### Future: When Helm Support is Added + +If Longhorn adds Helm values support in future versions: + +1. Move resource configurations from `patch-resources` recipe to `longhorn-values.yaml` +2. Remove or deprecate the `patch-resources` recipe +3. Update this documentation + +Monitor these GitHub issues for updates: + +- [#1502](https://github.com/longhorn/longhorn/issues/1502) +- [Discussion #8282](https://github.com/longhorn/longhorn/discussions/8282) + +## OAuth2-Proxy Integration + +Longhorn UI can be protected with OAuth2-Proxy for Keycloak authentication. + +### Setup OAuth2-Proxy + +```bash +just longhorn::oauth2-proxy-install +``` + +This will: + +1. Prompt for Longhorn hostname (FQDN) +2. Create Keycloak client +3. Deploy OAuth2-Proxy with IngressRoute +4. 
Apply resource limits to OAuth2-Proxy based on VPA recommendations + +**Resource Configuration:** + +OAuth2-Proxy resources are configured in the gomplate template ([oauth2-proxy/oauth2-proxy-deployment.gomplate.yaml](../oauth2-proxy/oauth2-proxy-deployment.gomplate.yaml)) with Guaranteed QoS based on Goldilocks/VPA recommendations. + +### Access Longhorn UI + +After setup, access the Longhorn UI at: + +```text +https:// +``` + +You'll be redirected to Keycloak for authentication. + +### Remove OAuth2-Proxy + +```bash +just longhorn::oauth2-proxy-uninstall +``` + +## References + +- [Longhorn Documentation](https://longhorn.io/docs/) +- [Longhorn GitHub Repository](https://github.com/longhorn/longhorn) +- [Longhorn Helm Chart](https://github.com/longhorn/charts) +- [Resource Management Best Practices](../docs/resource-management.md) +- [GitHub Issue #1502 - Resource requests/limits support](https://github.com/longhorn/longhorn/issues/1502) diff --git a/longhorn/justfile b/longhorn/justfile index ad26fd8..71b3fa2 100644 --- a/longhorn/justfile +++ b/longhorn/justfile @@ -49,6 +49,7 @@ install: --version ${LONGHORN_VERSION} -n ${LONGHORN_NAMESPACE} --create-namespace --wait \ -f longhorn-values.yaml just set-replicas 1 + just patch-resources # Uninstall Longhorn uninstall: @@ -82,6 +83,149 @@ set-replicas num='1': EOF )" +# Patch resources for Longhorn components based on Goldilocks/VPA recommendations +patch-resources: + #!/bin/bash + set -euo pipefail + echo "Patching Longhorn component resources based on Goldilocks/VPA recommendations..." + + # Patch csi-attacher deployment + kubectl patch deployment csi-attacher -n ${LONGHORN_NAMESPACE} --type='json' -p='[ + { + "op": "replace", + "path": "/spec/template/spec/containers/0/resources", + "value": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"cpu": "50m", "memory": "128Mi"} + } + } + ]' + + # Patch csi-provisioner deployment + kubectl patch deployment csi-provisioner -n ${LONGHORN_NAMESPACE} --type='json' -p='[ + { + "op": "replace", + "path": "/spec/template/spec/containers/0/resources", + "value": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"cpu": "50m", "memory": "128Mi"} + } + } + ]' + + # Patch csi-resizer deployment + kubectl patch deployment csi-resizer -n ${LONGHORN_NAMESPACE} --type='json' -p='[ + { + "op": "replace", + "path": "/spec/template/spec/containers/0/resources", + "value": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"cpu": "50m", "memory": "128Mi"} + } + } + ]' + + # Patch csi-snapshotter deployment + kubectl patch deployment csi-snapshotter -n ${LONGHORN_NAMESPACE} --type='json' -p='[ + { + "op": "replace", + "path": "/spec/template/spec/containers/0/resources", + "value": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"cpu": "50m", "memory": "128Mi"} + } + } + ]' + + # Find and patch engine-image daemonset (name includes hash) + ENGINE_IMAGE_DS=$(kubectl get daemonset -n ${LONGHORN_NAMESPACE} -o name | grep engine-image) + if [ -n "${ENGINE_IMAGE_DS}" ]; then + kubectl patch ${ENGINE_IMAGE_DS} -n ${LONGHORN_NAMESPACE} --type='json' -p='[ + { + "op": "replace", + "path": "/spec/template/spec/containers/0/resources", + "value": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"cpu": "50m", "memory": "128Mi"} + } + } + ]' + fi + + # Patch longhorn-csi-plugin daemonset (3 containers) + kubectl patch daemonset longhorn-csi-plugin -n ${LONGHORN_NAMESPACE} --type='json' -p='[ + { + "op": "replace", + "path": 
"/spec/template/spec/containers/0/resources", + "value": { + "requests": {"cpu": "50m", "memory": "64Mi"}, + "limits": {"cpu": "50m", "memory": "64Mi"} + } + }, + { + "op": "replace", + "path": "/spec/template/spec/containers/1/resources", + "value": { + "requests": {"cpu": "50m", "memory": "64Mi"}, + "limits": {"cpu": "50m", "memory": "64Mi"} + } + }, + { + "op": "replace", + "path": "/spec/template/spec/containers/2/resources", + "value": { + "requests": {"cpu": "50m", "memory": "64Mi"}, + "limits": {"cpu": "50m", "memory": "64Mi"} + } + } + ]' + + # Patch longhorn-driver-deployer deployment + kubectl patch deployment longhorn-driver-deployer -n ${LONGHORN_NAMESPACE} --type='json' -p='[ + { + "op": "replace", + "path": "/spec/template/spec/containers/0/resources", + "value": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"cpu": "50m", "memory": "128Mi"} + } + } + ]' + + # Patch longhorn-manager daemonset (2 containers - core component, add extra headroom) + kubectl patch daemonset longhorn-manager -n ${LONGHORN_NAMESPACE} --type='json' -p='[ + { + "op": "replace", + "path": "/spec/template/spec/containers/0/resources", + "value": { + "requests": {"cpu": "50m", "memory": "512Mi"}, + "limits": {"cpu": "100m", "memory": "512Mi"} + } + }, + { + "op": "replace", + "path": "/spec/template/spec/containers/1/resources", + "value": { + "requests": {"cpu": "50m", "memory": "64Mi"}, + "limits": {"cpu": "50m", "memory": "64Mi"} + } + } + ]' + + # Patch longhorn-ui deployment + kubectl patch deployment longhorn-ui -n ${LONGHORN_NAMESPACE} --type='json' -p='[ + { + "op": "replace", + "path": "/spec/template/spec/containers/0/resources", + "value": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"cpu": "50m", "memory": "128Mi"} + } + } + ]' + + echo "All Longhorn component resources have been patched successfully!" 
+ # Setup OAuth2-Proxy for Longhorn oauth2-proxy-install: #!/bin/bash diff --git a/minio/minio-values.gomplate.yaml b/minio/minio-values.gomplate.yaml index ca71106..c2cb778 100644 --- a/minio/minio-values.gomplate.yaml +++ b/minio/minio-values.gomplate.yaml @@ -40,3 +40,12 @@ consoleIngress: tls: - hosts: - {{ .Env.MINIO_CONSOLE_HOST }} + +# Resource configuration based on Goldilocks/VPA recommendations (rounded to clean values) +resources: + requests: + cpu: 50m + memory: 512Mi + limits: + cpu: 100m + memory: 1Gi diff --git a/oauth2-proxy/oauth2-proxy-deployment.gomplate.yaml b/oauth2-proxy/oauth2-proxy-deployment.gomplate.yaml index c808215..dbca9eb 100644 --- a/oauth2-proxy/oauth2-proxy-deployment.gomplate.yaml +++ b/oauth2-proxy/oauth2-proxy-deployment.gomplate.yaml @@ -43,6 +43,13 @@ spec: args: - --config=/etc/oauth2-proxy/config.cfg - --upstream=http://{{ .Env.UPSTREAM_SERVICE }} + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 50m + memory: 128Mi env: - name: OAUTH2_PROXY_CLIENT_ID valueFrom: diff --git a/prometheus/values.gomplate.yaml b/prometheus/values.gomplate.yaml index a76f04e..e46cb22 100644 --- a/prometheus/values.gomplate.yaml +++ b/prometheus/values.gomplate.yaml @@ -1,5 +1,7 @@ --- -# Grafana Configuration +# kube-prometheus-stack Helm chart values +# https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml + grafana: enabled: true @@ -44,6 +46,24 @@ grafana: enabled: true size: 10Gi + # Resource configuration based on Goldilocks/VPA recommendations + resources: + requests: + cpu: 15m + memory: 480Mi + limits: + cpu: 50m + memory: 704Mi + + sidecar: + resources: + requests: + cpu: 15m + memory: 128Mi + limits: + cpu: 25m + memory: 192Mi + # Prometheus Configuration prometheus: prometheusSpec: @@ -67,6 +87,15 @@ prometheus: matchLabels: buun.channel/enable-monitoring: "true" + # Resource configuration based on observed usage patterns + resources: + requests: + cpu: 100m + memory: 1.2Gi + limits: + cpu: 500m + memory: 2.5Gi + {{- if .Env.PROMETHEUS_HOST }} ingress: enabled: true @@ -92,6 +121,15 @@ alertmanager: requests: storage: 10Gi + # Resource configuration based on observed usage patterns + resources: + requests: + cpu: 15m + memory: 64Mi + limits: + cpu: 50m + memory: 128Mi + {{- if .Env.ALERTMANAGER_HOST }} ingress: enabled: true @@ -130,5 +168,38 @@ kubeProxy: kubeStateMetrics: enabled: true +# kube-state-metrics subchart configuration +kube-state-metrics: + # Resource configuration based on Goldilocks/VPA recommendations + resources: + requests: + cpu: 15m + memory: 128Mi + limits: + cpu: 50m + memory: 256Mi + nodeExporter: enabled: true + +# prometheus-node-exporter subchart configuration +prometheus-node-exporter: + # Resource configuration based on Goldilocks/VPA recommendations + resources: + requests: + cpu: 15m + memory: 128Mi + limits: + cpu: 50m + memory: 256Mi + +# Prometheus Operator Configuration +# Resource configuration based on Goldilocks/VPA recommendations +prometheusOperator: + resources: + requests: + cpu: 15m + memory: 110Mi + limits: + cpu: 50m + memory: 192Mi