From 0032b0c4b4b904882eda7a9aa940d2b65f5ea190 Mon Sep 17 00:00:00 2001
From: Masaki Yatsu
Date: Tue, 25 Nov 2025 11:16:10 +0900
Subject: [PATCH] fix(longhorn): fix longhorn error on shutting down

---
 k8s/justfile                  | 117 ++++++++++++++++++++++++++++++++--
 longhorn/longhorn-values.yaml |   4 ++
 2 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/k8s/justfile b/k8s/justfile
index c88e58c..191691a 100644
--- a/k8s/justfile
+++ b/k8s/justfile
@@ -121,27 +121,136 @@ uninstall:
         exit 1
     fi
 
-# Stop k3s cluster (with CSI cleanup)
+# Stop k3s cluster gracefully (with volume detachment and CSI cleanup)
 stop:
     #!/bin/bash
    set -euo pipefail
-    echo "Cleaning up CSI mounts before stopping k3s..."
+
+    echo "Starting graceful k3s shutdown..."
+
+    # Check if Longhorn is installed
+    if helm status longhorn -n longhorn-system &>/dev/null; then
+        echo "Detected Longhorn installation. Scaling down workloads..."
+
+        # Scale down workloads per namespace ('kubectl scale' has no -A flag); keep longhorn-system and kube-system running so Longhorn CSI can still detach volumes
+        echo "Scaling down Deployments..."
+        kubectl get deployments -A --no-headers 2>/dev/null | awk '$1 != "longhorn-system" && $1 != "kube-system" {print $1, $2}' | while read -r ns name; do kubectl scale deployment -n "$ns" "$name" --replicas=0 --timeout=60s 2>/dev/null || true; done || true
+        echo "Scaling down StatefulSets..."
+        kubectl get statefulsets -A --no-headers 2>/dev/null | awk '$1 != "longhorn-system" && $1 != "kube-system" {print $1, $2}' | while read -r ns name; do kubectl scale statefulset -n "$ns" "$name" --replicas=0 --timeout=60s 2>/dev/null || true; done || true
+
+        echo "Waiting for Longhorn volumes to be detached..."
+
+        # Wait for all volumes to be detached (max 60 seconds)
+        TIMEOUT=60
+        ELAPSED=0
+        while [ $ELAPSED -lt $TIMEOUT ]; do
+            # Check if any volume is still attached
+            ATTACHED=$(kubectl get volumes.longhorn.io -n longhorn-system -o json 2>/dev/null | \
+                jq -r '.items[] | select(.status.state == "attached") | .metadata.name' 2>/dev/null || true)
+
+            if [ -z "$ATTACHED" ]; then
+                echo "✓ All Longhorn volumes detached successfully"
+                break
+            fi
+
+            ATTACHED_COUNT=$(echo "$ATTACHED" | wc -l)
+            echo "  Still waiting for $ATTACHED_COUNT volume(s) to detach..."
+ sleep 2 + ELAPSED=$((ELAPSED + 2)) + done + + if [ $ELAPSED -ge $TIMEOUT ]; then + echo "⚠ Warning: Timeout waiting for volumes to detach" + echo " Remaining attached volumes:" + echo "$ATTACHED" | sed 's/^/ /' + fi + else + echo "Longhorn not detected, skipping volume detachment wait." + fi + + echo "Cleaning up CSI mounts..." ssh "${LOCAL_K8S_HOST}" 'bash -s' << 'EOF' set +e sudo pkill -9 umount 2>/dev/null shopt -s nullglob for mount in /var/lib/kubelet/plugins/kubernetes.io/csi/*/globalmount; do if [ -d "$mount" ]; then - echo "Unmounting $mount..." + echo " Unmounting $mount..." sudo umount -f "$mount" 2>/dev/null sudo umount -l "$mount" 2>/dev/null fi done exit 0 EOF + echo "Stopping k3s service..." ssh "${LOCAL_K8S_HOST}" "sudo systemctl stop k3s" - echo "k3s stopped on ${LOCAL_K8S_HOST}." + + # Wait for k3s to fully stop + echo "Waiting for k3s to fully stop..." + STOP_TIMEOUT=30 + STOP_ELAPSED=0 + while [ $STOP_ELAPSED -lt $STOP_TIMEOUT ]; do + if ! ssh "${LOCAL_K8S_HOST}" "sudo systemctl is-active --quiet k3s"; then + echo "✓ k3s stopped successfully" + break + fi + sleep 1 + STOP_ELAPSED=$((STOP_ELAPSED + 1)) + done + + if [ $STOP_ELAPSED -ge $STOP_TIMEOUT ]; then + echo "⚠ Warning: k3s did not stop within timeout" + fi + + # Cleanup all kubelet mounts after k3s stops + echo "Cleaning up all kubelet mounts..." 
+    ssh "${LOCAL_K8S_HOST}" 'bash -s' << 'EOF'
+    set -euo pipefail
+
+    # Function to unmount a path safely
+    unmount_path() {
+        local path="$1"
+        if mountpoint -q "$path" 2>/dev/null; then
+            echo "  Unmounting: $path"
+            sudo umount "$path" 2>/dev/null || sudo umount -f "$path" 2>/dev/null || sudo umount -l "$path" 2>/dev/null || true
+        fi
+    }
+
+    # Unmount all Longhorn CSI mounts first (most specific paths first)
+    if [ -d /var/lib/kubelet/pods ]; then
+        # Find and unmount all volume mounts in pods
+        find /var/lib/kubelet/pods -type d -name "mount" 2>/dev/null | sort -r | while read -r mount; do
+            unmount_path "$mount"
+        done
+    fi
+
+    # Unmount CSI globalmounts
+    if [ -d /var/lib/kubelet/plugins/kubernetes.io/csi ]; then
+        find /var/lib/kubelet/plugins/kubernetes.io/csi -type d -name "globalmount" 2>/dev/null | sort -r | while read -r mount; do
+            unmount_path "$mount"
+        done
+    fi
+
+    # Unmount any remaining kubelet plugin mounts
+    if [ -d /var/lib/kubelet/plugins ]; then
+        find /var/lib/kubelet/plugins -type d 2>/dev/null | sort -r | while read -r mount; do
+            if mountpoint -q "$mount" 2>/dev/null; then
+                unmount_path "$mount"
+            fi
+        done
+    fi
+
+    # Final check: unmount anything still under /var/lib/kubelet ('|| true' keeps an empty grep from tripping pipefail)
+    mount | { grep '/var/lib/kubelet' || true; } | awk '{print $3}' | sort -r | while read -r mount; do
+        unmount_path "$mount"
+    done
+
+    echo "✓ All kubelet mounts cleaned up"
+    EOF
+
+    echo ""
+    echo "✓ k3s stopped gracefully on ${LOCAL_K8S_HOST}."
     echo "You can now safely shutdown the machine."
# Start k3s cluster diff --git a/longhorn/longhorn-values.yaml b/longhorn/longhorn-values.yaml index 1fecf19..648b93d 100644 --- a/longhorn/longhorn-values.yaml +++ b/longhorn/longhorn-values.yaml @@ -4,3 +4,7 @@ ingress: defaultSettings: deletingConfirmationFlag: true defaultReplicaCount: 1 + # Automatically delete orphaned data to keep storage clean + orphanAutoDeletion: true + # Force pod deletion when node goes down to trigger volume detachment + nodeDownPodDeletionPolicy: delete-both-statefulset-and-deployment-pod