fix(longhorn): fix Longhorn errors when shutting down
This commit is contained in:
117
k8s/justfile
117
k8s/justfile
@@ -121,27 +121,136 @@ uninstall:
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Stop k3s cluster gracefully (with volume detachment and CSI cleanup)
stop:
    #!/bin/bash
    set -euo pipefail

    echo "Starting graceful k3s shutdown..."

    # Check if Longhorn is installed; if so, detach its volumes before stopping
    # k3s, otherwise the node shuts down with volumes still attached.
    if helm status longhorn -n longhorn-system &>/dev/null; then
        echo "Detected Longhorn installation. Scaling down workloads..."

        # Scale down all deployments and statefulsets so pods release their
        # PVCs. Best-effort: failures here must not abort the shutdown.
        echo "Scaling down Deployments..."
        kubectl scale deployment --all --replicas=0 -A --timeout=60s 2>/dev/null || true
        echo "Scaling down StatefulSets..."
        kubectl scale statefulset --all --replicas=0 -A --timeout=60s 2>/dev/null || true

        echo "Waiting for Longhorn volumes to be detached..."

        # Wait for all volumes to be detached (max 60 seconds, polling every 2s)
        TIMEOUT=60
        ELAPSED=0
        ATTACHED=""
        while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
            # List any volume whose CR still reports state "attached".
            # '|| true' keeps a transient kubectl/jq failure from aborting
            # the script under 'set -e -o pipefail'.
            ATTACHED=$(kubectl get volumes.longhorn.io -n longhorn-system -o json 2>/dev/null | \
                jq -r '.items[] | select(.status.state == "attached") | .metadata.name' 2>/dev/null || true)

            if [ -z "$ATTACHED" ]; then
                echo "✓ All Longhorn volumes detached successfully"
                break
            fi

            ATTACHED_COUNT=$(echo "$ATTACHED" | wc -l)
            echo "  Still waiting for $ATTACHED_COUNT volume(s) to detach..."
            sleep 2
            ELAPSED=$((ELAPSED + 2))
        done

        # Timed out: warn but continue — the forced unmount below is the fallback.
        if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
            echo "⚠ Warning: Timeout waiting for volumes to detach"
            echo "  Remaining attached volumes:"
            echo "$ATTACHED" | sed 's/^/    /'
        fi
    else
        echo "Longhorn not detected, skipping volume detachment wait."
    fi

    # First cleanup pass while k3s is still running: force-unmount CSI
    # globalmounts so the kubelet does not block on them during shutdown.
    echo "Cleaning up CSI mounts..."
    ssh "${LOCAL_K8S_HOST}" 'bash -s' << 'EOF'
    # Best-effort remote cleanup: individual unmount failures are expected
    # and must not fail the session.
    set +e
    # Kill any umount already wedged on a dead CSI backend
    sudo pkill -9 umount 2>/dev/null
    shopt -s nullglob
    for mount in /var/lib/kubelet/plugins/kubernetes.io/csi/*/globalmount; do
        if [ -d "$mount" ]; then
            echo "  Unmounting $mount..."
            sudo umount -f "$mount" 2>/dev/null
            # Lazy unmount as fallback for busy mounts
            sudo umount -l "$mount" 2>/dev/null
        fi
    done
    exit 0
    EOF

    echo "Stopping k3s service..."
    ssh "${LOCAL_K8S_HOST}" "sudo systemctl stop k3s"

    # Wait for k3s to fully stop before touching the remaining mounts
    echo "Waiting for k3s to fully stop..."
    STOP_TIMEOUT=30
    STOP_ELAPSED=0
    while [ "$STOP_ELAPSED" -lt "$STOP_TIMEOUT" ]; do
        # NOTE(review): an ssh connection failure is indistinguishable from
        # "service inactive" here — both make the condition true. Acceptable
        # for a shutdown path, but worth confirming.
        if ! ssh "${LOCAL_K8S_HOST}" "sudo systemctl is-active --quiet k3s"; then
            echo "✓ k3s stopped successfully"
            break
        fi
        sleep 1
        STOP_ELAPSED=$((STOP_ELAPSED + 1))
    done

    if [ "$STOP_ELAPSED" -ge "$STOP_TIMEOUT" ]; then
        echo "⚠ Warning: k3s did not stop within timeout"
    fi

    # Second cleanup pass after k3s has stopped: unmount everything left
    # under /var/lib/kubelet so the host can shut down cleanly.
    echo "Cleaning up all kubelet mounts..."
    ssh "${LOCAL_K8S_HOST}" 'bash -s' << 'EOF'
    set -euo pipefail

    # Unmount a path safely: try clean, then forced, then lazy unmount.
    unmount_path() {
        local path="$1"
        if mountpoint -q "$path" 2>/dev/null; then
            echo "  Unmounting: $path"
            sudo umount "$path" 2>/dev/null || sudo umount -f "$path" 2>/dev/null || sudo umount -l "$path" 2>/dev/null || true
        fi
    }

    # Unmount all pod volume mounts first ('sort -r' => deepest paths first)
    if [ -d /var/lib/kubelet/pods ]; then
        find /var/lib/kubelet/pods -type d -name "mount" 2>/dev/null | sort -r | while read -r mount; do
            unmount_path "$mount"
        done
    fi

    # Unmount CSI globalmounts
    if [ -d /var/lib/kubelet/plugins/kubernetes.io/csi ]; then
        find /var/lib/kubelet/plugins/kubernetes.io/csi -type d -name "globalmount" 2>/dev/null | sort -r | while read -r mount; do
            unmount_path "$mount"
        done
    fi

    # Unmount any remaining kubelet plugin mounts
    if [ -d /var/lib/kubelet/plugins ]; then
        find /var/lib/kubelet/plugins -type d 2>/dev/null | sort -r | while read -r mount; do
            if mountpoint -q "$mount" 2>/dev/null; then
                unmount_path "$mount"
            fi
        done
    fi

    # Final sweep: unmount anything still mounted under /var/lib/kubelet.
    # BUGFIX: grep exits 1 when nothing matches, which under 'pipefail' made
    # this whole script (and thus the recipe, via the outer 'set -e') fail on
    # the clean case where nothing is left mounted — guard with '|| true'.
    mount | { grep '/var/lib/kubelet' || true; } | awk '{print $3}' | sort -r | while read -r mount; do
        unmount_path "$mount"
    done

    echo "✓ All kubelet mounts cleaned up"
    EOF

    echo ""
    echo "✓ k3s stopped gracefully on ${LOCAL_K8S_HOST}."
    echo "You can now safely shutdown the machine."
|
||||||
|
|
||||||
# Start k3s cluster
|
# Start k3s cluster
|
||||||
|
|||||||
@@ -4,3 +4,7 @@ ingress:
|
|||||||
defaultSettings:
  deletingConfirmationFlag: true
  defaultReplicaCount: 1
  # Automatically delete orphaned data to keep storage clean
  orphanAutoDeletion: true
  # Force pod deletion when node goes down to trigger volume detachment
  nodeDownPodDeletionPolicy: delete-both-statefulset-and-deployment-pod
|
||||||
|
|||||||
Reference in New Issue
Block a user