diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 5ae79f4..d12fb69 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -4,8 +4,103 @@ This document provides solutions to common issues encountered when working with
 
 ## Table of Contents
 
+- [Longhorn Issues](#longhorn-issues)
 - [Vault Issues](#vault-issues)
 
+## Longhorn Issues
+
+### EXT4 Errors on Machine Shutdown
+
+#### Symptom
+
+When shutting down the machine, you see errors like:
+
+```plain
+EXT4-fs (sdf): failed to convert unwritten extents to written extents -- potential data loss! (inode 393220, error -30)
+```
+
+Or similar I/O errors in the kernel log:
+
+```plain
+blk_update_request: I/O error, dev sdf, sector XXXXX op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
+Buffer I/O error on dev dm-X, logical block XXXXX, lost sync page write
+```
+
+#### Cause
+
+This occurs when the machine is shut down without properly detaching Longhorn volumes. The standard k3s shutdown procedure (`systemctl stop k3s` or `k3s-killall.sh`) does not gracefully handle Longhorn volume detachment.
+
+When volumes are forcefully detached during shutdown:
+
+- Dirty data may not be flushed to disk
+- The filesystem encounters I/O errors while trying to complete pending writes
+- This can lead to data corruption or loss
+
+Reference: <https://github.com/longhorn/longhorn/issues/7206>
+
+#### Solution
+
+Always use `just k8s::stop` before shutting down the machine:
+
+```bash
+# Gracefully stop k3s with proper Longhorn volume detachment
+just k8s::stop
+
+# Now you can safely shut down the machine
+sudo shutdown -h now
+```
+
+The `just k8s::stop` recipe performs the following steps:
+
+1. **Drains the node** using `kubectl drain` to gracefully evict all pods
+2. **Waits for Longhorn volumes** to be fully detached (it polls `kubectl get volumes.longhorn.io -n longhorn-system` until no volume reports an `attached` state)
+3. **Stops and disables the k3s service** and cleans up container processes via `k3s-killall.sh`
+4. **Terminates any remaining containerd-shim processes**
+
+#### Expected Warnings During Drain
+
+During the drain process, you may see warnings like:
+
+```plain
+error when evicting pods/"instance-manager-..." -n "longhorn" (will retry after 5s): Cannot evict pod as it would violate the pod's disruption budget.
+```
+
+This is normal. Longhorn's instance-manager pods are protected by a PodDisruptionBudget (PDB). The drain command keeps retrying, and once the volumes on the node are detached Longhorn releases the PDB and the pods are evicted.
+
+You may also see client-side throttling messages:
+
+```plain
+"Waited before sending request" delay="1.000769875s" reason="client-side throttling..."
+```
+
+This is also normal. The Kubernetes client automatically throttles requests when many pods are evicted at once. These messages are informational and do not indicate a problem.
+
+#### Starting the Cluster After Reboot
+
+Because `just k8s::stop` disables the k3s service, the cluster does not come back automatically after a reboot. Start it with:
+
+```bash
+just k8s::start
+```
+
+This will:
+
+1. Re-enable and start the k3s service
+2. Wait for the node to be ready
+3. Automatically uncordon the node (which was cordoned during drain)
+4. Wait for the Longhorn CSI plugin to be ready
+
+#### Quick Reference
+
+```bash
+# Before shutdown
+just k8s::stop
+sudo shutdown -h now
+
+# After reboot
+just k8s::start
+just vault::unseal  # If Vault is installed
+```
+
 ## Vault Issues
 
 ### Vault is Sealed
diff --git a/k8s/justfile b/k8s/justfile
index 191691a..1fc32f6 100644
--- a/k8s/justfile
+++ b/k8s/justfile
@@ -126,142 +126,161 @@ stop:
     #!/bin/bash
     set -euo pipefail
 
+    # Track elapsed time so progress messages can report how long each step takes
+    START_TIME=$(date +%s)
+    elapsed() {
+        echo "$(($(date +%s) - START_TIME))s"
+    }
+
     echo "Starting graceful k3s shutdown..."
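+
+    # NOTE: a single-node cluster is assumed throughout this recipe; the first
+    # (and only) node reported by kubectl is the one that gets drained below and
+    # later uncordoned by 'just k8s::start'.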
-    # Check if Longhorn is installed
+    # Get node name
+    NODE_NAME=$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
+
+    if [ -z "$NODE_NAME" ]; then
+        echo "⚠ Could not get node name, k3s may already be stopped"
+        echo "Running k3s-killall.sh for cleanup..."
+        ssh "${LOCAL_K8S_HOST}" "sudo /usr/local/bin/k3s-killall.sh 2>/dev/null || true"
+        echo "✓ Cleanup completed ($(elapsed))"
+        exit 0
+    fi
+
+    echo "Node: $NODE_NAME"
+
+    # Drain the node to gracefully evict all pods and detach Longhorn volumes
+    # This is the recommended way to shut down with Longhorn (see: https://github.com/longhorn/longhorn/issues/7206)
+    echo "[$(elapsed)] Draining node to gracefully detach Longhorn volumes..."
+    kubectl drain "$NODE_NAME" \
+        --ignore-daemonsets \
+        --delete-emptydir-data \
+        --force \
+        --grace-period=30 \
+        --timeout=90s 2>&1 || {
+        echo "⚠ Drain had warnings (this is usually OK for single-node clusters)"
+    }
+    echo "[$(elapsed)] Drain completed"
+
+    # Wait for Longhorn volumes to be fully detached
     if helm status longhorn -n longhorn-system &>/dev/null; then
-        echo "Detected Longhorn installation. Scaling down workloads..."
-
-        # Scale down all deployments and statefulsets
-        echo "Scaling down Deployments..."
-        kubectl scale deployment --all --replicas=0 -A --timeout=60s 2>/dev/null || true
-        echo "Scaling down StatefulSets..."
-        kubectl scale statefulset --all --replicas=0 -A --timeout=60s 2>/dev/null || true
-
-        echo "Waiting for Longhorn volumes to be detached..."
-
-        # Wait for all volumes to be detached (max 60 seconds)
-        TIMEOUT=60
+        echo "[$(elapsed)] Waiting for Longhorn volumes to be detached..."
+        TIMEOUT=30
         ELAPSED=0
         while [ $ELAPSED -lt $TIMEOUT ]; do
-            # Check if any volume is still attached
             ATTACHED=$(kubectl get volumes.longhorn.io -n longhorn-system -o json 2>/dev/null | \
                 jq -r '.items[] | select(.status.state == "attached") | .metadata.name' 2>/dev/null || true)
 
             if [ -z "$ATTACHED" ]; then
-                echo "✓ All Longhorn volumes detached successfully"
+                echo "[$(elapsed)] ✓ All Longhorn volumes detached successfully"
                 break
             fi
 
-            ATTACHED_COUNT=$(echo "$ATTACHED" | wc -l)
+            # grep -c . counts non-empty lines, i.e. the number of still-attached volumes
+            ATTACHED_COUNT=$(echo "$ATTACHED" | grep -c . || echo 0)
             echo "  Still waiting for $ATTACHED_COUNT volume(s) to detach..."
             sleep 2
             ELAPSED=$((ELAPSED + 2))
         done
 
         if [ $ELAPSED -ge $TIMEOUT ]; then
-            echo "⚠ Warning: Timeout waiting for volumes to detach"
-            echo "  Remaining attached volumes:"
-            echo "$ATTACHED" | sed 's/^/    /'
+            echo "[$(elapsed)] ⚠ Warning: Timeout waiting for volumes to detach"
        fi
-    else
-        echo "Longhorn not detected, skipping volume detachment wait."
     fi
 
-    echo "Cleaning up CSI mounts..."
+    # Stop and disable k3s service to prevent auto-start on reboot
+    echo "[$(elapsed)] Stopping and disabling k3s service..."
+    ssh "${LOCAL_K8S_HOST}" "sudo systemctl stop k3s 2>/dev/null || true"
+    ssh "${LOCAL_K8S_HOST}" "sudo systemctl disable k3s 2>/dev/null || true"
+
+    # Run k3s-killall.sh to clean up all container processes
+    echo "[$(elapsed)] Running k3s-killall.sh to stop all container processes..."
     ssh "${LOCAL_K8S_HOST}" 'bash -s' << 'EOF'
     set +e
-    sudo pkill -9 umount 2>/dev/null
-    shopt -s nullglob
-    for mount in /var/lib/kubelet/plugins/kubernetes.io/csi/*/globalmount; do
-        if [ -d "$mount" ]; then
-            echo "  Unmounting $mount..."
-            sudo umount -f "$mount" 2>/dev/null
-            sudo umount -l "$mount" 2>/dev/null
-        fi
-    done
+
+    if [ -x /usr/local/bin/k3s-killall.sh ]; then
+        echo "  Executing /usr/local/bin/k3s-killall.sh..."
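+        # 180s is an assumed upper bound for k3s-killall.sh on this host; if it
+        # hangs past that, fall back to killing the processes by exact name below.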
+        timeout 180 sudo /usr/local/bin/k3s-killall.sh || {
+            echo "  k3s-killall.sh timed out, forcing cleanup..."
+            # Use pgrep/kill instead of pkill -f to avoid matching ourselves
+            for pid in $(pgrep -x k3s 2>/dev/null); do
+                sudo kill -9 "$pid" 2>/dev/null || true
+            done
+            sudo pkill -9 -x containerd-shim-runc-v2 2>/dev/null || true
+            sudo pkill -9 -x containerd 2>/dev/null || true
+        }
+    else
+        echo "  k3s-killall.sh not found, stopping manually..."
+        sudo systemctl stop k3s 2>/dev/null || true
+        for pid in $(pgrep -x k3s 2>/dev/null); do
+            sudo kill -9 "$pid" 2>/dev/null || true
+        done
+        sudo pkill -9 -x containerd-shim-runc-v2 2>/dev/null || true
+    fi
+    exit 0
     EOF
 
-    echo "Stopping k3s service..."
-    ssh "${LOCAL_K8S_HOST}" "sudo systemctl stop k3s"
-
-    # Wait for k3s to fully stop
-    echo "Waiting for k3s to fully stop..."
-    STOP_TIMEOUT=30
-    STOP_ELAPSED=0
-    while [ $STOP_ELAPSED -lt $STOP_TIMEOUT ]; do
-        if ! ssh "${LOCAL_K8S_HOST}" "sudo systemctl is-active --quiet k3s"; then
-            echo "✓ k3s stopped successfully"
+    # Wait for containerd-shim processes to terminate
+    echo "[$(elapsed)] Waiting for containerd-shim processes to terminate..."
+    SHIM_TIMEOUT=15
+    SHIM_ELAPSED=0
+    while [ $SHIM_ELAPSED -lt $SHIM_TIMEOUT ]; do
+        SHIM_COUNT=$(ssh "${LOCAL_K8S_HOST}" "pgrep containerd-shim 2>/dev/null | wc -l | tr -d ' '")
+        SHIM_COUNT=${SHIM_COUNT:-0}
+        if [ "$SHIM_COUNT" -eq 0 ] 2>/dev/null; then
+            echo "[$(elapsed)] ✓ All containerd-shim processes terminated"
             break
         fi
-        sleep 1
-        STOP_ELAPSED=$((STOP_ELAPSED + 1))
+        echo "  Still waiting for $SHIM_COUNT containerd-shim process(es)..."
+        sleep 2
+        SHIM_ELAPSED=$((SHIM_ELAPSED + 2))
     done
 
-    if [ $STOP_ELAPSED -ge $STOP_TIMEOUT ]; then
-        echo "⚠ Warning: k3s did not stop within timeout"
+    if [ $SHIM_ELAPSED -ge $SHIM_TIMEOUT ]; then
+        echo "[$(elapsed)] ⚠ Warning: containerd-shim processes did not terminate within timeout"
+        echo "  Forcing termination..."
+        ssh "${LOCAL_K8S_HOST}" "sudo pkill -9 containerd-shim 2>/dev/null || true"
+        sleep 1
     fi
 
-    # Cleanup all kubelet mounts after k3s stops
-    echo "Cleaning up all kubelet mounts..."
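+    # No explicit unmount pass is needed at this point: k3s-killall.sh is expected
+    # to clean up the kubelet/CSI mounts it manages. An optional manual sanity
+    # check before powering off is:
+    #   ssh "${LOCAL_K8S_HOST}" "mount | grep /var/lib/kubelet || true"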
-    ssh "${LOCAL_K8S_HOST}" 'bash -s' << 'EOF'
-    set -euo pipefail
-
-    # Function to unmount a path safely
-    unmount_path() {
-        local path="$1"
-        if mountpoint -q "$path" 2>/dev/null; then
-            echo "  Unmounting: $path"
-            sudo umount "$path" 2>/dev/null || sudo umount -f "$path" 2>/dev/null || sudo umount -l "$path" 2>/dev/null || true
-        fi
-    }
-
-    # Unmount all Longhorn CSI mounts first (most specific paths first)
-    if [ -d /var/lib/kubelet/pods ]; then
-        # Find and unmount all volume mounts in pods
-        find /var/lib/kubelet/pods -type d -name "mount" 2>/dev/null | sort -r | while read -r mount; do
-            unmount_path "$mount"
-        done
-    fi
-
-    # Unmount CSI globalmounts
-    if [ -d /var/lib/kubelet/plugins/kubernetes.io/csi ]; then
-        find /var/lib/kubelet/plugins/kubernetes.io/csi -type d -name "globalmount" 2>/dev/null | sort -r | while read -r mount; do
-            unmount_path "$mount"
-        done
-    fi
-
-    # Unmount any remaining kubelet plugin mounts
-    if [ -d /var/lib/kubelet/plugins ]; then
-        find /var/lib/kubelet/plugins -type d 2>/dev/null | sort -r | while read -r mount; do
-            if mountpoint -q "$mount" 2>/dev/null; then
-                unmount_path "$mount"
-            fi
-        done
-    fi
-
-    # Final check: unmount anything still mounted under /var/lib/kubelet
-    mount | grep '/var/lib/kubelet' | awk '{print $3}' | sort -r | while read -r mount; do
-        unmount_path "$mount"
-    done
-
-    echo "✓ All kubelet mounts cleaned up"
-    EOF
 
     echo ""
-    echo "✓ k3s stopped gracefully on ${LOCAL_K8S_HOST}."
+    echo "✓ k3s stopped gracefully on ${LOCAL_K8S_HOST}. (Total: $(elapsed))"
     echo "You can now safely shutdown the machine."
+    echo ""
+    echo "IMPORTANT: k3s has been disabled and will NOT start automatically on reboot."
+    echo "After reboot, you MUST manually run:"
+    echo "  just k8s::start"
+    echo "  just vault::unseal  # If Vault is installed"
 
 # Start k3s cluster
 start:
     #!/bin/bash
     set -euo pipefail
 
-    echo "Starting k3s service..."
+    echo "Enabling and starting k3s service..."
+    ssh "${LOCAL_K8S_HOST}" "sudo systemctl enable k3s"
     ssh "${LOCAL_K8S_HOST}" "sudo systemctl start k3s"
 
     echo "Waiting for k3s to be ready..."
     sleep 5
     kubectl wait --for=condition=Ready nodes --all --timeout=60s
+
+    # Uncordon the node if it was cordoned by 'just k8s::stop'
+    NODE_NAME=$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
+    if [ -n "$NODE_NAME" ]; then
+        NODE_SCHEDULABLE=$(kubectl get node "$NODE_NAME" -o jsonpath='{.spec.unschedulable}' 2>/dev/null || echo "false")
+        if [ "$NODE_SCHEDULABLE" = "true" ]; then
+            echo "Uncordoning node $NODE_NAME..."
+            kubectl uncordon "$NODE_NAME"
+        fi
+    fi
+
+    # Wait for Longhorn CSI plugin to be ready before other pods start using volumes
+    if helm status longhorn -n longhorn &>/dev/null; then
+        echo "Waiting for Longhorn CSI plugin to be ready..."
+        if ! kubectl wait --for=condition=Ready pod -l app=longhorn-csi-plugin -n longhorn --timeout=120s 2>/dev/null; then
+            echo "⚠ Longhorn CSI plugin not ready, restarting pod..."
+            kubectl delete pod -l app=longhorn-csi-plugin -n longhorn --ignore-not-found
+            kubectl wait --for=condition=Ready pod -l app=longhorn-csi-plugin -n longhorn --timeout=120s
+        fi
+        echo "✓ Longhorn CSI plugin is ready"
+    fi
+
     echo "k3s started on ${LOCAL_K8S_HOST}."
 
 # Restart k3s cluster (with CSI cleanup)