fix(agent): use dataset used capacity for RAIDZ pools instead of zpool alloc

For RAIDZ pools, zpool ALLOC includes parity overhead, but users expect
to see actual data usage. The agent now uses the dataset Used value (from
statfs) when RAIDZ is detected, matching the existing fix for total capacity.

Fixes the second part of #1052 where used capacity was inflated.
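
To make the inflation concrete, here is a small standalone illustration (not part of the agent code) using the figures from the regression test below: on the simulated 3-disk RAIDZ1 pool, 4.6 GB of user data shows up as roughly 7 GB of zpool ALLOC once parity blocks are counted, so a usage percentage derived from ALLOC overstates real consumption.

// Illustration only: contrast the old (ALLOC-based) and new (dataset Used)
// usage calculation with the values from the regression test below.
package main

import "fmt"

func main() {
	const (
		usableTotal = 29_000_000_000_000.0 // dataset Total from statfs (~29 TB usable)
		zpoolAlloc  = 7_000_000_000.0      // zpool ALLOC: user data plus RAIDZ1 parity
		datasetUsed = 4_600_000_000.0      // zfs Used: actual user data
	)
	fmt.Printf("before: %.3f%% used\n", zpoolAlloc/usableTotal*100)  // ≈ 0.024%
	fmt.Printf("after:  %.3f%% used\n", datasetUsed/usableTotal*100) // ≈ 0.016%
}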
rcourtman
2026-01-10 15:25:28 +00:00
parent 80729408c1
commit 1816e2dbb8
3 changed files with 71 additions and 16 deletions


@@ -99,6 +99,11 @@ export function createWebSocketStore(url: string) {
   let consecutiveEmptyHostUpdates = 0;
   let hasReceivedNonEmptyHosts = false;
+  // Track consecutive empty Kubernetes clusters payloads (same protection as dockerHosts/hosts)
+  // This prevents clusters from disappearing when transient empty arrays are received.
+  let consecutiveEmptyK8sUpdates = 0;
+  let hasReceivedNonEmptyK8sClusters = false;
   const mergeDockerHostRevocations = (incomingHosts: DockerHost[]) => {
     if (!Array.isArray(incomingHosts) || incomingHosts.length === 0) {
       return incomingHosts;
@@ -311,6 +316,8 @@ export function createWebSocketStore(url: string) {
     hasReceivedNonEmptyDockerHosts = false;
     consecutiveEmptyHostUpdates = 0;
     hasReceivedNonEmptyHosts = false;
+    consecutiveEmptyK8sUpdates = 0;
+    hasReceivedNonEmptyK8sClusters = false;
     // Start heartbeat to keep connection alive
     if (heartbeatInterval) {
@@ -597,11 +604,53 @@ export function createWebSocketStore(url: string) {
           : [];
         setState('removedDockerHosts', reconcile(removed, { key: 'id' }));
       }
+      // Process Kubernetes clusters with transient empty payload protection
+      // (same logic as dockerHosts/hosts to prevent UI flapping)
       if (message.data.kubernetesClusters !== undefined) {
-        const clusters = Array.isArray(message.data.kubernetesClusters)
-          ? (message.data.kubernetesClusters as KubernetesCluster[])
-          : [];
-        setState('kubernetesClusters', reconcile(clusters, { key: 'id' }));
+        if (Array.isArray(message.data.kubernetesClusters)) {
+          const incomingClusters = message.data.kubernetesClusters as KubernetesCluster[];
+          if (incomingClusters.length === 0) {
+            consecutiveEmptyK8sUpdates += 1;
+            // Check if all existing clusters are stale (>60s since lastSeen)
+            // If so, they're probably really gone - apply the empty update immediately
+            const now = Date.now();
+            const staleThresholdMs = 60_000; // 60 seconds
+            const existingClusters = state.kubernetesClusters || [];
+            const allStale = existingClusters.length === 0 || existingClusters.every(
+              (c) => !c.lastSeen || (now - c.lastSeen) > staleThresholdMs
+            );
+            const shouldApply =
+              !hasReceivedNonEmptyK8sClusters ||
+              allStale ||
+              consecutiveEmptyK8sUpdates >= 3 ||
+              message.type === WEBSOCKET.MESSAGE_TYPES.INITIAL_STATE;
+            if (shouldApply) {
+              logger.debug('[WebSocket] Updating kubernetesClusters', {
+                count: incomingClusters.length,
+                reason: allStale ? 'allStale' : 'threshold',
+              });
+              setState('kubernetesClusters', reconcile(incomingClusters, { key: 'id' }));
+            } else {
+              logger.debug('[WebSocket] Skipping transient empty kubernetesClusters payload', {
+                streak: consecutiveEmptyK8sUpdates,
+              });
+            }
+          } else {
+            consecutiveEmptyK8sUpdates = 0;
+            hasReceivedNonEmptyK8sClusters = true;
+            logger.debug('[WebSocket] Updating kubernetesClusters', {
+              count: incomingClusters.length,
+            });
+            setState('kubernetesClusters', reconcile(incomingClusters, { key: 'id' }));
+          }
+        } else {
+          logger.warn('[WebSocket] Received non-array kubernetesClusters payload', {
+            type: typeof message.data.kubernetesClusters,
+          });
+        }
+      }
       }
       if (message.data.removedKubernetesClusters !== undefined) {
         const removed = Array.isArray(message.data.removedKubernetesClusters)


@@ -74,22 +74,26 @@ func disksFromZpoolStats(
 	// For RAIDZ/mirror pools, zpool SIZE is raw capacity (sum of all disks),
 	// but users expect usable capacity (accounting for parity/redundancy).
 	// The dataset's Total (from statfs) gives usable capacity.
-	// Use dataset stats when available and smaller than zpool size. (issue #1052)
+	// Similarly, zpool ALLOC includes parity overhead, but dataset Used gives
+	// actual data usage. Use dataset stats when available and smaller than
+	// zpool size. (issue #1052)
 	totalBytes := stat.Size
+	usedBytes := stat.Alloc
 	freeBytes := stat.Free
 	if ds.Total > 0 && ds.Total < stat.Size {
 		totalBytes = ds.Total
+		usedBytes = ds.Used
 		freeBytes = ds.Free
 	}
-	usage := clampPercent(calculatePercent(totalBytes, stat.Alloc))
+	usage := clampPercent(calculatePercent(totalBytes, usedBytes))
 	disks = append(disks, agentshost.Disk{
 		Device:     pool,
 		Mountpoint: mp,
 		Filesystem: "zfs",
 		Type:       "zfs",
 		TotalBytes: int64(totalBytes),
-		UsedBytes:  int64(stat.Alloc),
+		UsedBytes:  int64(usedBytes),
 		FreeBytes:  int64(freeBytes),
 		Usage:      usage,
 	})


@@ -53,15 +53,17 @@ func TestSummarizeZFSPoolsRAIDZCapacity(t *testing.T) {
 	// Simulate a RAIDZ1 pool with 3 disks:
 	// - Raw SIZE from zpool list: 43.6 TB (sum of all disks)
 	// - Usable capacity from statfs: 29 TB (after RAIDZ1 parity overhead)
+	// - zpool ALLOC: 7 GB (includes parity data)
+	// - zfs USED: 4.6 GB (actual user data)
 	queryZpoolStats = func(ctx context.Context, pools []string) (map[string]zpoolStats, error) {
 		return map[string]zpoolStats{
-			"Main": {Size: 43600000000000, Alloc: 962000000, Free: 43599038000000},
+			"Main": {Size: 43600000000000, Alloc: 7000000000, Free: 43593000000000},
 		}, nil
 	}
-	// Dataset stats from statfs reflect usable capacity (29 TB)
+	// Dataset stats from statfs reflect usable capacity (29 TB) and actual data usage (4.6 GB)
 	datasets := []zfsDatasetUsage{
-		{Pool: "Main", Dataset: "Main", Mountpoint: "/mnt/Main", Total: 29000000000000, Used: 962000000, Free: 28999038000000},
+		{Pool: "Main", Dataset: "Main", Mountpoint: "/mnt/Main", Total: 29000000000000, Used: 4600000000, Free: 28995400000000},
 	}
 	disks := summarizeZFSPools(context.Background(), datasets)
@@ -80,20 +82,20 @@ func TestSummarizeZFSPoolsRAIDZCapacity(t *testing.T) {
t.Errorf("expected TotalBytes %d (usable capacity), got %d (might be using raw capacity)", expectedTotal, main.TotalBytes)
}
// Used should come from zpool stats (accurate allocation)
expectedUsed := int64(962000000)
// Used should come from dataset stats (4.6 GB actual data), not zpool alloc (7 GB with parity)
expectedUsed := int64(4600000000)
if main.UsedBytes != expectedUsed {
t.Errorf("expected UsedBytes %d, got %d", expectedUsed, main.UsedBytes)
t.Errorf("expected UsedBytes %d (dataset used), got %d (might be using zpool alloc which includes parity)", expectedUsed, main.UsedBytes)
}
// Free should use dataset stats when we're using dataset Total
expectedFree := int64(28999038000000)
expectedFree := int64(28995400000000)
if main.FreeBytes != expectedFree {
t.Errorf("expected FreeBytes %d, got %d", expectedFree, main.FreeBytes)
}
// Usage should be calculated against usable capacity
// 962000000 / 29000000000000 * 100 ≈ 0.003%
// Usage should be calculated against usable capacity with actual used data
// 4600000000 / 29000000000000 * 100 ≈ 0.016%
if main.Usage > 0.1 {
t.Errorf("expected usage ~0%%, got %.2f%% (might be calculated against wrong total)", main.Usage)
}
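
The updated test pins down the RAIDZ branch (dataset Total smaller than zpool SIZE). A companion case for the other branch could look like the sketch below; it assumes the same test file and helpers shown above (queryZpoolStats, zpoolStats, zfsDatasetUsage, summarizeZFSPools), while the pool name, the numbers, and the lookup loop are invented for illustration. When the dataset Total is not smaller than zpool SIZE, the zpool Alloc value should be kept.

// Hypothetical companion test (not part of this commit): a pool whose statfs
// Total matches zpool SIZE, so the dataset fallback should not trigger and
// UsedBytes should keep coming from zpool ALLOC.
func TestSummarizeZFSPoolsNonRaidzKeepsZpoolAlloc(t *testing.T) {
	queryZpoolStats = func(ctx context.Context, pools []string) (map[string]zpoolStats, error) {
		return map[string]zpoolStats{
			"tank": {Size: 10000000000000, Alloc: 1000000000000, Free: 9000000000000},
		}, nil
	}
	// Dataset Total equals zpool SIZE, so zpool values win even though the
	// dataset reports a slightly different Used figure.
	datasets := []zfsDatasetUsage{
		{Pool: "tank", Dataset: "tank", Mountpoint: "/mnt/tank", Total: 10000000000000, Used: 990000000000, Free: 9010000000000},
	}
	disks := summarizeZFSPools(context.Background(), datasets)
	found := false
	for _, d := range disks {
		if d.Device != "tank" {
			continue
		}
		found = true
		if d.UsedBytes != 1000000000000 {
			t.Errorf("expected UsedBytes 1000000000000 (zpool alloc), got %d", d.UsedBytes)
		}
	}
	if !found {
		t.Fatalf("expected a disk entry for pool tank, got %+v", disks)
	}
}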