From 7444bd0468169dbca6adf727a21cdc6e72dcc0ab Mon Sep 17 00:00:00 2001 From: rcourtman Date: Mon, 2 Feb 2026 15:17:53 +0000 Subject: [PATCH] fix(alerts): guest alerts misclassified as node alerts when threshold disabled (#1145) In single-node setups, guest alerts had Instance == Node, causing reevaluateActiveAlertsLocked to evaluate them against NodeDefaults instead of GuestDefaults. Setting guest memory threshold to 0 (disabled) wouldn't clear existing guest alerts because they were being kept alive by the still-enabled node memory threshold. - Add resourceID colon check to distinguish guest IDs (instance:node:vmid) from node IDs (instance-node) in reevaluateActiveAlertsLocked - Clear stale alerts in checkMetric when threshold is nil or disabled - Skip hysteresis validation for disabled thresholds (Trigger <= 0) - Fix frontend tooltip: "0" not "-1" disables a threshold --- .../src/components/Alerts/ThresholdsTable.tsx | 2 +- internal/alerts/alerts.go | 8 ++- internal/alerts/alerts_test.go | 53 ++++++++++++++++ internal/alerts/threshold_update_test.go | 62 +++++++++++++++++++ internal/alerts/utility_test.go | 12 ++-- 5 files changed, 129 insertions(+), 8 deletions(-) diff --git a/frontend-modern/src/components/Alerts/ThresholdsTable.tsx b/frontend-modern/src/components/Alerts/ThresholdsTable.tsx index 45e6d005e..102e6b33d 100644 --- a/frontend-modern/src/components/Alerts/ThresholdsTable.tsx +++ b/frontend-modern/src/components/Alerts/ThresholdsTable.tsx @@ -2677,7 +2677,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
Quick tips: Set any threshold to{' '} - -1 + 0 {' '} to disable alerts for that metric. Click on disabled thresholds showing{' '} Off to re-enable them. Resources with custom settings show a{' '} diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go index 3d787c8c5..defc5e938 100644 --- a/internal/alerts/alerts.go +++ b/internal/alerts/alerts.go @@ -1059,6 +1059,10 @@ func ensureValidHysteresis(threshold *HysteresisThreshold, metricName string) { if threshold == nil { return } + // Disabled thresholds don't need hysteresis validation + if threshold.Trigger <= 0 { + return + } if threshold.Clear >= threshold.Trigger { log.Warn(). Str("metric", metricName). @@ -1786,7 +1790,7 @@ func (m *Manager) reevaluateActiveAlertsLocked() { // Determine the resource type from the alert's metadata or instance // We need to check what kind of resource this is - if threshold == nil && (alert.Instance == "Node" || alert.Instance == alert.Node) { + if threshold == nil && !strings.Contains(resourceID, ":") && (alert.Instance == "Node" || alert.Instance == alert.Node) { // This is a node alert // Check if all node alerts are disabled if m.config.DisableAllNodes { @@ -6006,6 +6010,8 @@ type metricOptions struct { func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resourceType, metricType string, value float64, threshold *HysteresisThreshold, opts *metricOptions) { if threshold == nil || threshold.Trigger <= 0 { + alertID := fmt.Sprintf("%s-%s", resourceID, metricType) + m.clearAlert(alertID) return } diff --git a/internal/alerts/alerts_test.go b/internal/alerts/alerts_test.go index ca03be797..3565b6b70 100644 --- a/internal/alerts/alerts_test.go +++ b/internal/alerts/alerts_test.go @@ -112,6 +112,59 @@ func TestAcknowledgePersistsThroughCheckMetric(t *testing.T) { } } +func TestCheckMetricClearsAlertWhenThresholdDisabled(t *testing.T) { + m := newTestManager(t) + m.ClearActiveAlerts() + m.mu.Lock() + m.config.TimeThreshold = 0 + m.config.TimeThresholds = map[string]int{} + m.config.SuppressionWindow = 0 + m.config.MinimumDelta = 0 + m.mu.Unlock() + + // First, create an active alert with an enabled threshold + threshold := &HysteresisThreshold{Trigger: 80, Clear: 70} + m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, threshold, nil) + + m.mu.RLock() + _, exists := m.activeAlerts["res1-memory"] + m.mu.RUnlock() + if !exists { + t.Fatalf("expected alert to be created") + } + + // Now call checkMetric with a disabled threshold (Trigger=0) — should clear the alert + disabledThreshold := &HysteresisThreshold{Trigger: 0, Clear: 0} + m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, disabledThreshold, nil) + + m.mu.RLock() + _, stillExists := m.activeAlerts["res1-memory"] + m.mu.RUnlock() + if stillExists { + t.Errorf("expected alert to be cleared when threshold is disabled (Trigger=0)") + } + + // Also test with nil threshold + // Re-create the alert + m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, threshold, nil) + m.mu.RLock() + _, exists = m.activeAlerts["res1-memory"] + m.mu.RUnlock() + if !exists { + t.Fatalf("expected alert to be re-created") + } + + // Call with nil threshold — should also clear + m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, nil, nil) + + m.mu.RLock() + _, stillExists = m.activeAlerts["res1-memory"] + m.mu.RUnlock() + if stillExists { + t.Errorf("expected alert to be cleared when threshold is nil") + } +} + func TestCheckGuestSkipsAlertsWhenMetricDisabled(t *testing.T) { m := newTestManager(t) diff --git a/internal/alerts/threshold_update_test.go b/internal/alerts/threshold_update_test.go index d0597b296..978ec52b2 100644 --- a/internal/alerts/threshold_update_test.go +++ b/internal/alerts/threshold_update_test.go @@ -216,3 +216,65 @@ func TestReevaluateActiveAlertsStillAboveThreshold(t *testing.T) { t.Errorf("Expected alert to remain active since value (96%%) is still above new threshold (90%%)") } } + +// TestReevaluateActiveAlertsGuestNotMisclassifiedAsNode tests that guest alerts +// are not misclassified as node alerts when Instance == Node (single-node setups). +// This is the root cause of GitHub #1145. +func TestReevaluateActiveAlertsGuestNotMisclassifiedAsNode(t *testing.T) { + manager := NewManager() + + manager.mu.Lock() + manager.activeAlerts = make(map[string]*Alert) + manager.mu.Unlock() + + // Configure: guest memory disabled (trigger=0), node memory enabled + config := AlertConfig{ + Enabled: true, + GuestDefaults: ThresholdConfig{ + CPU: &HysteresisThreshold{Trigger: 80, Clear: 75}, + Memory: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Disabled + }, + NodeDefaults: ThresholdConfig{ + CPU: &HysteresisThreshold{Trigger: 80, Clear: 75}, + Memory: &HysteresisThreshold{Trigger: 85, Clear: 80}, // Enabled + }, + StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80}, + Overrides: make(map[string]ThresholdConfig), + } + manager.UpdateConfig(config) + + // Create a guest alert where Instance == Node (single-node setup). + // Guest resource IDs contain ":" (format instance:node:vmid). + alertID := "pve1:pve1:101-memory" + alert := &Alert{ + ID: alertID, + Type: "memory", + Level: AlertLevelWarning, + ResourceID: "pve1:pve1:101", + ResourceName: "test-vm", + Node: "pve1", + Instance: "pve1", // Same as Node — triggers the bug + Message: "Memory usage is 90%", + Value: 90.0, + Threshold: 85.0, + StartTime: time.Now().Add(-5 * time.Minute), + LastSeen: time.Now(), + } + + manager.mu.Lock() + manager.activeAlerts[alertID] = alert + manager.mu.Unlock() + + // Re-apply same config to trigger reevaluation + manager.UpdateConfig(config) + time.Sleep(100 * time.Millisecond) + + // The guest alert should be resolved because GuestDefaults.Memory is disabled + manager.mu.RLock() + _, alertStillActive := manager.activeAlerts[alertID] + manager.mu.RUnlock() + + if alertStillActive { + t.Errorf("Guest alert should have been resolved when guest memory threshold is disabled, but it was misclassified as a node alert") + } +} diff --git a/internal/alerts/utility_test.go b/internal/alerts/utility_test.go index 200587fcc..627d323fe 100644 --- a/internal/alerts/utility_test.go +++ b/internal/alerts/utility_test.go @@ -1619,20 +1619,20 @@ func TestEnsureValidHysteresis(t *testing.T) { expectChange: true, }, { - name: "zero trigger with positive clear is fixed", + name: "zero trigger with positive clear is skipped (disabled)", threshold: &HysteresisThreshold{Trigger: 0, Clear: 5}, metricName: "zero", wantTrigger: 0, - wantClear: 0, // 0 - 5 = -5, clamped to 0 - expectChange: true, + wantClear: 5, // Disabled thresholds are left as-is + expectChange: false, // No change — disabled threshold skipped }, { - name: "both zero triggers auto-fix (clear >= trigger)", + name: "both zero triggers skipped (disabled)", threshold: &HysteresisThreshold{Trigger: 0, Clear: 0}, metricName: "disabled", wantTrigger: 0, - wantClear: 0, // 0 - 5 = -5, clamped to 0 (same value, but fix attempted) - expectChange: false, // Result same as input, even though fix was attempted + wantClear: 0, + expectChange: false, // No change — disabled threshold skipped }, { name: "large trigger with equal clear",