fix(alerts): guest alerts misclassified as node alerts when threshold disabled (#1145)

In single-node setups, guest alerts had Instance == Node, causing
reevaluateActiveAlertsLocked to evaluate them against NodeDefaults
instead of GuestDefaults. Setting guest memory threshold to 0 (disabled)
wouldn't clear existing guest alerts because they were being kept alive
by the still-enabled node memory threshold.

- Add resourceID colon check to distinguish guest IDs (instance:node:vmid)
  from node IDs (instance-node) in reevaluateActiveAlertsLocked
- Clear stale alerts in checkMetric when threshold is nil or disabled
- Skip hysteresis validation for disabled thresholds (Trigger <= 0)
- Fix frontend tooltip: "0" not "-1" disables a threshold
This commit is contained in:
rcourtman
2026-02-02 15:17:53 +00:00
parent 712e5846ec
commit 7444bd0468
5 changed files with 129 additions and 8 deletions

View File

@@ -2677,7 +2677,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
<div class="text-sm text-blue-900 dark:text-blue-100">
<span class="font-medium">Quick tips:</span> Set any threshold to{' '}
<code class="px-1 py-0.5 bg-blue-100 dark:bg-blue-900/50 rounded text-xs font-mono">
-1
0
</code>{' '}
to disable alerts for that metric. Click on disabled thresholds showing{' '}
<span class="italic">Off</span> to re-enable them. Resources with custom settings show a{' '}

View File

@@ -1059,6 +1059,10 @@ func ensureValidHysteresis(threshold *HysteresisThreshold, metricName string) {
if threshold == nil {
return
}
// Disabled thresholds don't need hysteresis validation
if threshold.Trigger <= 0 {
return
}
if threshold.Clear >= threshold.Trigger {
log.Warn().
Str("metric", metricName).
@@ -1786,7 +1790,7 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
// Determine the resource type from the alert's metadata or instance
// We need to check what kind of resource this is
if threshold == nil && (alert.Instance == "Node" || alert.Instance == alert.Node) {
if threshold == nil && !strings.Contains(resourceID, ":") && (alert.Instance == "Node" || alert.Instance == alert.Node) {
// This is a node alert
// Check if all node alerts are disabled
if m.config.DisableAllNodes {
@@ -6006,6 +6010,8 @@ type metricOptions struct {
func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resourceType, metricType string, value float64, threshold *HysteresisThreshold, opts *metricOptions) {
if threshold == nil || threshold.Trigger <= 0 {
alertID := fmt.Sprintf("%s-%s", resourceID, metricType)
m.clearAlert(alertID)
return
}

View File

@@ -112,6 +112,59 @@ func TestAcknowledgePersistsThroughCheckMetric(t *testing.T) {
}
}
func TestCheckMetricClearsAlertWhenThresholdDisabled(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.SuppressionWindow = 0
m.config.MinimumDelta = 0
m.mu.Unlock()
// First, create an active alert with an enabled threshold
threshold := &HysteresisThreshold{Trigger: 80, Clear: 70}
m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, threshold, nil)
m.mu.RLock()
_, exists := m.activeAlerts["res1-memory"]
m.mu.RUnlock()
if !exists {
t.Fatalf("expected alert to be created")
}
// Now call checkMetric with a disabled threshold (Trigger=0) — should clear the alert
disabledThreshold := &HysteresisThreshold{Trigger: 0, Clear: 0}
m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, disabledThreshold, nil)
m.mu.RLock()
_, stillExists := m.activeAlerts["res1-memory"]
m.mu.RUnlock()
if stillExists {
t.Errorf("expected alert to be cleared when threshold is disabled (Trigger=0)")
}
// Also test with nil threshold
// Re-create the alert
m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, threshold, nil)
m.mu.RLock()
_, exists = m.activeAlerts["res1-memory"]
m.mu.RUnlock()
if !exists {
t.Fatalf("expected alert to be re-created")
}
// Call with nil threshold — should also clear
m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, nil, nil)
m.mu.RLock()
_, stillExists = m.activeAlerts["res1-memory"]
m.mu.RUnlock()
if stillExists {
t.Errorf("expected alert to be cleared when threshold is nil")
}
}
func TestCheckGuestSkipsAlertsWhenMetricDisabled(t *testing.T) {
m := newTestManager(t)

View File

@@ -216,3 +216,65 @@ func TestReevaluateActiveAlertsStillAboveThreshold(t *testing.T) {
t.Errorf("Expected alert to remain active since value (96%%) is still above new threshold (90%%)")
}
}
// TestReevaluateActiveAlertsGuestNotMisclassifiedAsNode tests that guest alerts
// are not misclassified as node alerts when Instance == Node (single-node setups).
// This is the root cause of GitHub #1145.
func TestReevaluateActiveAlertsGuestNotMisclassifiedAsNode(t *testing.T) {
manager := NewManager()
manager.mu.Lock()
manager.activeAlerts = make(map[string]*Alert)
manager.mu.Unlock()
// Configure: guest memory disabled (trigger=0), node memory enabled
config := AlertConfig{
Enabled: true,
GuestDefaults: ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
Memory: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Disabled
},
NodeDefaults: ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80}, // Enabled
},
StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
Overrides: make(map[string]ThresholdConfig),
}
manager.UpdateConfig(config)
// Create a guest alert where Instance == Node (single-node setup).
// Guest resource IDs contain ":" (format instance:node:vmid).
alertID := "pve1:pve1:101-memory"
alert := &Alert{
ID: alertID,
Type: "memory",
Level: AlertLevelWarning,
ResourceID: "pve1:pve1:101",
ResourceName: "test-vm",
Node: "pve1",
Instance: "pve1", // Same as Node — triggers the bug
Message: "Memory usage is 90%",
Value: 90.0,
Threshold: 85.0,
StartTime: time.Now().Add(-5 * time.Minute),
LastSeen: time.Now(),
}
manager.mu.Lock()
manager.activeAlerts[alertID] = alert
manager.mu.Unlock()
// Re-apply same config to trigger reevaluation
manager.UpdateConfig(config)
time.Sleep(100 * time.Millisecond)
// The guest alert should be resolved because GuestDefaults.Memory is disabled
manager.mu.RLock()
_, alertStillActive := manager.activeAlerts[alertID]
manager.mu.RUnlock()
if alertStillActive {
t.Errorf("Guest alert should have been resolved when guest memory threshold is disabled, but it was misclassified as a node alert")
}
}

View File

@@ -1619,20 +1619,20 @@ func TestEnsureValidHysteresis(t *testing.T) {
expectChange: true,
},
{
name: "zero trigger with positive clear is fixed",
name: "zero trigger with positive clear is skipped (disabled)",
threshold: &HysteresisThreshold{Trigger: 0, Clear: 5},
metricName: "zero",
wantTrigger: 0,
wantClear: 0, // 0 - 5 = -5, clamped to 0
expectChange: true,
wantClear: 5, // Disabled thresholds are left as-is
expectChange: false, // No change — disabled threshold skipped
},
{
name: "both zero triggers auto-fix (clear >= trigger)",
name: "both zero triggers skipped (disabled)",
threshold: &HysteresisThreshold{Trigger: 0, Clear: 0},
metricName: "disabled",
wantTrigger: 0,
wantClear: 0, // 0 - 5 = -5, clamped to 0 (same value, but fix attempted)
expectChange: false, // Result same as input, even though fix was attempted
wantClear: 0,
expectChange: false, // No change — disabled threshold skipped
},
{
name: "large trigger with equal clear",