mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-02-18 00:17:39 +01:00
fix(alerts): guest alerts misclassified as node alerts when threshold disabled (#1145)
In single-node setups, guest alerts had Instance == Node, causing reevaluateActiveAlertsLocked to evaluate them against NodeDefaults instead of GuestDefaults. Setting guest memory threshold to 0 (disabled) wouldn't clear existing guest alerts because they were being kept alive by the still-enabled node memory threshold. - Add resourceID colon check to distinguish guest IDs (instance:node:vmid) from node IDs (instance-node) in reevaluateActiveAlertsLocked - Clear stale alerts in checkMetric when threshold is nil or disabled - Skip hysteresis validation for disabled thresholds (Trigger <= 0) - Fix frontend tooltip: "0" not "-1" disables a threshold
This commit is contained in:
@@ -2677,7 +2677,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
|
||||
<div class="text-sm text-blue-900 dark:text-blue-100">
|
||||
<span class="font-medium">Quick tips:</span> Set any threshold to{' '}
|
||||
<code class="px-1 py-0.5 bg-blue-100 dark:bg-blue-900/50 rounded text-xs font-mono">
|
||||
-1
|
||||
0
|
||||
</code>{' '}
|
||||
to disable alerts for that metric. Click on disabled thresholds showing{' '}
|
||||
<span class="italic">Off</span> to re-enable them. Resources with custom settings show a{' '}
|
||||
|
||||
@@ -1059,6 +1059,10 @@ func ensureValidHysteresis(threshold *HysteresisThreshold, metricName string) {
|
||||
if threshold == nil {
|
||||
return
|
||||
}
|
||||
// Disabled thresholds don't need hysteresis validation
|
||||
if threshold.Trigger <= 0 {
|
||||
return
|
||||
}
|
||||
if threshold.Clear >= threshold.Trigger {
|
||||
log.Warn().
|
||||
Str("metric", metricName).
|
||||
@@ -1786,7 +1790,7 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
|
||||
|
||||
// Determine the resource type from the alert's metadata or instance
|
||||
// We need to check what kind of resource this is
|
||||
if threshold == nil && (alert.Instance == "Node" || alert.Instance == alert.Node) {
|
||||
if threshold == nil && !strings.Contains(resourceID, ":") && (alert.Instance == "Node" || alert.Instance == alert.Node) {
|
||||
// This is a node alert
|
||||
// Check if all node alerts are disabled
|
||||
if m.config.DisableAllNodes {
|
||||
@@ -6006,6 +6010,8 @@ type metricOptions struct {
|
||||
|
||||
func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resourceType, metricType string, value float64, threshold *HysteresisThreshold, opts *metricOptions) {
|
||||
if threshold == nil || threshold.Trigger <= 0 {
|
||||
alertID := fmt.Sprintf("%s-%s", resourceID, metricType)
|
||||
m.clearAlert(alertID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -112,6 +112,59 @@ func TestAcknowledgePersistsThroughCheckMetric(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckMetricClearsAlertWhenThresholdDisabled(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.ClearActiveAlerts()
|
||||
m.mu.Lock()
|
||||
m.config.TimeThreshold = 0
|
||||
m.config.TimeThresholds = map[string]int{}
|
||||
m.config.SuppressionWindow = 0
|
||||
m.config.MinimumDelta = 0
|
||||
m.mu.Unlock()
|
||||
|
||||
// First, create an active alert with an enabled threshold
|
||||
threshold := &HysteresisThreshold{Trigger: 80, Clear: 70}
|
||||
m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, threshold, nil)
|
||||
|
||||
m.mu.RLock()
|
||||
_, exists := m.activeAlerts["res1-memory"]
|
||||
m.mu.RUnlock()
|
||||
if !exists {
|
||||
t.Fatalf("expected alert to be created")
|
||||
}
|
||||
|
||||
// Now call checkMetric with a disabled threshold (Trigger=0) — should clear the alert
|
||||
disabledThreshold := &HysteresisThreshold{Trigger: 0, Clear: 0}
|
||||
m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, disabledThreshold, nil)
|
||||
|
||||
m.mu.RLock()
|
||||
_, stillExists := m.activeAlerts["res1-memory"]
|
||||
m.mu.RUnlock()
|
||||
if stillExists {
|
||||
t.Errorf("expected alert to be cleared when threshold is disabled (Trigger=0)")
|
||||
}
|
||||
|
||||
// Also test with nil threshold
|
||||
// Re-create the alert
|
||||
m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, threshold, nil)
|
||||
m.mu.RLock()
|
||||
_, exists = m.activeAlerts["res1-memory"]
|
||||
m.mu.RUnlock()
|
||||
if !exists {
|
||||
t.Fatalf("expected alert to be re-created")
|
||||
}
|
||||
|
||||
// Call with nil threshold — should also clear
|
||||
m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, nil, nil)
|
||||
|
||||
m.mu.RLock()
|
||||
_, stillExists = m.activeAlerts["res1-memory"]
|
||||
m.mu.RUnlock()
|
||||
if stillExists {
|
||||
t.Errorf("expected alert to be cleared when threshold is nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckGuestSkipsAlertsWhenMetricDisabled(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
|
||||
|
||||
@@ -216,3 +216,65 @@ func TestReevaluateActiveAlertsStillAboveThreshold(t *testing.T) {
|
||||
t.Errorf("Expected alert to remain active since value (96%%) is still above new threshold (90%%)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestReevaluateActiveAlertsGuestNotMisclassifiedAsNode tests that guest alerts
|
||||
// are not misclassified as node alerts when Instance == Node (single-node setups).
|
||||
// This is the root cause of GitHub #1145.
|
||||
func TestReevaluateActiveAlertsGuestNotMisclassifiedAsNode(t *testing.T) {
|
||||
manager := NewManager()
|
||||
|
||||
manager.mu.Lock()
|
||||
manager.activeAlerts = make(map[string]*Alert)
|
||||
manager.mu.Unlock()
|
||||
|
||||
// Configure: guest memory disabled (trigger=0), node memory enabled
|
||||
config := AlertConfig{
|
||||
Enabled: true,
|
||||
GuestDefaults: ThresholdConfig{
|
||||
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
|
||||
Memory: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Disabled
|
||||
},
|
||||
NodeDefaults: ThresholdConfig{
|
||||
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
|
||||
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80}, // Enabled
|
||||
},
|
||||
StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
|
||||
Overrides: make(map[string]ThresholdConfig),
|
||||
}
|
||||
manager.UpdateConfig(config)
|
||||
|
||||
// Create a guest alert where Instance == Node (single-node setup).
|
||||
// Guest resource IDs contain ":" (format instance:node:vmid).
|
||||
alertID := "pve1:pve1:101-memory"
|
||||
alert := &Alert{
|
||||
ID: alertID,
|
||||
Type: "memory",
|
||||
Level: AlertLevelWarning,
|
||||
ResourceID: "pve1:pve1:101",
|
||||
ResourceName: "test-vm",
|
||||
Node: "pve1",
|
||||
Instance: "pve1", // Same as Node — triggers the bug
|
||||
Message: "Memory usage is 90%",
|
||||
Value: 90.0,
|
||||
Threshold: 85.0,
|
||||
StartTime: time.Now().Add(-5 * time.Minute),
|
||||
LastSeen: time.Now(),
|
||||
}
|
||||
|
||||
manager.mu.Lock()
|
||||
manager.activeAlerts[alertID] = alert
|
||||
manager.mu.Unlock()
|
||||
|
||||
// Re-apply same config to trigger reevaluation
|
||||
manager.UpdateConfig(config)
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
// The guest alert should be resolved because GuestDefaults.Memory is disabled
|
||||
manager.mu.RLock()
|
||||
_, alertStillActive := manager.activeAlerts[alertID]
|
||||
manager.mu.RUnlock()
|
||||
|
||||
if alertStillActive {
|
||||
t.Errorf("Guest alert should have been resolved when guest memory threshold is disabled, but it was misclassified as a node alert")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1619,20 +1619,20 @@ func TestEnsureValidHysteresis(t *testing.T) {
|
||||
expectChange: true,
|
||||
},
|
||||
{
|
||||
name: "zero trigger with positive clear is fixed",
|
||||
name: "zero trigger with positive clear is skipped (disabled)",
|
||||
threshold: &HysteresisThreshold{Trigger: 0, Clear: 5},
|
||||
metricName: "zero",
|
||||
wantTrigger: 0,
|
||||
wantClear: 0, // 0 - 5 = -5, clamped to 0
|
||||
expectChange: true,
|
||||
wantClear: 5, // Disabled thresholds are left as-is
|
||||
expectChange: false, // No change — disabled threshold skipped
|
||||
},
|
||||
{
|
||||
name: "both zero triggers auto-fix (clear >= trigger)",
|
||||
name: "both zero triggers skipped (disabled)",
|
||||
threshold: &HysteresisThreshold{Trigger: 0, Clear: 0},
|
||||
metricName: "disabled",
|
||||
wantTrigger: 0,
|
||||
wantClear: 0, // 0 - 5 = -5, clamped to 0 (same value, but fix attempted)
|
||||
expectChange: false, // Result same as input, even though fix was attempted
|
||||
wantClear: 0,
|
||||
expectChange: false, // No change — disabled threshold skipped
|
||||
},
|
||||
{
|
||||
name: "large trigger with equal clear",
|
||||
|
||||
Reference in New Issue
Block a user