fix(monitoring): preserve recent PVE nodes on empty polls (#1094)

(cherry picked from commit 13af83f3fc)
This commit is contained in:
rcourtman
2026-02-07 14:18:33 +00:00
parent e3d6fe0a93
commit a44031f47d
2 changed files with 158 additions and 28 deletions

View File

@@ -5968,14 +5968,35 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
m.state.SetConnectionHealth(instanceName, false)
preserved := make([]models.Node, 0, len(prevInstanceNodes))
now := time.Now()
withinGraceCount := 0
forcedOfflineCount := 0
for _, prevNode := range prevInstanceNodes {
nodeCopy := prevNode
nodeCopy.Status = "offline"
nodeCopy.ConnectionHealth = "error"
nodeCopy.Uptime = 0
nodeCopy.CPU = 0
// If a node was seen recently, keep its prior state for one grace window
// to avoid false-offline flips during transient cluster/resources gaps.
if !nodeCopy.LastSeen.IsZero() && now.Sub(nodeCopy.LastSeen) < nodeOfflineGracePeriod {
withinGraceCount++
if nodeCopy.ConnectionHealth == "" || nodeCopy.ConnectionHealth == "healthy" {
nodeCopy.ConnectionHealth = "degraded"
}
} else {
forcedOfflineCount++
nodeCopy.Status = "offline"
nodeCopy.ConnectionHealth = "error"
nodeCopy.Uptime = 0
nodeCopy.CPU = 0
}
preserved = append(preserved, nodeCopy)
}
log.Warn().
Str("instance", instanceName).
Int("withinGrace", withinGraceCount).
Int("forcedOffline", forcedOfflineCount).
Dur("gracePeriod", nodeOfflineGracePeriod).
Msg("Applied empty-node fallback state handling")
modelNodes = preserved
}

View File

@@ -138,6 +138,33 @@ func (s *stubPVEClient) GetNodePendingUpdates(ctx context.Context, node string)
func floatPtr(v float64) *float64 { return &v }
func newTestPVEMonitor(instanceName string) *Monitor {
return &Monitor{
config: &config.Config{
PVEInstances: []config.PVEInstance{{
Name: instanceName,
Host: "https://pve",
}},
},
state: models.NewState(),
alertManager: alerts.NewManager(),
notificationMgr: notifications.NewNotificationManager(""),
metricsHistory: NewMetricsHistory(32, time.Hour),
nodeSnapshots: make(map[string]NodeMemorySnapshot),
guestSnapshots: make(map[string]GuestMemorySnapshot),
nodeRRDMemCache: make(map[string]rrdMemCacheEntry),
lastClusterCheck: make(map[string]time.Time),
lastPhysicalDiskPoll: make(map[string]time.Time),
failureCounts: make(map[string]int),
lastOutcome: make(map[string]taskOutcome),
pollStatusMap: make(map[string]*pollStatus),
dlqInsightMap: make(map[string]*dlqInsight),
authFailures: make(map[string]int),
lastAuthAttempt: make(map[string]time.Time),
nodeLastOnline: make(map[string]time.Time),
}
}
func TestPollPVEInstanceUsesRRDMemUsedFallback(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())
@@ -173,30 +200,7 @@ func TestPollPVEInstanceUsesRRDMemUsedFallback(t *testing.T) {
},
}
mon := &Monitor{
config: &config.Config{
PVEInstances: []config.PVEInstance{{
Name: "test",
Host: "https://pve",
}},
},
state: models.NewState(),
alertManager: alerts.NewManager(),
notificationMgr: notifications.NewNotificationManager(""),
metricsHistory: NewMetricsHistory(32, time.Hour),
nodeSnapshots: make(map[string]NodeMemorySnapshot),
guestSnapshots: make(map[string]GuestMemorySnapshot),
nodeRRDMemCache: make(map[string]rrdMemCacheEntry),
lastClusterCheck: make(map[string]time.Time),
lastPhysicalDiskPoll: make(map[string]time.Time),
failureCounts: make(map[string]int),
lastOutcome: make(map[string]taskOutcome),
pollStatusMap: make(map[string]*pollStatus),
dlqInsightMap: make(map[string]*dlqInsight),
authFailures: make(map[string]int),
lastAuthAttempt: make(map[string]time.Time),
nodeLastOnline: make(map[string]time.Time),
}
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
@@ -233,3 +237,108 @@ func TestPollPVEInstanceUsesRRDMemUsedFallback(t *testing.T) {
t.Fatalf("expected snapshot RRD used %d, got %d", actualUsed, snap.Raw.RRDUsed)
}
}
func TestPollPVEInstancePreservesRecentNodesWhenGetNodesReturnsEmpty(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())
client := &stubPVEClient{
nodes: []proxmox.Node{
{
Node: "node1",
Status: "online",
CPU: 0.10,
MaxCPU: 8,
Mem: 4 * 1024 * 1024 * 1024,
MaxMem: 8 * 1024 * 1024 * 1024,
Uptime: 7200,
MaxDisk: 100 * 1024 * 1024 * 1024,
Disk: 40 * 1024 * 1024 * 1024,
},
},
nodeStatus: &proxmox.NodeStatus{
Memory: &proxmox.MemoryStatus{
Total: 8 * 1024 * 1024 * 1024,
Used: 4 * 1024 * 1024 * 1024,
Free: 4 * 1024 * 1024 * 1024,
},
},
}
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
mon.pollPVEInstance(context.Background(), "test", client)
// Simulate transient API gap: node list temporarily empty.
client.nodes = nil
mon.pollPVEInstance(context.Background(), "test", client)
snapshot := mon.state.GetSnapshot()
if len(snapshot.Nodes) != 1 {
t.Fatalf("expected one node in state, got %d", len(snapshot.Nodes))
}
node := snapshot.Nodes[0]
if node.Status == "offline" {
t.Fatalf("expected recent node to remain non-offline during grace window, got %q", node.Status)
}
}
func TestPollPVEInstanceMarksStaleNodesOfflineWhenGetNodesReturnsEmpty(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())
client := &stubPVEClient{
nodes: []proxmox.Node{
{
Node: "node1",
Status: "online",
CPU: 0.10,
MaxCPU: 8,
Mem: 4 * 1024 * 1024 * 1024,
MaxMem: 8 * 1024 * 1024 * 1024,
Uptime: 7200,
MaxDisk: 100 * 1024 * 1024 * 1024,
Disk: 40 * 1024 * 1024 * 1024,
},
},
nodeStatus: &proxmox.NodeStatus{
Memory: &proxmox.MemoryStatus{
Total: 8 * 1024 * 1024 * 1024,
Used: 4 * 1024 * 1024 * 1024,
Free: 4 * 1024 * 1024 * 1024,
},
},
}
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
mon.pollPVEInstance(context.Background(), "test", client)
first := mon.state.GetSnapshot()
if len(first.Nodes) != 1 {
t.Fatalf("expected one node after first poll, got %d", len(first.Nodes))
}
staleNode := first.Nodes[0]
staleNode.LastSeen = time.Now().Add(-nodeOfflineGracePeriod - 2*time.Second)
mon.state.UpdateNodesForInstance("test", []models.Node{staleNode})
client.nodes = nil
mon.pollPVEInstance(context.Background(), "test", client)
second := mon.state.GetSnapshot()
if len(second.Nodes) != 1 {
t.Fatalf("expected one node after fallback poll, got %d", len(second.Nodes))
}
node := second.Nodes[0]
if node.Status != "offline" {
t.Fatalf("expected stale node to be marked offline, got %q", node.Status)
}
if node.ConnectionHealth != "error" {
t.Fatalf("expected stale node connection health error, got %q", node.ConnectionHealth)
}
}