From 83e07969f07d0d17b08380dab003eba22dacbbf7 Mon Sep 17 00:00:00 2001 From: "courtmanr@gmail.com" Date: Sat, 22 Nov 2025 23:31:30 +0000 Subject: [PATCH] fix: ensure proxmox nodes are displayed even if cluster endpoints are missing Fixes #727. Previously, if temperature monitoring was enabled and a node wasn't found in ClusterEndpoints, the entire node processing was skipped. This change ensures we only skip temperature collection. --- internal/monitoring/monitor.go | 173 +++++++++++++++++---------------- 1 file changed, 88 insertions(+), 85 deletions(-) diff --git a/internal/monitoring/monitor.go b/internal/monitoring/monitor.go index cd00dfd0f..236a55f9f 100644 --- a/internal/monitoring/monitor.go +++ b/internal/monitoring/monitor.go @@ -6165,6 +6165,7 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie // Prefer the resolved host for this node, with cluster overrides when available. sshHost := modelNode.Host foundNodeEndpoint := false + shouldCollect := true if modelNode.IsClusterMember && instanceCfg.IsCluster { // For cluster members, wait until we have validated endpoints @@ -6175,107 +6176,109 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie Str("node", node.Node). Str("instance", instanceCfg.Name). Msg("Skipping temperature collection - cluster endpoints not yet validated") - continue - } - - hasFingerprint := instanceCfg.Fingerprint != "" - for _, ep := range instanceCfg.ClusterEndpoints { - if strings.EqualFold(ep.NodeName, node.Node) { - if effective := clusterEndpointEffectiveURL(ep, instanceCfg.VerifySSL, hasFingerprint); effective != "" { - sshHost = effective - foundNodeEndpoint = true + shouldCollect = false + } else { + hasFingerprint := instanceCfg.Fingerprint != "" + for _, ep := range instanceCfg.ClusterEndpoints { + if strings.EqualFold(ep.NodeName, node.Node) { + if effective := clusterEndpointEffectiveURL(ep, instanceCfg.VerifySSL, hasFingerprint); effective != "" { + sshHost = effective + foundNodeEndpoint = true + } + break } - break } - } - // If this node is a cluster member but we didn't find its specific endpoint, - // skip temperature collection to avoid using wrong endpoint - if !foundNodeEndpoint { - tempCancel() - log.Debug(). - Str("node", node.Node). - Str("instance", instanceCfg.Name). - Int("endpointCount", len(instanceCfg.ClusterEndpoints)). - Msg("Skipping temperature collection - node endpoint not found in cluster metadata") - continue + // If this node is a cluster member but we didn't find its specific endpoint, + // skip temperature collection to avoid using wrong endpoint + if !foundNodeEndpoint { + tempCancel() + log.Debug(). + Str("node", node.Node). + Str("instance", instanceCfg.Name). + Int("endpointCount", len(instanceCfg.ClusterEndpoints)). + Msg("Skipping temperature collection - node endpoint not found in cluster metadata") + shouldCollect = false + } } } - if strings.TrimSpace(sshHost) == "" { - sshHost = node.Node - } - - // Use HTTP proxy if configured for this instance, otherwise fall back to socket/SSH - temp, err := m.tempCollector.CollectTemperatureWithProxy(tempCtx, sshHost, node.Node, instanceCfg.TemperatureProxyURL, instanceCfg.TemperatureProxyToken) - tempCancel() - - if err == nil && temp != nil && temp.Available { - // Get the current CPU temperature (prefer package, fall back to max) - currentTemp := temp.CPUPackage - if currentTemp == 0 && temp.CPUMax > 0 { - currentTemp = temp.CPUMax + if shouldCollect { + if strings.TrimSpace(sshHost) == "" { + sshHost = node.Node } - // Find previous temperature data for this node to preserve min/max - var prevTemp *models.Temperature - for _, prevNode := range prevInstanceNodes { - if prevNode.ID == modelNode.ID && prevNode.Temperature != nil { - prevTemp = prevNode.Temperature - break + // Use HTTP proxy if configured for this instance, otherwise fall back to socket/SSH + temp, err := m.tempCollector.CollectTemperatureWithProxy(tempCtx, sshHost, node.Node, instanceCfg.TemperatureProxyURL, instanceCfg.TemperatureProxyToken) + tempCancel() + + if err == nil && temp != nil && temp.Available { + // Get the current CPU temperature (prefer package, fall back to max) + currentTemp := temp.CPUPackage + if currentTemp == 0 && temp.CPUMax > 0 { + currentTemp = temp.CPUMax } - } - // Initialize or update min/max tracking - if prevTemp != nil && prevTemp.CPUMin > 0 { - // Preserve existing min/max and update if necessary - temp.CPUMin = prevTemp.CPUMin - temp.CPUMaxRecord = prevTemp.CPUMaxRecord - temp.MinRecorded = prevTemp.MinRecorded - temp.MaxRecorded = prevTemp.MaxRecorded + // Find previous temperature data for this node to preserve min/max + var prevTemp *models.Temperature + for _, prevNode := range prevInstanceNodes { + if prevNode.ID == modelNode.ID && prevNode.Temperature != nil { + prevTemp = prevNode.Temperature + break + } + } - // Update min if current is lower - if currentTemp > 0 && currentTemp < temp.CPUMin { + // Initialize or update min/max tracking + if prevTemp != nil && prevTemp.CPUMin > 0 { + // Preserve existing min/max and update if necessary + temp.CPUMin = prevTemp.CPUMin + temp.CPUMaxRecord = prevTemp.CPUMaxRecord + temp.MinRecorded = prevTemp.MinRecorded + temp.MaxRecorded = prevTemp.MaxRecorded + + // Update min if current is lower + if currentTemp > 0 && currentTemp < temp.CPUMin { + temp.CPUMin = currentTemp + temp.MinRecorded = time.Now() + } + + // Update max if current is higher + if currentTemp > temp.CPUMaxRecord { + temp.CPUMaxRecord = currentTemp + temp.MaxRecorded = time.Now() + } + } else if currentTemp > 0 { + // First reading - initialize min/max to current value temp.CPUMin = currentTemp - temp.MinRecorded = time.Now() - } - - // Update max if current is higher - if currentTemp > temp.CPUMaxRecord { temp.CPUMaxRecord = currentTemp + temp.MinRecorded = time.Now() temp.MaxRecorded = time.Now() } - } else if currentTemp > 0 { - // First reading - initialize min/max to current value - temp.CPUMin = currentTemp - temp.CPUMaxRecord = currentTemp - temp.MinRecorded = time.Now() - temp.MaxRecorded = time.Now() - } - modelNode.Temperature = temp - log.Debug(). - Str("node", node.Node). - Str("sshHost", sshHost). - Float64("cpuPackage", temp.CPUPackage). - Float64("cpuMax", temp.CPUMax). - Float64("cpuMin", temp.CPUMin). - Float64("cpuMaxRecord", temp.CPUMaxRecord). - Int("nvmeCount", len(temp.NVMe)). - Msg("Collected temperature data") - } else if err != nil { - log.Debug(). - Str("node", node.Node). - Str("sshHost", sshHost). - Bool("isCluster", modelNode.IsClusterMember). - Int("endpointCount", len(instanceCfg.ClusterEndpoints)). - Msg("Temperature collection failed - check SSH access") - } else if temp != nil { - log.Debug(). - Str("node", node.Node). - Str("sshHost", sshHost). - Bool("available", temp.Available). - Msg("Temperature data unavailable after collection") + modelNode.Temperature = temp + log.Debug(). + Str("node", node.Node). + Str("sshHost", sshHost). + Float64("cpuPackage", temp.CPUPackage). + Float64("cpuMax", temp.CPUMax). + Float64("cpuMin", temp.CPUMin). + Float64("cpuMaxRecord", temp.CPUMaxRecord). + Int("nvmeCount", len(temp.NVMe)). + Msg("Collected temperature data") + } else if err != nil { + log.Debug(). + Str("node", node.Node). + Str("sshHost", sshHost). + Bool("isCluster", modelNode.IsClusterMember). + Int("endpointCount", len(instanceCfg.ClusterEndpoints)). + Msg("Temperature collection failed - check SSH access") + } else if temp != nil { + log.Debug(). + Str("node", node.Node). + Str("sshHost", sshHost). + Bool("available", temp.Available). + Msg("Temperature data unavailable after collection") + } } }