package monitoring

import (
	"context"
	"crypto/sha1"
	"encoding/hex"
	stderrors "errors"
	"fmt"
	"math"
	"math/rand"
	"net"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/ai/memory"
	"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
	"github.com/rcourtman/pulse-go-rewrite/internal/config"
	"github.com/rcourtman/pulse-go-rewrite/internal/discovery"
	"github.com/rcourtman/pulse-go-rewrite/internal/errors"
	"github.com/rcourtman/pulse-go-rewrite/internal/logging"
	"github.com/rcourtman/pulse-go-rewrite/internal/mock"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/internal/notifications"
	"github.com/rcourtman/pulse-go-rewrite/internal/resources"
	"github.com/rcourtman/pulse-go-rewrite/internal/system"
	"github.com/rcourtman/pulse-go-rewrite/internal/types"
	"github.com/rcourtman/pulse-go-rewrite/internal/websocket"
	agentsdocker "github.com/rcourtman/pulse-go-rewrite/pkg/agents/docker"
	agentshost "github.com/rcourtman/pulse-go-rewrite/pkg/agents/host"
	"github.com/rcourtman/pulse-go-rewrite/pkg/fsfilters"
	"github.com/rcourtman/pulse-go-rewrite/pkg/metrics"
	"github.com/rcourtman/pulse-go-rewrite/pkg/pbs"
	"github.com/rcourtman/pulse-go-rewrite/pkg/pmg"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

const (
	defaultTaskTimeout = 90 * time.Second
	minTaskTimeout     = 30 * time.Second
	maxTaskTimeout     = 3 * time.Minute
)

// newProxmoxClientFunc is a variable that holds the function to create a new Proxmox client.
// It is used to allow mocking the client creation in tests.
var newProxmoxClientFunc = func(cfg proxmox.ClientConfig) (PVEClientInterface, error) {
	return proxmox.NewClient(cfg)
}

// PVEClientInterface defines the interface for PVE clients (both regular and cluster)
type PVEClientInterface interface {
	GetNodes(ctx context.Context) ([]proxmox.Node, error)
	GetNodeStatus(ctx context.Context, node string) (*proxmox.NodeStatus, error)
	GetNodeRRDData(ctx context.Context, node string, timeframe string, cf string, ds []string) ([]proxmox.NodeRRDPoint, error)
	GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe string, cf string, ds []string) ([]proxmox.GuestRRDPoint, error)
	GetVMs(ctx context.Context, node string) ([]proxmox.VM, error)
	GetContainers(ctx context.Context, node string) ([]proxmox.Container, error)
	GetStorage(ctx context.Context, node string) ([]proxmox.Storage, error)
	GetAllStorage(ctx context.Context) ([]proxmox.Storage, error)
	GetBackupTasks(ctx context.Context) ([]proxmox.Task, error)
	GetReplicationStatus(ctx context.Context) ([]proxmox.ReplicationJob, error)
	GetStorageContent(ctx context.Context, node, storage string) ([]proxmox.StorageContent, error)
	GetVMSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error)
	GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error)
	GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error)
	GetContainerStatus(ctx context.Context, node string, vmid int) (*proxmox.Container, error)
	GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error)
	GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.ContainerInterface, error)
	GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error)
	IsClusterMember(ctx context.Context) (bool, error)
	GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error)
	GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error)
	GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error)
	GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error)
	GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error)
	GetZFSPoolsWithDetails(ctx context.Context, node string) ([]proxmox.ZFSPoolInfo, error)
	GetDisks(ctx context.Context, node string) ([]proxmox.Disk, error)
	GetNodePendingUpdates(ctx context.Context, node string) ([]proxmox.AptPackage, error)
	GetCephStatus(ctx context.Context) (*proxmox.CephStatus, error)
	GetCephDF(ctx context.Context) (*proxmox.CephDF, error)
}

// ResourceStoreInterface provides methods for polling optimization and resource access.
// When an agent is monitoring a node, we can reduce API polling for that node.
type ResourceStoreInterface interface {
	// ShouldSkipAPIPolling returns true if API polling should be skipped for the hostname
	// because an agent is providing richer data.
	ShouldSkipAPIPolling(hostname string) bool
	// GetPollingRecommendations returns a map of hostname -> polling multiplier.
	// 0 = skip entirely, 0.5 = half frequency, 1 = normal
	GetPollingRecommendations() map[string]float64
	// GetAll returns all resources in the store (for WebSocket broadcasts)
	GetAll() []resources.Resource
	// PopulateFromSnapshot updates the store with data from a StateSnapshot
	PopulateFromSnapshot(snapshot models.StateSnapshot)
}
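
// effectivePollInterval is an illustrative sketch (not part of the original
// source; name and signature are assumptions) showing how a poller could apply
// the multipliers returned by GetPollingRecommendations: 0 skips API polling
// entirely, 0.5 halves the frequency (doubling the interval), and 1 keeps the
// base interval unchanged.
func effectivePollInterval(store ResourceStoreInterface, hostname string, base time.Duration) (time.Duration, bool) {
	if store == nil {
		return base, true
	}
	multiplier, ok := store.GetPollingRecommendations()[hostname]
	if !ok {
		return base, true // no recommendation: poll at the normal interval
	}
	if multiplier <= 0 {
		return 0, false // an agent covers this host: skip API polling
	}
	return time.Duration(float64(base) / multiplier), true
}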

func getNodeDisplayName(instance *config.PVEInstance, nodeName string) string {
	baseName := strings.TrimSpace(nodeName)
	if baseName == "" {
		baseName = "unknown-node"
	}

	if instance == nil {
		return baseName
	}

	friendly := strings.TrimSpace(instance.Name)

	if instance.IsCluster {
		if endpointLabel := lookupClusterEndpointLabel(instance, nodeName); endpointLabel != "" {
			return endpointLabel
		}

		if baseName != "" && baseName != "unknown-node" {
			return baseName
		}

		if friendly != "" {
			return friendly
		}

		return baseName
	}

	if friendly != "" {
		return friendly
	}

	if baseName != "" && baseName != "unknown-node" {
		return baseName
	}

	if label := normalizeEndpointHost(instance.Host); label != "" && !isLikelyIPAddress(label) {
		return label
	}

	return baseName
}

func (m *Monitor) getInstanceConfig(instanceName string) *config.PVEInstance {
	if m == nil || m.config == nil {
		return nil
	}
	for i := range m.config.PVEInstances {
		if strings.EqualFold(m.config.PVEInstances[i].Name, instanceName) {
			return &m.config.PVEInstances[i]
		}
	}
	return nil
}

func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) []models.PhysicalDisk {
	if len(disks) == 0 || len(nodes) == 0 {
		return disks
	}

	// Build temperature maps by node for both SMART and legacy NVMe data
	smartTempsByNode := make(map[string][]models.DiskTemp)
	nvmeTempsByNode := make(map[string][]models.NVMeTemp)

	for _, node := range nodes {
		log.Debug().
			Str("nodeName", node.Name).
			Bool("hasTemp", node.Temperature != nil).
			Bool("tempAvailable", node.Temperature != nil && node.Temperature.Available).
			Int("smartCount", func() int {
				if node.Temperature != nil {
					return len(node.Temperature.SMART)
				}
				return 0
			}()).
			Msg("mergeNVMeTempsIntoDisks: checking node temperature")

		if node.Temperature == nil || !node.Temperature.Available {
			continue
		}

		// Collect SMART temps (preferred source)
		if len(node.Temperature.SMART) > 0 {
			temps := make([]models.DiskTemp, len(node.Temperature.SMART))
			copy(temps, node.Temperature.SMART)
			smartTempsByNode[node.Name] = temps
			log.Debug().
				Str("nodeName", node.Name).
				Int("smartTempCount", len(temps)).
				Msg("mergeNVMeTempsIntoDisks: collected SMART temps for node")
		}

		// Collect legacy NVMe temps as fallback
		if len(node.Temperature.NVMe) > 0 {
			temps := make([]models.NVMeTemp, len(node.Temperature.NVMe))
			copy(temps, node.Temperature.NVMe)
			sort.Slice(temps, func(i, j int) bool {
				return temps[i].Device < temps[j].Device
			})
			nvmeTempsByNode[node.Name] = temps
		}
	}

	if len(smartTempsByNode) == 0 && len(nvmeTempsByNode) == 0 {
		log.Debug().
			Int("diskCount", len(disks)).
			Msg("mergeNVMeTempsIntoDisks: no SMART or NVMe temperature data available")
		return disks
	}

	log.Debug().
		Int("smartNodeCount", len(smartTempsByNode)).
		Int("nvmeNodeCount", len(nvmeTempsByNode)).
		Int("diskCount", len(disks)).
		Msg("mergeNVMeTempsIntoDisks: starting disk temperature merge")

	updated := make([]models.PhysicalDisk, len(disks))
	copy(updated, disks)

	// Process SMART temperatures first (preferred method)
	for i := range updated {
		smartTemps, ok := smartTempsByNode[updated[i].Node]
		log.Debug().
			Str("diskDevPath", updated[i].DevPath).
			Str("diskNode", updated[i].Node).
			Bool("hasSMARTData", ok).
			Int("smartTempCount", len(smartTemps)).
			Msg("mergeNVMeTempsIntoDisks: checking disk for SMART temp match")
		if !ok || len(smartTemps) == 0 {
			continue
		}

		// Try to match by WWN (most reliable)
		if updated[i].WWN != "" {
			for _, temp := range smartTemps {
				if temp.WWN != "" && strings.EqualFold(temp.WWN, updated[i].WWN) {
					if temp.Temperature > 0 && !temp.StandbySkipped {
						updated[i].Temperature = temp.Temperature
						log.Debug().
							Str("disk", updated[i].DevPath).
							Str("wwn", updated[i].WWN).
							Int("temp", temp.Temperature).
							Msg("Matched SMART temperature by WWN")
					}
					continue
				}
			}
		}

		// Fall back to serial number match (case-insensitive)
		if updated[i].Serial != "" && updated[i].Temperature == 0 {
			for _, temp := range smartTemps {
				if temp.Serial != "" && strings.EqualFold(temp.Serial, updated[i].Serial) {
					if temp.Temperature > 0 && !temp.StandbySkipped {
						updated[i].Temperature = temp.Temperature
						log.Debug().
							Str("disk", updated[i].DevPath).
							Str("serial", updated[i].Serial).
							Int("temp", temp.Temperature).
							Msg("Matched SMART temperature by serial")
					}
					continue
				}
			}
		}

		// Last resort: match by device path (normalized)
		if updated[i].Temperature == 0 {
			normalizedDevPath := strings.TrimPrefix(updated[i].DevPath, "/dev/")
			for _, temp := range smartTemps {
				normalizedTempDev := strings.TrimPrefix(temp.Device, "/dev/")
				if normalizedTempDev == normalizedDevPath {
					if temp.Temperature > 0 && !temp.StandbySkipped {
						updated[i].Temperature = temp.Temperature
						log.Debug().
							Str("disk", updated[i].DevPath).
							Int("temp", temp.Temperature).
							Msg("Matched SMART temperature by device path")
					}
					break
				}
			}
		}
	}

	// Process legacy NVMe temperatures for disks that didn't get SMART data
	disksByNode := make(map[string][]int)
	for i := range updated {
		if strings.EqualFold(updated[i].Type, "nvme") && updated[i].Temperature == 0 {
			disksByNode[updated[i].Node] = append(disksByNode[updated[i].Node], i)
		}
	}

	for nodeName, diskIndexes := range disksByNode {
		temps, ok := nvmeTempsByNode[nodeName]
		if !ok || len(temps) == 0 {
			continue
		}

		sort.Slice(diskIndexes, func(i, j int) bool {
			return updated[diskIndexes[i]].DevPath < updated[diskIndexes[j]].DevPath
		})

		for idx, diskIdx := range diskIndexes {
			if idx >= len(temps) {
				break
			}

			tempVal := temps[idx].Temp
			if tempVal <= 0 || math.IsNaN(tempVal) {
				continue
			}

			updated[diskIdx].Temperature = int(math.Round(tempVal))
			log.Debug().
				Str("disk", updated[diskIdx].DevPath).
				Int("temp", updated[diskIdx].Temperature).
				Msg("Matched legacy NVMe temperature by index")
		}
	}

	return updated
}
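
// Illustrative example (assumed values, not from the source) of the legacy NVMe
// fallback above: if a node reports NVMe temps for /dev/nvme0 (42C) and
// /dev/nvme1 (55C), both lists are sorted by device path and paired by position,
// so the NVMe disk with the lexicographically smaller DevPath receives 42 and the
// next one receives 55. SMART matches by WWN, serial, or path always take precedence.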

// mergeHostAgentSMARTIntoDisks merges SMART temperature data from linked host agents
// into physical disks for Proxmox nodes. This allows disk temps collected by the
// pulse-agent running on a PVE node to populate the Physical Disks view.
func mergeHostAgentSMARTIntoDisks(disks []models.PhysicalDisk, nodes []models.Node, hosts []models.Host) []models.PhysicalDisk {
	if len(disks) == 0 || len(nodes) == 0 || len(hosts) == 0 {
		return disks
	}

	// Build a map of host ID to host for quick lookup
	hostByID := make(map[string]*models.Host, len(hosts))
	for i := range hosts {
		hostByID[hosts[i].ID] = &hosts[i]
	}

	// Build a map of node name to linked host's SMART data
	smartByNodeName := make(map[string][]models.HostDiskSMART)
	for _, node := range nodes {
		if node.LinkedHostAgentID == "" {
			continue
		}
		host, ok := hostByID[node.LinkedHostAgentID]
		if !ok || len(host.Sensors.SMART) == 0 {
			continue
		}
		smartByNodeName[node.Name] = host.Sensors.SMART
		log.Debug().
			Str("nodeName", node.Name).
			Str("hostAgentID", node.LinkedHostAgentID).
			Int("smartDiskCount", len(host.Sensors.SMART)).
			Msg("mergeHostAgentSMARTIntoDisks: found linked host agent with SMART data")
	}

	if len(smartByNodeName) == 0 {
		return disks
	}

	updated := make([]models.PhysicalDisk, len(disks))
	copy(updated, disks)

	for i := range updated {
		smartData, ok := smartByNodeName[updated[i].Node]
		if !ok || len(smartData) == 0 {
			continue
		}

		// Find matching SMART entry by WWN, serial, or device path
		var matched *models.HostDiskSMART

		// Try to match by WWN (most reliable)
		if updated[i].WWN != "" {
			for j := range smartData {
				if smartData[j].WWN != "" && strings.EqualFold(smartData[j].WWN, updated[i].WWN) {
					matched = &smartData[j]
					break
				}
			}
		}

		// Fall back to serial number match
		if matched == nil && updated[i].Serial != "" {
			for j := range smartData {
				if smartData[j].Serial != "" && strings.EqualFold(smartData[j].Serial, updated[i].Serial) {
					matched = &smartData[j]
					break
				}
			}
		}

		// Last resort: match by device path
		if matched == nil {
			normalizedDevPath := strings.TrimPrefix(updated[i].DevPath, "/dev/")
			for j := range smartData {
				normalizedDiskDev := strings.TrimPrefix(smartData[j].Device, "/dev/")
				if normalizedDiskDev == normalizedDevPath {
					matched = &smartData[j]
					break
				}
			}
		}

		if matched == nil || matched.Standby {
			continue
		}

		// Merge temperature if not already set
		if updated[i].Temperature == 0 && matched.Temperature > 0 {
			updated[i].Temperature = matched.Temperature
			log.Debug().
				Str("device", updated[i].DevPath).
				Int("temp", matched.Temperature).
				Msg("Matched host agent SMART temperature")
		}

		// Always merge SMART attributes from host agent
		if matched.Attributes != nil {
			updated[i].SmartAttributes = matched.Attributes
		}
	}

	return updated
}

// writeSMARTMetrics writes SMART attribute metrics to the persistent metrics store for a single disk.
func (m *Monitor) writeSMARTMetrics(disk models.PhysicalDisk, now time.Time) {
	// Determine resource ID: serial (preferred) → WWN → composite fallback
	resourceID := disk.Serial
	if resourceID == "" {
		resourceID = disk.WWN
	}
	if resourceID == "" {
		resourceID = fmt.Sprintf("%s-%s-%s", disk.Instance, disk.Node, strings.ReplaceAll(disk.DevPath, "/", "-"))
	}

	// Temperature (always write if > 0)
	if disk.Temperature > 0 {
		m.metricsStore.Write("disk", resourceID, "smart_temp", float64(disk.Temperature), now)
	}

	attrs := disk.SmartAttributes
	if attrs == nil {
		return
	}

	// Common
	if attrs.PowerOnHours != nil {
		m.metricsStore.Write("disk", resourceID, "smart_power_on_hours", float64(*attrs.PowerOnHours), now)
	}
	if attrs.PowerCycles != nil {
		m.metricsStore.Write("disk", resourceID, "smart_power_cycles", float64(*attrs.PowerCycles), now)
	}

	// SATA-specific
	if attrs.ReallocatedSectors != nil {
		m.metricsStore.Write("disk", resourceID, "smart_reallocated_sectors", float64(*attrs.ReallocatedSectors), now)
	}
	if attrs.PendingSectors != nil {
		m.metricsStore.Write("disk", resourceID, "smart_pending_sectors", float64(*attrs.PendingSectors), now)
	}
	if attrs.OfflineUncorrectable != nil {
		m.metricsStore.Write("disk", resourceID, "smart_offline_uncorrectable", float64(*attrs.OfflineUncorrectable), now)
	}
	if attrs.UDMACRCErrors != nil {
		m.metricsStore.Write("disk", resourceID, "smart_crc_errors", float64(*attrs.UDMACRCErrors), now)
	}

	// NVMe-specific
	if attrs.PercentageUsed != nil {
		m.metricsStore.Write("disk", resourceID, "smart_percentage_used", float64(*attrs.PercentageUsed), now)
	}
	if attrs.AvailableSpare != nil {
		m.metricsStore.Write("disk", resourceID, "smart_available_spare", float64(*attrs.AvailableSpare), now)
	}
	if attrs.MediaErrors != nil {
		m.metricsStore.Write("disk", resourceID, "smart_media_errors", float64(*attrs.MediaErrors), now)
	}
	if attrs.UnsafeShutdowns != nil {
		m.metricsStore.Write("disk", resourceID, "smart_unsafe_shutdowns", float64(*attrs.UnsafeShutdowns), now)
	}
}
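
// Illustrative resource IDs for the metric writes above (assumed values, not from
// the source): a disk with serial "S1234" is keyed as "S1234"; with no serial but
// a WWN, it is keyed by that WWN; with neither, the composite fallback for
// instance "pve1", node "node1", DevPath "/dev/sda" becomes "pve1-node1--dev-sda"
// (slashes replaced with dashes).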

func lookupClusterEndpointLabel(instance *config.PVEInstance, nodeName string) string {
	if instance == nil {
		return ""
	}

	for _, endpoint := range instance.ClusterEndpoints {
		if !strings.EqualFold(endpoint.NodeName, nodeName) {
			continue
		}

		if host := strings.TrimSpace(endpoint.Host); host != "" {
			if label := normalizeEndpointHost(host); label != "" && !isLikelyIPAddress(label) {
				return label
			}
		}

		if nodeNameLabel := strings.TrimSpace(endpoint.NodeName); nodeNameLabel != "" {
			return nodeNameLabel
		}

		if ip := strings.TrimSpace(endpoint.IP); ip != "" {
			return ip
		}
	}

	return ""
}

func normalizeEndpointHost(raw string) string {
	value := strings.TrimSpace(raw)
	if value == "" {
		return ""
	}

	if parsed, err := url.Parse(value); err == nil && parsed.Host != "" {
		host := parsed.Hostname()
		if host != "" {
			return host
		}
		return parsed.Host
	}

	value = strings.TrimPrefix(value, "https://")
	value = strings.TrimPrefix(value, "http://")
	value = strings.TrimSpace(value)
	if value == "" {
		return ""
	}

	if idx := strings.Index(value, "/"); idx >= 0 {
		value = strings.TrimSpace(value[:idx])
	}

	if idx := strings.Index(value, ":"); idx >= 0 {
		value = strings.TrimSpace(value[:idx])
	}

	return value
}

func isLikelyIPAddress(value string) bool {
	if value == "" {
		return false
	}

	if ip := net.ParseIP(value); ip != nil {
		return true
	}

	// Handle IPv6 with zone identifier (fe80::1%eth0)
	if i := strings.Index(value, "%"); i > 0 {
		if ip := net.ParseIP(value[:i]); ip != nil {
			return true
		}
	}

	return false
}

func ensureClusterEndpointURL(raw string) string {
	value := strings.TrimSpace(raw)
	if value == "" {
		return ""
	}

	lower := strings.ToLower(value)
	if strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://") {
		return value
	}

	if _, _, err := net.SplitHostPort(value); err == nil {
		return "https://" + value
	}

	return "https://" + net.JoinHostPort(value, "8006")
}
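
// Illustrative inputs and outputs for the endpoint helpers above (hypothetical
// hosts, not from the source):
//
//	normalizeEndpointHost("https://pve1.example.com:8006/api2/json") // "pve1.example.com"
//	isLikelyIPAddress("fe80::1%eth0")                                 // true
//	ensureClusterEndpointURL("pve1.example.com")                      // "https://pve1.example.com:8006"
//	ensureClusterEndpointURL("10.0.0.5:8007")                         // "https://10.0.0.5:8007"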

func clusterEndpointEffectiveURL(endpoint config.ClusterEndpoint, verifySSL bool, hasFingerprint bool) string {
	// When TLS hostname verification is required (VerifySSL=true and no fingerprint),
	// prefer hostname over IP to ensure certificate CN/SAN validation works correctly.
	// When TLS is not verified (VerifySSL=false) or a fingerprint is provided (which
	// bypasses hostname checks), prefer IP to reduce DNS lookups (refs #620).
	requiresHostnameForTLS := verifySSL && !hasFingerprint

	// Use EffectiveIP() which prefers user-specified IPOverride over auto-discovered IP
	effectiveIP := endpoint.EffectiveIP()

	if requiresHostnameForTLS {
		// Prefer hostname for proper TLS certificate validation
		if endpoint.Host != "" {
			return ensureClusterEndpointURL(endpoint.Host)
		}
		if effectiveIP != "" {
			return ensureClusterEndpointURL(effectiveIP)
		}
	} else {
		// Prefer IP address to avoid excessive DNS lookups
		if effectiveIP != "" {
			return ensureClusterEndpointURL(effectiveIP)
		}
		if endpoint.Host != "" {
			return ensureClusterEndpointURL(endpoint.Host)
		}
	}
	return ""
}
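
// Illustrative behaviour of clusterEndpointEffectiveURL (hypothetical endpoint,
// assuming EffectiveIP() resolves to the endpoint's discovered IP):
//
//	ep := config.ClusterEndpoint{NodeName: "pve2", Host: "pve2.example.com", IP: "10.0.0.12"}
//	clusterEndpointEffectiveURL(ep, true, false)  // verify TLS, no fingerprint -> "https://pve2.example.com:8006"
//	clusterEndpointEffectiveURL(ep, false, false) // TLS not verified           -> "https://10.0.0.12:8006"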

// PollExecutor defines the contract for executing polling tasks.
type PollExecutor interface {
	Execute(ctx context.Context, task PollTask)
}

type realExecutor struct {
	monitor *Monitor
}

func newRealExecutor(m *Monitor) PollExecutor {
	return &realExecutor{monitor: m}
}

func (r *realExecutor) Execute(ctx context.Context, task PollTask) {
	if r == nil || r.monitor == nil {
		return
	}

	switch strings.ToLower(task.InstanceType) {
	case "pve":
		if task.PVEClient == nil {
			log.Warn().
				Str("instance", task.InstanceName).
				Msg("PollExecutor received nil PVE client")
			return
		}
		r.monitor.pollPVEInstance(ctx, task.InstanceName, task.PVEClient)
	case "pbs":
		if task.PBSClient == nil {
			log.Warn().
				Str("instance", task.InstanceName).
				Msg("PollExecutor received nil PBS client")
			return
		}
		r.monitor.pollPBSInstance(ctx, task.InstanceName, task.PBSClient)
	case "pmg":
		if task.PMGClient == nil {
			log.Warn().
				Str("instance", task.InstanceName).
				Msg("PollExecutor received nil PMG client")
			return
		}
		r.monitor.pollPMGInstance(ctx, task.InstanceName, task.PMGClient)
	default:
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().
				Str("instance", task.InstanceName).
				Str("type", task.InstanceType).
				Msg("PollExecutor received unsupported task type")
		}
	}
}

type instanceInfo struct {
	Key         string
	Type        InstanceType
	DisplayName string
	Connection  string
	Metadata    map[string]string
}

type pollStatus struct {
	LastSuccess         time.Time
	LastErrorAt         time.Time
	LastErrorMessage    string
	LastErrorCategory   string
	ConsecutiveFailures int
	FirstFailureAt      time.Time
}

type dlqInsight struct {
	Reason       string
	FirstAttempt time.Time
	LastAttempt  time.Time
	RetryCount   int
	NextRetry    time.Time
}

type ErrorDetail struct {
	At       time.Time `json:"at"`
	Message  string    `json:"message"`
	Category string    `json:"category"`
}

type InstancePollStatus struct {
	LastSuccess         *time.Time   `json:"lastSuccess,omitempty"`
	LastError           *ErrorDetail `json:"lastError,omitempty"`
	ConsecutiveFailures int          `json:"consecutiveFailures"`
	FirstFailureAt      *time.Time   `json:"firstFailureAt,omitempty"`
}

type InstanceBreaker struct {
	State          string     `json:"state"`
	Since          *time.Time `json:"since,omitempty"`
	LastTransition *time.Time `json:"lastTransition,omitempty"`
	RetryAt        *time.Time `json:"retryAt,omitempty"`
	FailureCount   int        `json:"failureCount"`
}

type InstanceDLQ struct {
	Present      bool       `json:"present"`
	Reason       string     `json:"reason,omitempty"`
	FirstAttempt *time.Time `json:"firstAttempt,omitempty"`
	LastAttempt  *time.Time `json:"lastAttempt,omitempty"`
	RetryCount   int        `json:"retryCount,omitempty"`
	NextRetry    *time.Time `json:"nextRetry,omitempty"`
}

type InstanceHealth struct {
	Key         string             `json:"key"`
	Type        string             `json:"type"`
	DisplayName string             `json:"displayName"`
	Instance    string             `json:"instance"`
	Connection  string             `json:"connection"`
	PollStatus  InstancePollStatus `json:"pollStatus"`
	Breaker     InstanceBreaker    `json:"breaker"`
	DeadLetter  InstanceDLQ        `json:"deadLetter"`
	Warnings    []string           `json:"warnings,omitempty"`
}

func schedulerKey(instanceType InstanceType, name string) string {
	return string(instanceType) + "::" + name
}

func timePtr(t time.Time) *time.Time {
	if t.IsZero() {
		return nil
	}
	copy := t
	return &copy
}

// Monitor handles all monitoring operations
type Monitor struct {
	config *config.Config
	state *models.State
	orgID string // Organization ID for tenant isolation (empty = default/legacy)
	pveClients map[string]PVEClientInterface
	pbsClients map[string]*pbs.Client
	pmgClients map[string]*pmg.Client
	pollMetrics *PollMetrics
	scheduler *AdaptiveScheduler
	stalenessTracker *StalenessTracker
	taskQueue *TaskQueue
	pollTimeout time.Duration
	circuitBreakers map[string]*circuitBreaker
	deadLetterQueue *TaskQueue
	failureCounts map[string]int
	lastOutcome map[string]taskOutcome
	backoffCfg backoffConfig
	rng *rand.Rand
	maxRetryAttempts int
	tempCollector *TemperatureCollector // SSH-based temperature collector
	guestMetadataStore *config.GuestMetadataStore
	dockerMetadataStore *config.DockerMetadataStore
	hostMetadataStore *config.HostMetadataStore
	mu sync.RWMutex
	startTime time.Time
	rateTracker *RateTracker
	metricsHistory *MetricsHistory
	metricsStore *metrics.Store // Persistent SQLite metrics storage
	alertManager *alerts.Manager
	alertResolvedAICallback func(*alerts.Alert)
	alertTriggeredAICallback func(*alerts.Alert)
	incidentStore *memory.IncidentStore
	notificationMgr *notifications.NotificationManager
	configPersist *config.ConfigPersistence
	discoveryService *discovery.Service // Background discovery service
	activePollCount int32 // Number of active polling operations
	pollCounter int64 // Counter for polling cycles
	authFailures map[string]int // Track consecutive auth failures per node
	lastAuthAttempt map[string]time.Time // Track last auth attempt time
	lastClusterCheck map[string]time.Time // Track last cluster check for standalone nodes
	lastPhysicalDiskPoll map[string]time.Time // Track last physical disk poll time per instance
	lastPVEBackupPoll map[string]time.Time // Track last PVE backup poll per instance
	lastPBSBackupPoll map[string]time.Time // Track last PBS backup poll per instance
	backupPermissionWarnings map[string]string // Track backup permission issues per instance (instance -> warning message)
	persistence *config.ConfigPersistence // Persistence for saving updated configs
	pbsBackupPollers map[string]bool // Track PBS backup polling goroutines per instance
	pbsBackupCacheTime map[string]map[pbsBackupGroupKey]time.Time // Track when each PBS backup group was last fetched
	runtimeCtx context.Context // Context used while monitor is running
	wsHub *websocket.Hub // Hub used for broadcasting state
	diagMu sync.RWMutex // Protects diagnostic snapshot maps
	nodeSnapshots map[string]NodeMemorySnapshot
	guestSnapshots map[string]GuestMemorySnapshot
	rrdCacheMu sync.RWMutex // Protects RRD memavailable cache
	nodeRRDMemCache map[string]rrdMemCacheEntry
	removedDockerHosts map[string]time.Time // Track deliberately removed Docker hosts (ID -> removal time)
	dockerTokenBindings map[string]string // Track token ID -> agent ID bindings to enforce uniqueness
	removedKubernetesClusters map[string]time.Time // Track deliberately removed Kubernetes clusters (ID -> removal time)
	kubernetesTokenBindings map[string]string // Track token ID -> agent ID bindings to enforce uniqueness
	hostTokenBindings map[string]string // Track tokenID:hostname -> host identity bindings
	dockerCommands map[string]*dockerHostCommand
	dockerCommandIndex map[string]string
	guestMetadataMu sync.RWMutex
	guestMetadataCache map[string]guestMetadataCacheEntry
	guestMetadataLimiterMu sync.Mutex
	guestMetadataLimiter map[string]time.Time
	guestMetadataSlots chan struct{}
	guestMetadataMinRefresh time.Duration
	guestMetadataRefreshJitter time.Duration
	guestMetadataRetryBackoff time.Duration
	guestMetadataHoldDuration time.Duration
	// Configurable guest agent timeouts (refs #592)
	guestAgentFSInfoTimeout time.Duration
	guestAgentNetworkTimeout time.Duration
	guestAgentOSInfoTimeout time.Duration
	guestAgentVersionTimeout time.Duration
	guestAgentRetries int
	executor PollExecutor
	breakerBaseRetry time.Duration
	breakerMaxDelay time.Duration
	breakerHalfOpenWindow time.Duration
	instanceInfoCache map[string]*instanceInfo
	pollStatusMap map[string]*pollStatus
	dlqInsightMap map[string]*dlqInsight
	nodeLastOnline map[string]time.Time // Track last time each node was seen online (for grace period)
	nodePendingUpdatesCache map[string]pendingUpdatesCache // Cache pending updates per node (checked every 30 min)
	resourceStore ResourceStoreInterface // Optional unified resource store for polling optimization
	mockMetricsCancel context.CancelFunc
	mockMetricsWg sync.WaitGroup
	dockerChecker DockerChecker // Optional Docker checker for LXC containers
	// Agent profile cache to avoid disk I/O on every report (refs #1094)
	agentProfileCacheMu sync.RWMutex
	agentProfileCache *agentProfileCacheEntry
}

type rrdMemCacheEntry struct {
	available uint64
	used      uint64
	total     uint64
	fetchedAt time.Time
}

// pendingUpdatesCache caches apt pending updates count per node
type pendingUpdatesCache struct {
	count     int
	checkedAt time.Time
}

// TTL for pending updates cache (30 minutes - balance between freshness and API load)
const pendingUpdatesCacheTTL = 30 * time.Minute

// agentProfileCacheEntry caches agent profiles and assignments to avoid disk I/O on every agent report.
// TTL is 60 seconds to balance freshness with performance.
type agentProfileCacheEntry struct {
	profiles    []models.AgentProfile
	assignments []models.AgentProfileAssignment
	loadedAt    time.Time
}

const agentProfileCacheTTL = 60 * time.Second

// safePercentage calculates percentage safely, returning 0 if divisor is 0
func safePercentage(used, total float64) float64 {
	if total == 0 {
		return 0
	}
	result := used / total * 100
	if math.IsNaN(result) || math.IsInf(result, 0) {
		return 0
	}
	return result
}

// safeFloat ensures a float value is not NaN or Inf
func safeFloat(val float64) float64 {
	if math.IsNaN(val) || math.IsInf(val, 0) {
		return 0
	}
	return val
}

// makeGuestID generates a stable, canonical guest ID that includes instance, node, and VMID.
// Format: {instance}:{node}:{vmid} (e.g., "delly:minipc:201")
//
// Using colons as separators prevents ambiguity with dashes in instance/node names.
// This format ensures:
//   - Unique IDs across all deployment scenarios (single agent, per-node agents, mixed)
//   - Stable IDs that don't change when monitoring topology changes
//   - Easy parsing to extract instance, node, and VMID components
//
// For clustered setups, the instance name is the cluster name.
// For standalone nodes, the instance name matches the node name.
func makeGuestID(instanceName string, node string, vmid int) string {
	return fmt.Sprintf("%s:%s:%d", instanceName, node, vmid)
}
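
// splitGuestIDExample is an illustrative sketch (not part of the original source;
// the name is an assumption) showing how a canonical ID from makeGuestID can be
// split back into its instance, node, and VMID components.
func splitGuestIDExample(id string) (instance, node string, vmid int, err error) {
	parts := strings.SplitN(id, ":", 3)
	if len(parts) != 3 {
		return "", "", 0, fmt.Errorf("malformed guest id %q", id)
	}
	vmid, err = strconv.Atoi(parts[2])
	if err != nil {
		return "", "", 0, fmt.Errorf("malformed vmid in guest id %q: %w", id, err)
	}
	return parts[0], parts[1], vmid, nil
}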

// parseDurationEnv parses a duration from an environment variable, returning defaultVal if not set or invalid
func parseDurationEnv(key string, defaultVal time.Duration) time.Duration {
	val := os.Getenv(key)
	if val == "" {
		return defaultVal
	}
	parsed, err := time.ParseDuration(val)
	if err != nil {
		log.Warn().
			Str("key", key).
			Str("value", val).
			Err(err).
			Dur("default", defaultVal).
			Msg("Failed to parse duration from environment variable, using default")
		return defaultVal
	}
	return parsed
}

// parseIntEnv parses an integer from an environment variable, returning defaultVal if not set or invalid
func parseIntEnv(key string, defaultVal int) int {
	val := os.Getenv(key)
	if val == "" {
		return defaultVal
	}
	parsed, err := strconv.Atoi(val)
	if err != nil {
		log.Warn().
			Str("key", key).
			Str("value", val).
			Err(err).
			Int("default", defaultVal).
			Msg("Failed to parse integer from environment variable, using default")
		return defaultVal
	}
	return parsed
}
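
// Illustrative usage of the env helpers above (hypothetical variable names, not
// documented Pulse settings):
//
//	timeout := parseDurationEnv("PULSE_EXAMPLE_TIMEOUT", 30*time.Second) // "45s" -> 45s; unset or invalid -> 30s
//	retries := parseIntEnv("PULSE_EXAMPLE_RETRIES", 3)                   // "5" -> 5; unset or invalid -> 3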

func clampUint64ToInt64(val uint64) int64 {
	if val > math.MaxInt64 {
		return math.MaxInt64
	}
	return int64(val)
}

func cloneStringFloatMap(src map[string]float64) map[string]float64 {
	if len(src) == 0 {
		return nil
	}
	out := make(map[string]float64, len(src))
	for k, v := range src {
		out[k] = v
	}
	return out
}

func cloneStringMap(src map[string]string) map[string]string {
	if len(src) == 0 {
		return nil
	}
	out := make(map[string]string, len(src))
	for k, v := range src {
		out[k] = v
	}
	return out
}

func convertDockerServices(services []agentsdocker.Service) []models.DockerService {
	if len(services) == 0 {
		return nil
	}

	result := make([]models.DockerService, 0, len(services))
	for _, svc := range services {
		service := models.DockerService{
			ID:             svc.ID,
			Name:           svc.Name,
			Stack:          svc.Stack,
			Image:          svc.Image,
			Mode:           svc.Mode,
			DesiredTasks:   svc.DesiredTasks,
			RunningTasks:   svc.RunningTasks,
			CompletedTasks: svc.CompletedTasks,
		}

		if len(svc.Labels) > 0 {
			service.Labels = cloneStringMap(svc.Labels)
		}

		if len(svc.EndpointPorts) > 0 {
			ports := make([]models.DockerServicePort, len(svc.EndpointPorts))
			for i, port := range svc.EndpointPorts {
				ports[i] = models.DockerServicePort{
					Name:          port.Name,
					Protocol:      port.Protocol,
					TargetPort:    port.TargetPort,
					PublishedPort: port.PublishedPort,
					PublishMode:   port.PublishMode,
				}
			}
			service.EndpointPorts = ports
		}

		if svc.UpdateStatus != nil {
			update := &models.DockerServiceUpdate{
				State:   svc.UpdateStatus.State,
				Message: svc.UpdateStatus.Message,
			}
			if svc.UpdateStatus.CompletedAt != nil && !svc.UpdateStatus.CompletedAt.IsZero() {
				completed := *svc.UpdateStatus.CompletedAt
				update.CompletedAt = &completed
			}
			service.UpdateStatus = update
		}

		if svc.CreatedAt != nil && !svc.CreatedAt.IsZero() {
			created := *svc.CreatedAt
			service.CreatedAt = &created
		}
		if svc.UpdatedAt != nil && !svc.UpdatedAt.IsZero() {
			updated := *svc.UpdatedAt
			service.UpdatedAt = &updated
		}

		result = append(result, service)
	}

	return result
}

func convertDockerTasks(tasks []agentsdocker.Task) []models.DockerTask {
	if len(tasks) == 0 {
		return nil
	}

	result := make([]models.DockerTask, 0, len(tasks))
	for _, task := range tasks {
		modelTask := models.DockerTask{
			ID:            task.ID,
			ServiceID:     task.ServiceID,
			ServiceName:   task.ServiceName,
			Slot:          task.Slot,
			NodeID:        task.NodeID,
			NodeName:      task.NodeName,
			DesiredState:  task.DesiredState,
			CurrentState:  task.CurrentState,
			Error:         task.Error,
			Message:       task.Message,
			ContainerID:   task.ContainerID,
			ContainerName: task.ContainerName,
			CreatedAt:     task.CreatedAt,
		}

		if task.UpdatedAt != nil && !task.UpdatedAt.IsZero() {
			updated := *task.UpdatedAt
			modelTask.UpdatedAt = &updated
		}
		if task.StartedAt != nil && !task.StartedAt.IsZero() {
			started := *task.StartedAt
			modelTask.StartedAt = &started
		}
		if task.CompletedAt != nil && !task.CompletedAt.IsZero() {
			completed := *task.CompletedAt
			modelTask.CompletedAt = &completed
		}

		result = append(result, modelTask)
	}

	return result
}

func normalizeAgentVersion(version string) string {
	version = strings.TrimSpace(version)
	if version == "" {
		return ""
	}
	version = strings.TrimLeft(version, "vV")
	if version == "" {
		return ""
	}
	return "v" + version
}
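
// Illustrative examples for normalizeAgentVersion (assumed inputs):
//
//	normalizeAgentVersion("1.2.3")   // "v1.2.3"
//	normalizeAgentVersion("V1.2.3")  // "v1.2.3"
//	normalizeAgentVersion("  v2.0 ") // "v2.0"
//	normalizeAgentVersion("")        // ""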

func convertDockerSwarmInfo(info *agentsdocker.SwarmInfo) *models.DockerSwarmInfo {
	if info == nil {
		return nil
	}

	return &models.DockerSwarmInfo{
		NodeID:           info.NodeID,
		NodeRole:         info.NodeRole,
		LocalState:       info.LocalState,
		ControlAvailable: info.ControlAvailable,
		ClusterID:        info.ClusterID,
		ClusterName:      info.ClusterName,
		Scope:            info.Scope,
		Error:            info.Error,
	}
}

// shouldRunBackupPoll determines whether a backup polling cycle should execute.
// Returns whether polling should run, a human-readable skip reason, and the timestamp to record.
func (m *Monitor) shouldRunBackupPoll(last time.Time, now time.Time) (bool, string, time.Time) {
	if m == nil || m.config == nil {
		return false, "configuration unavailable", last
	}

	if !m.config.EnableBackupPolling {
		return false, "backup polling globally disabled", last
	}

	interval := m.config.BackupPollingInterval
	if interval > 0 {
		if !last.IsZero() && now.Sub(last) < interval {
			next := last.Add(interval)
			return false, fmt.Sprintf("next run scheduled for %s", next.Format(time.RFC3339)), last
		}
		return true, "", now
	}

	backupCycles := m.config.BackupPollingCycles
	if backupCycles <= 0 {
		backupCycles = 10
	}

	if m.pollCounter%int64(backupCycles) == 0 || m.pollCounter == 1 {
		return true, "", now
	}

	remaining := int64(backupCycles) - (m.pollCounter % int64(backupCycles))
	return false, fmt.Sprintf("next run in %d polling cycles", remaining), last
}
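
// Illustrative schedule for the cycle-based fallback above (assumed values): with
// BackupPollingInterval unset and BackupPollingCycles = 10, backup polling runs on
// poll cycles 1, 10, 20, 30, ...; on cycle 14 the skip reason reports "next run in
// 6 polling cycles" (10 - 14%10).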

const (
	dockerConnectionPrefix     = "docker-"
	kubernetesConnectionPrefix = "kubernetes-"
	hostConnectionPrefix       = "host-"

	dockerOfflineGraceMultiplier = 4
	dockerMinimumHealthWindow    = 30 * time.Second
	dockerMaximumHealthWindow    = 10 * time.Minute

	kubernetesOfflineGraceMultiplier = 4
	kubernetesMinimumHealthWindow    = 30 * time.Second
	kubernetesMaximumHealthWindow    = 10 * time.Minute

	hostOfflineGraceMultiplier = 6
	hostMinimumHealthWindow    = 60 * time.Second
	hostMaximumHealthWindow    = 10 * time.Minute

	nodeOfflineGracePeriod = 60 * time.Second // Grace period before marking Proxmox nodes offline

	nodeRRDCacheTTL       = 30 * time.Second
	nodeRRDRequestTimeout = 2 * time.Second
)

type taskOutcome struct {
	success    bool
	transient  bool
	err        error
	recordedAt time.Time
}

func (m *Monitor) getNodeRRDMetrics(ctx context.Context, client PVEClientInterface, nodeName string) (rrdMemCacheEntry, error) {
	if client == nil || nodeName == "" {
		return rrdMemCacheEntry{}, fmt.Errorf("invalid arguments for RRD lookup")
	}

	now := time.Now()

	m.rrdCacheMu.RLock()
	if entry, ok := m.nodeRRDMemCache[nodeName]; ok && now.Sub(entry.fetchedAt) < nodeRRDCacheTTL {
		m.rrdCacheMu.RUnlock()
		return entry, nil
	}
	m.rrdCacheMu.RUnlock()

	requestCtx, cancel := context.WithTimeout(ctx, nodeRRDRequestTimeout)
	defer cancel()

	points, err := client.GetNodeRRDData(requestCtx, nodeName, "hour", "AVERAGE", []string{"memavailable", "memused", "memtotal"})
	if err != nil {
		return rrdMemCacheEntry{}, err
	}

	var memAvailable uint64
	var memUsed uint64
	var memTotal uint64

	for i := len(points) - 1; i >= 0; i-- {
		point := points[i]

		if memTotal == 0 && point.MemTotal != nil && !math.IsNaN(*point.MemTotal) && *point.MemTotal > 0 {
			memTotal = uint64(math.Round(*point.MemTotal))
		}

		if memAvailable == 0 && point.MemAvailable != nil && !math.IsNaN(*point.MemAvailable) && *point.MemAvailable > 0 {
			memAvailable = uint64(math.Round(*point.MemAvailable))
		}

		if memUsed == 0 && point.MemUsed != nil && !math.IsNaN(*point.MemUsed) && *point.MemUsed > 0 {
			memUsed = uint64(math.Round(*point.MemUsed))
		}

		if memTotal > 0 && (memAvailable > 0 || memUsed > 0) {
			break
		}
	}

	if memTotal > 0 {
		if memAvailable > memTotal {
			memAvailable = memTotal
		}
		if memUsed > memTotal {
			memUsed = memTotal
		}
	}

	if memAvailable == 0 && memUsed == 0 {
		return rrdMemCacheEntry{}, fmt.Errorf("rrd mem metrics not present")
	}

	entry := rrdMemCacheEntry{
		available: memAvailable,
		used:      memUsed,
		total:     memTotal,
		fetchedAt: now,
	}

	m.rrdCacheMu.Lock()
	m.nodeRRDMemCache[nodeName] = entry
	m.rrdCacheMu.Unlock()

	return entry, nil
}

// RemoveDockerHost removes a docker host from the shared state and clears related alerts.
func (m *Monitor) RemoveDockerHost(hostID string) (models.DockerHost, error) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.DockerHost{}, fmt.Errorf("docker host id is required")
	}

	host, removed := m.state.RemoveDockerHost(hostID)
	if !removed {
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().Str("dockerHostID", hostID).Msg("Docker host not present in state during removal; proceeding to clear alerts")
		}
		host = models.DockerHost{
			ID:          hostID,
			Hostname:    hostID,
			DisplayName: hostID,
		}
	}

	// Revoke the API token associated with this Docker host
	if host.TokenID != "" {
		tokenRemoved := m.config.RemoveAPIToken(host.TokenID)
		if tokenRemoved != nil {
			m.config.SortAPITokens()

			if m.persistence != nil {
				if err := m.persistence.SaveAPITokens(m.config.APITokens); err != nil {
					log.Warn().Err(err).Str("tokenID", host.TokenID).Msg("Failed to persist API token revocation after Docker host removal")
				} else {
					log.Info().Str("tokenID", host.TokenID).Str("tokenName", host.TokenName).Msg("API token revoked for removed Docker host")
				}
			}
		}
	}

	// Track removal to prevent resurrection from cached reports
	removedAt := time.Now()

	m.mu.Lock()
	m.removedDockerHosts[hostID] = removedAt
	// Unbind the token so it can be reused with a different agent if needed
	if host.TokenID != "" {
		delete(m.dockerTokenBindings, host.TokenID)
		log.Debug().
			Str("tokenID", host.TokenID).
			Str("dockerHostID", hostID).
			Msg("Unbound Docker agent token from removed host")
	}
	if cmd, ok := m.dockerCommands[hostID]; ok {
		delete(m.dockerCommandIndex, cmd.status.ID)
	}
	delete(m.dockerCommands, hostID)
	m.mu.Unlock()

	m.state.AddRemovedDockerHost(models.RemovedDockerHost{
		ID:          hostID,
		Hostname:    host.Hostname,
		DisplayName: host.DisplayName,
		RemovedAt:   removedAt,
	})

	m.state.RemoveConnectionHealth(dockerConnectionPrefix + hostID)
	if m.alertManager != nil {
		m.alertManager.HandleDockerHostRemoved(host)
		m.SyncAlertState()
	}

	log.Info().
		Str("dockerHost", host.Hostname).
		Str("dockerHostID", hostID).
		Bool("removed", removed).
		Msg("Docker host removed and alerts cleared")

	return host, nil
}

// RemoveHostAgent removes a host agent from monitoring state and clears related data.
func (m *Monitor) RemoveHostAgent(hostID string) (models.Host, error) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.Host{}, fmt.Errorf("host id is required")
	}

	host, removed := m.state.RemoveHost(hostID)
	if !removed {
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().Str("hostID", hostID).Msg("Host not present in state during removal")
		}
		host = models.Host{
			ID:       hostID,
			Hostname: hostID,
		}
	}

	tokenID := strings.TrimSpace(host.TokenID)
	hostname := strings.TrimSpace(host.Hostname)

	tokenStillUsed := false
	if tokenID != "" && m.state != nil {
		for _, other := range m.state.GetHosts() {
			if strings.TrimSpace(other.TokenID) == tokenID {
				tokenStillUsed = true
				break
			}
		}
		if !tokenStillUsed {
			for _, other := range m.state.GetDockerHosts() {
				if strings.TrimSpace(other.TokenID) == tokenID {
					tokenStillUsed = true
					break
				}
			}
		}
	}

	var tokenRemoved *config.APITokenRecord
	if tokenID != "" && !tokenStillUsed {
		tokenRemoved = m.config.RemoveAPIToken(tokenID)
		if tokenRemoved != nil {
			m.config.SortAPITokens()

			if m.persistence != nil {
				if err := m.persistence.SaveAPITokens(m.config.APITokens); err != nil {
					log.Warn().Err(err).Str("tokenID", tokenID).Msg("Failed to persist API token revocation after host agent removal")
				} else {
					log.Info().Str("tokenID", tokenID).Str("tokenName", host.TokenName).Msg("API token revoked for removed host agent")
				}
			}
		}
	} else if tokenID != "" && tokenStillUsed {
		log.Info().
			Str("tokenID", tokenID).
			Str("hostID", hostID).
			Msg("API token still used by other agents; skipping revocation during host removal")
	}

	if tokenID != "" {
		m.mu.Lock()
		if m.hostTokenBindings == nil {
			m.hostTokenBindings = make(map[string]string)
		}

		if _, exists := m.hostTokenBindings[tokenID]; exists {
			delete(m.hostTokenBindings, tokenID)
		}

		if hostname != "" {
			key := fmt.Sprintf("%s:%s", tokenID, hostname)
			if _, exists := m.hostTokenBindings[key]; exists {
				delete(m.hostTokenBindings, key)
			}
		}

		if tokenRemoved != nil {
			prefix := tokenID + ":"
			for key := range m.hostTokenBindings {
				if strings.HasPrefix(key, prefix) {
					delete(m.hostTokenBindings, key)
				}
			}
		}
		m.mu.Unlock()

		log.Debug().
			Str("tokenID", tokenID).
			Str("hostID", hostID).
			Bool("revoked", tokenRemoved != nil).
			Msg("Unbound host agent token bindings after host removal")
	}

	m.state.RemoveConnectionHealth(hostConnectionPrefix + hostID)

	// Clear LinkedHostAgentID from any nodes that were linked to this host agent
	unlinkedCount := m.state.UnlinkNodesFromHostAgent(hostID)
	if unlinkedCount > 0 {
		log.Info().
			Str("hostID", hostID).
			Int("unlinkedNodes", unlinkedCount).
			Msg("Cleared host agent links from PVE nodes")
	}

	log.Info().
		Str("host", host.Hostname).
		Str("hostID", hostID).
		Bool("removed", removed).
		Msg("Host agent removed from monitoring")

	if m.alertManager != nil {
		m.alertManager.HandleHostRemoved(host)
	}

	return host, nil
}

// LinkHostAgent manually links a host agent to a specific PVE node.
// This is used when auto-linking can't disambiguate (e.g., multiple nodes with hostname "pve").
// After linking, the host agent's temperature/sensor data will appear on the correct node.
func (m *Monitor) LinkHostAgent(hostID, nodeID string) error {
	hostID = strings.TrimSpace(hostID)
	nodeID = strings.TrimSpace(nodeID)
	if hostID == "" {
		return fmt.Errorf("host id is required")
	}
	if nodeID == "" {
		return fmt.Errorf("node id is required")
	}

	if err := m.state.LinkHostAgentToNode(hostID, nodeID); err != nil {
		return err
	}

	log.Info().
		Str("hostID", hostID).
		Str("nodeID", nodeID).
		Msg("Manually linked host agent to PVE node")

	return nil
}

// UnlinkHostAgent removes the link between a host agent and its PVE node.
// The agent will continue to report but will appear in the Managed Agents table
// instead of being merged with the PVE node in the Dashboard.
func (m *Monitor) UnlinkHostAgent(hostID string) error {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return fmt.Errorf("host id is required")
	}

	if !m.state.UnlinkHostAgent(hostID) {
		return fmt.Errorf("host not found or not linked to a node")
	}

	log.Info().
		Str("hostID", hostID).
		Msg("Unlinked host agent from PVE node")

	return nil
}

// HostAgentConfig represents server-side configuration for a host agent.
type HostAgentConfig struct {
	CommandsEnabled *bool                  `json:"commandsEnabled,omitempty"` // nil = use agent default
	Settings        map[string]interface{} `json:"settings,omitempty"`        // Merged profile settings
	IssuedAt        *time.Time             `json:"issuedAt,omitempty"`
	ExpiresAt       *time.Time             `json:"expiresAt,omitempty"`
	Signature       string                 `json:"signature,omitempty"`
}

// GetHostAgentConfig returns the server-side configuration for a host agent.
// The agent can poll this to apply remote config overrides.
// Uses in-memory caching to avoid disk I/O on every agent report (refs #1094).
func (m *Monitor) GetHostAgentConfig(hostID string) HostAgentConfig {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return HostAgentConfig{}
	}

	cfg := HostAgentConfig{}

	// 1. Load Host Metadata (CommandsEnabled) - this is already in-memory
	if m.hostMetadataStore != nil {
		if meta := m.hostMetadataStore.Get(hostID); meta != nil {
			cfg.CommandsEnabled = meta.CommandsEnabled
		}
	}

	// 2. Load Profile Configuration from cache
	if m.persistence != nil {
		profiles, assignments := m.getAgentProfileCache()

		var profileID string
		for _, a := range assignments {
			if a.AgentID == hostID {
				profileID = a.ProfileID
				break
			}
		}

		if profileID != "" {
			for _, p := range profiles {
				if p.ID == profileID {
					cfg.Settings = p.Config
					break
				}
			}
		}
	}

	return cfg
}

// getAgentProfileCache returns cached profiles and assignments, refreshing if stale.
func (m *Monitor) getAgentProfileCache() ([]models.AgentProfile, []models.AgentProfileAssignment) {
	now := time.Now()

	// Fast path: check if cache is valid
	m.agentProfileCacheMu.RLock()
	cache := m.agentProfileCache
	if cache != nil && now.Sub(cache.loadedAt) < agentProfileCacheTTL {
		profiles := cache.profiles
		assignments := cache.assignments
		m.agentProfileCacheMu.RUnlock()
		return profiles, assignments
	}
	m.agentProfileCacheMu.RUnlock()

	// Slow path: reload from disk
	m.agentProfileCacheMu.Lock()
	defer m.agentProfileCacheMu.Unlock()

	// Double-check after acquiring write lock
	if m.agentProfileCache != nil && now.Sub(m.agentProfileCache.loadedAt) < agentProfileCacheTTL {
		return m.agentProfileCache.profiles, m.agentProfileCache.assignments
	}

	var profiles []models.AgentProfile
	var assignments []models.AgentProfileAssignment

	if loadedAssignments, err := m.persistence.LoadAgentProfileAssignments(); err != nil {
		log.Warn().Err(err).Msg("Failed to load agent profile assignments for cache")
	} else {
		assignments = loadedAssignments
	}

	if loadedProfiles, err := m.persistence.LoadAgentProfiles(); err != nil {
		log.Warn().Err(err).Msg("Failed to load agent profiles for cache")
	} else {
		profiles = loadedProfiles
	}

	m.agentProfileCache = &agentProfileCacheEntry{
		profiles:    profiles,
		assignments: assignments,
		loadedAt:    now,
	}

	return profiles, assignments
}

// InvalidateAgentProfileCache clears the agent profile cache, forcing a reload on next access.
// Call this when profiles or assignments are modified.
func (m *Monitor) InvalidateAgentProfileCache() {
	m.agentProfileCacheMu.Lock()
	m.agentProfileCache = nil
	m.agentProfileCacheMu.Unlock()
}

// UpdateHostAgentConfig updates the server-side configuration for a host agent.
// This allows the UI to remotely enable/disable features on agents.
func (m *Monitor) UpdateHostAgentConfig(hostID string, commandsEnabled *bool) error {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return fmt.Errorf("host id is required")
	}

	if m.hostMetadataStore == nil {
		return fmt.Errorf("host metadata store not initialized")
	}

	// Get existing metadata or create new
	meta := m.hostMetadataStore.Get(hostID)
	if meta == nil {
		meta = &config.HostMetadata{ID: hostID}
	}

	meta.CommandsEnabled = commandsEnabled

	if err := m.hostMetadataStore.Set(hostID, meta); err != nil {
		return fmt.Errorf("failed to save host config: %w", err)
	}

	// Also update the Host model in state for immediate UI feedback.
	// The agent will confirm on its next report, but this provides instant feedback.
	if commandsEnabled != nil {
		m.state.SetHostCommandsEnabled(hostID, *commandsEnabled)
	}

	log.Info().
		Str("hostId", hostID).
		Interface("commandsEnabled", commandsEnabled).
		Msg("Host agent config updated")

	return nil
}

// HideDockerHost marks a docker host as hidden without removing it from state.
// Hidden hosts will not be shown in the frontend but will continue to accept updates.
func (m *Monitor) HideDockerHost(hostID string) (models.DockerHost, error) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.DockerHost{}, fmt.Errorf("docker host id is required")
	}

	host, ok := m.state.SetDockerHostHidden(hostID, true)
	if !ok {
		return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
	}

	log.Info().
		Str("dockerHost", host.Hostname).
		Str("dockerHostID", hostID).
		Msg("Docker host hidden from view")

	return host, nil
}

// UnhideDockerHost marks a docker host as visible again.
func (m *Monitor) UnhideDockerHost(hostID string) (models.DockerHost, error) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.DockerHost{}, fmt.Errorf("docker host id is required")
	}

	host, ok := m.state.SetDockerHostHidden(hostID, false)
	if !ok {
		return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
	}

	// Clear removal tracking if it was marked as removed
	m.mu.Lock()
	delete(m.removedDockerHosts, hostID)
	m.mu.Unlock()

	log.Info().
		Str("dockerHost", host.Hostname).
		Str("dockerHostID", hostID).
		Msg("Docker host unhidden")

	return host, nil
}
|
||
|
||
// MarkDockerHostPendingUninstall marks a docker host as pending uninstall.
|
||
// This is used when the user has run the uninstall command and is waiting for the host to go offline.
|
||
func (m *Monitor) MarkDockerHostPendingUninstall(hostID string) (models.DockerHost, error) {
|
||
hostID = strings.TrimSpace(hostID)
|
||
if hostID == "" {
|
||
return models.DockerHost{}, fmt.Errorf("docker host id is required")
|
||
}
|
||
|
||
host, ok := m.state.SetDockerHostPendingUninstall(hostID, true)
|
||
if !ok {
|
||
return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
|
||
}
|
||
|
||
log.Info().
|
||
Str("dockerHost", host.Hostname).
|
||
Str("dockerHostID", hostID).
|
||
Msg("Docker host marked as pending uninstall")
|
||
|
||
return host, nil
|
||
}
|
||
|
||
// SetDockerHostCustomDisplayName updates the custom display name for a docker host.
|
||
func (m *Monitor) SetDockerHostCustomDisplayName(hostID string, customName string) (models.DockerHost, error) {
|
||
hostID = strings.TrimSpace(hostID)
|
||
if hostID == "" {
|
||
return models.DockerHost{}, fmt.Errorf("docker host id is required")
|
||
}
|
||
|
||
customName = strings.TrimSpace(customName)
|
||
|
||
// Persist to Docker metadata store first
|
||
var hostMeta *config.DockerHostMetadata
|
||
if customName != "" {
|
||
hostMeta = &config.DockerHostMetadata{
|
||
CustomDisplayName: customName,
|
||
}
|
||
}
|
||
if err := m.dockerMetadataStore.SetHostMetadata(hostID, hostMeta); err != nil {
|
||
log.Error().Err(err).Str("hostID", hostID).Msg("Failed to persist Docker host metadata")
|
||
return models.DockerHost{}, fmt.Errorf("failed to persist custom display name: %w", err)
|
||
}
|
||
|
||
// Update in-memory state
|
||
host, ok := m.state.SetDockerHostCustomDisplayName(hostID, customName)
|
||
if !ok {
|
||
return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
|
||
}
|
||
|
||
log.Info().
|
||
Str("dockerHost", host.Hostname).
|
||
Str("dockerHostID", hostID).
|
||
Str("customDisplayName", customName).
|
||
Msg("Docker host custom display name updated")
|
||
|
||
return host, nil
|
||
}
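
// Usage sketch (hypothetical caller): a non-empty name is persisted to the Docker
// metadata store and mirrored into state; passing an empty string clears the stored
// override so the agent-reported name is shown again.
//
//	if _, err := monitor.SetDockerHostCustomDisplayName(hostID, "nas-01"); err != nil {
//		// metadata persistence failed or the host is unknown
//	}
//	_, _ = monitor.SetDockerHostCustomDisplayName(hostID, "") // clear the override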

// AllowDockerHostReenroll removes a host ID from the removal blocklist so it can report again.
func (m *Monitor) AllowDockerHostReenroll(hostID string) error {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return fmt.Errorf("docker host id is required")
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	if _, exists := m.removedDockerHosts[hostID]; !exists {
		host, found := m.GetDockerHost(hostID)
		event := log.Info().
			Str("dockerHostID", hostID)
		if found {
			event = event.Str("dockerHost", host.Hostname)
		}
		event.Msg("Allow re-enroll requested but host was not blocked; ignoring")
		return nil
	}

	delete(m.removedDockerHosts, hostID)
	if cmd, exists := m.dockerCommands[hostID]; exists {
		delete(m.dockerCommandIndex, cmd.status.ID)
		delete(m.dockerCommands, hostID)
	}
	m.state.SetDockerHostCommand(hostID, nil)
	m.state.RemoveRemovedDockerHost(hostID)

	log.Info().
		Str("dockerHostID", hostID).
		Msg("Docker host removal block cleared; host may report again")

	return nil
}
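
// Usage sketch (hypothetical "allow re-enroll" endpoint): the call is idempotent,
// so invoking it for a host that was never blocked just logs and returns nil.
//
//	if err := monitor.AllowDockerHostReenroll(hostID); err != nil {
//		// only fails when the ID is empty
//	}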

// GetDockerHost retrieves a docker host by identifier if present in state.
func (m *Monitor) GetDockerHost(hostID string) (models.DockerHost, bool) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.DockerHost{}, false
	}

	hosts := m.state.GetDockerHosts()
	for _, host := range hosts {
		if host.ID == hostID {
			return host, true
		}
	}
	return models.DockerHost{}, false
}

// GetDockerHosts returns a point-in-time snapshot of all Docker hosts Pulse knows about.
func (m *Monitor) GetDockerHosts() []models.DockerHost {
	if m == nil || m.state == nil {
		return nil
	}
	return m.state.GetDockerHosts()
}

// RebuildTokenBindings reconstructs agent-to-token binding maps from the current
// state of Docker hosts and host agents. This should be called after API tokens
// are reloaded from disk to ensure bindings remain consistent with the new token set.
// It preserves bindings for tokens that still exist and removes orphaned entries.
func (m *Monitor) RebuildTokenBindings() {
	if m == nil || m.state == nil || m.config == nil {
		return
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	// Build a set of valid token IDs from the current config
	validTokens := make(map[string]struct{})
	for _, token := range m.config.APITokens {
		if token.ID != "" {
			validTokens[token.ID] = struct{}{}
		}
	}

	// Rebuild Docker token bindings
	newDockerBindings := make(map[string]string)
	dockerHosts := m.state.GetDockerHosts()
	for _, host := range dockerHosts {
		tokenID := strings.TrimSpace(host.TokenID)
		if tokenID == "" {
			continue
		}
		// Only keep bindings for tokens that still exist in config
		if _, valid := validTokens[tokenID]; !valid {
			continue
		}
		// Use AgentID if available, otherwise fall back to host ID
		agentID := strings.TrimSpace(host.AgentID)
		if agentID == "" {
			agentID = host.ID
		}
		if agentID != "" {
			newDockerBindings[tokenID] = agentID
		}
	}

	// Rebuild Host agent token bindings
	newHostBindings := make(map[string]string)
	hosts := m.state.GetHosts()
	for _, host := range hosts {
		tokenID := strings.TrimSpace(host.TokenID)
		if tokenID == "" {
			continue
		}
		// Only keep bindings for tokens that still exist in config
		if _, valid := validTokens[tokenID]; !valid {
			continue
		}
		hostname := strings.TrimSpace(host.Hostname)
		if hostname == "" || host.ID == "" {
			continue
		}
		newHostBindings[fmt.Sprintf("%s:%s", tokenID, hostname)] = host.ID
	}

	// Log what changed
	oldDockerCount := len(m.dockerTokenBindings)
	oldHostCount := len(m.hostTokenBindings)
	m.dockerTokenBindings = newDockerBindings
	m.hostTokenBindings = newHostBindings

	log.Info().
		Int("dockerBindings", len(newDockerBindings)).
		Int("hostBindings", len(newHostBindings)).
		Int("previousDockerBindings", oldDockerCount).
		Int("previousHostBindings", oldHostCount).
		Int("validTokens", len(validTokens)).
		Msg("Rebuilt agent token bindings after API token reload")
}
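
// Usage sketch (assumed call site): run this right after the API token set has been
// reloaded so stale bindings are dropped while agents whose tokens survived keep
// their identities.
//
//	// ... reload m.config.APITokens from disk elsewhere ...
//	monitor.RebuildTokenBindings()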

// ClearUnauthenticatedAgents removes all host agents and docker hosts from the state.
// This should be called when security is first configured to clear any agents that
// connected during the brief unauthenticated window before credentials were set up.
// This prevents stale/unauthorized agent data from appearing in the UI.
func (m *Monitor) ClearUnauthenticatedAgents() (int, int) {
	if m == nil || m.state == nil {
		return 0, 0
	}

	// Clear all hosts
	hostCount := m.state.ClearAllHosts()

	// Clear all docker hosts
	dockerCount := m.state.ClearAllDockerHosts()

	// Clear any token bindings since the tokens used by the old agents are invalid
	m.mu.Lock()
	m.dockerTokenBindings = make(map[string]string)
	m.hostTokenBindings = make(map[string]string)
	m.mu.Unlock()

	if hostCount > 0 || dockerCount > 0 {
		log.Info().
			Int("hostsCleared", hostCount).
			Int("dockerHostsCleared", dockerCount).
			Msg("Cleared unauthenticated agents after security setup")
	}

	return hostCount, dockerCount
}
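
// Usage sketch (hypothetical security-setup hook):
//
//	hosts, docker := monitor.ClearUnauthenticatedAgents()
//	log.Info().Int("hosts", hosts).Int("dockerHosts", docker).Msg("purged pre-auth agents")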

// QueueDockerHostStop queues a stop command for the specified docker host.
func (m *Monitor) QueueDockerHostStop(hostID string) (models.DockerHostCommandStatus, error) {
	return m.queueDockerStopCommand(hostID)
}

// FetchDockerCommandForHost retrieves the next command payload (if any) for the host.
func (m *Monitor) FetchDockerCommandForHost(hostID string) (map[string]any, *models.DockerHostCommandStatus) {
	return m.getDockerCommandPayload(hostID)
}

// AcknowledgeDockerHostCommand updates the lifecycle status for a docker host command.
func (m *Monitor) AcknowledgeDockerHostCommand(commandID, hostID, status, message string) (models.DockerHostCommandStatus, string, bool, error) {
	return m.acknowledgeDockerCommand(commandID, hostID, status, message)
}
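
// Command lifecycle sketch (identifiers and the acknowledgement status string are
// illustrative): the UI queues a stop, the agent polls for the payload, and finally
// reports the outcome back.
//
//	status, err := monitor.QueueDockerHostStop(hostID)      // queued by the UI
//	payload, _ := monitor.FetchDockerCommandForHost(hostID) // fetched by the agent
//	_, _, _, err = monitor.AcknowledgeDockerHostCommand(status.ID, hostID, "completed", "")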
|
||
|
||
// ApplyDockerReport ingests a docker agent report into the shared state.
|
||
func (m *Monitor) ApplyDockerReport(report agentsdocker.Report, tokenRecord *config.APITokenRecord) (models.DockerHost, error) {
|
||
hostsSnapshot := m.state.GetDockerHosts()
|
||
identifier, legacyIDs, previous, hasPrevious := resolveDockerHostIdentifier(report, tokenRecord, hostsSnapshot)
|
||
if strings.TrimSpace(identifier) == "" {
|
||
return models.DockerHost{}, fmt.Errorf("docker report missing agent identifier")
|
||
}
|
||
|
||
// Check if this host was deliberately removed - reject report to prevent resurrection
|
||
m.mu.RLock()
|
||
removedAt, wasRemoved := m.removedDockerHosts[identifier]
|
||
if !wasRemoved {
|
||
for _, legacyID := range legacyIDs {
|
||
if legacyID == "" || legacyID == identifier {
|
||
continue
|
||
}
|
||
if ts, ok := m.removedDockerHosts[legacyID]; ok {
|
||
removedAt = ts
|
||
wasRemoved = true
|
||
break
|
||
}
|
||
}
|
||
}
|
||
m.mu.RUnlock()
|
||
|
||
if wasRemoved {
|
||
log.Info().
|
||
Str("dockerHostID", identifier).
|
||
Time("removedAt", removedAt).
|
||
Msg("Rejecting report from deliberately removed Docker host")
|
||
return models.DockerHost{}, fmt.Errorf("docker host %q was removed at %v and cannot report again. Use Allow re-enroll in Settings -> Agents -> Removed Docker Hosts or rerun the installer with a docker:manage token to clear this block", identifier, removedAt.Format(time.RFC3339))
|
||
}
|
||
|
||
// Enforce token uniqueness: each token can only be bound to one agent
|
||
if tokenRecord != nil && tokenRecord.ID != "" {
|
||
tokenID := strings.TrimSpace(tokenRecord.ID)
|
||
agentID := strings.TrimSpace(report.Agent.ID)
|
||
if agentID == "" {
|
||
agentID = identifier
|
||
}
|
||
|
||
m.mu.Lock()
|
||
if boundAgentID, exists := m.dockerTokenBindings[tokenID]; exists {
|
||
if boundAgentID != agentID {
|
||
m.mu.Unlock()
|
||
// Find the conflicting host to provide helpful error message
|
||
conflictingHostname := "unknown"
|
||
for _, host := range hostsSnapshot {
|
||
if host.AgentID == boundAgentID || host.ID == boundAgentID {
|
||
conflictingHostname = host.Hostname
|
||
if host.CustomDisplayName != "" {
|
||
conflictingHostname = host.CustomDisplayName
|
||
} else if host.DisplayName != "" {
|
||
conflictingHostname = host.DisplayName
|
||
}
|
||
break
|
||
}
|
||
}
|
||
tokenHint := tokenHintFromRecord(tokenRecord)
|
||
if tokenHint != "" {
|
||
tokenHint = " (" + tokenHint + ")"
|
||
}
|
||
log.Warn().
|
||
Str("tokenID", tokenID).
|
||
Str("tokenHint", tokenHint).
|
||
Str("reportingAgentID", agentID).
|
||
Str("boundAgentID", boundAgentID).
|
||
Str("conflictingHost", conflictingHostname).
|
||
Msg("Rejecting Docker report: token already bound to different agent")
|
||
return models.DockerHost{}, fmt.Errorf("API token%s is already in use by agent %q (host: %s). Each Docker agent must use a unique API token. Generate a new token for this agent", tokenHint, boundAgentID, conflictingHostname)
|
||
}
|
||
} else {
|
||
// First time seeing this token - bind it to this agent
|
||
m.dockerTokenBindings[tokenID] = agentID
|
||
log.Debug().
|
||
Str("tokenID", tokenID).
|
||
Str("agentID", agentID).
|
||
Str("hostname", report.Host.Hostname).
|
||
Msg("Bound Docker agent token to agent identity")
|
||
}
|
||
m.mu.Unlock()
|
||
}
|
||
|
||
hostname := strings.TrimSpace(report.Host.Hostname)
|
||
if hostname == "" {
|
||
return models.DockerHost{}, fmt.Errorf("docker report missing hostname")
|
||
}
|
||
|
||
timestamp := report.Timestamp
|
||
if timestamp.IsZero() {
|
||
timestamp = time.Now()
|
||
}
|
||
|
||
agentID := strings.TrimSpace(report.Agent.ID)
|
||
if agentID == "" {
|
||
agentID = identifier
|
||
}
|
||
|
||
displayName := strings.TrimSpace(report.Host.Name)
|
||
if displayName == "" {
|
||
displayName = hostname
|
||
}
|
||
|
||
runtime := strings.ToLower(strings.TrimSpace(report.Host.Runtime))
|
||
switch runtime {
|
||
case "", "auto", "default":
|
||
runtime = "docker"
|
||
case "docker", "podman":
|
||
// supported runtimes
|
||
default:
|
||
runtime = "docker"
|
||
}
|
||
|
||
runtimeVersion := strings.TrimSpace(report.Host.RuntimeVersion)
|
||
dockerVersion := strings.TrimSpace(report.Host.DockerVersion)
|
||
if runtimeVersion == "" {
|
||
runtimeVersion = dockerVersion
|
||
}
|
||
if dockerVersion == "" {
|
||
dockerVersion = runtimeVersion
|
||
}
|
||
|
||
containers := make([]models.DockerContainer, 0, len(report.Containers))
|
||
for _, payload := range report.Containers {
|
||
container := models.DockerContainer{
|
||
ID: payload.ID,
|
||
Name: payload.Name,
|
||
Image: payload.Image,
|
||
ImageDigest: payload.ImageDigest,
|
||
State: payload.State,
|
||
Status: payload.Status,
|
||
Health: payload.Health,
|
||
CPUPercent: safeFloat(payload.CPUPercent),
|
||
MemoryUsage: payload.MemoryUsageBytes,
|
||
MemoryLimit: payload.MemoryLimitBytes,
|
||
MemoryPercent: safeFloat(payload.MemoryPercent),
|
||
UptimeSeconds: payload.UptimeSeconds,
|
||
RestartCount: payload.RestartCount,
|
||
ExitCode: payload.ExitCode,
|
||
CreatedAt: payload.CreatedAt,
|
||
StartedAt: payload.StartedAt,
|
||
FinishedAt: payload.FinishedAt,
|
||
}
|
||
|
||
// Copy update status if provided by agent
|
||
if payload.UpdateStatus != nil {
|
||
container.UpdateStatus = &models.DockerContainerUpdateStatus{
|
||
UpdateAvailable: payload.UpdateStatus.UpdateAvailable,
|
||
CurrentDigest: payload.UpdateStatus.CurrentDigest,
|
||
LatestDigest: payload.UpdateStatus.LatestDigest,
|
||
LastChecked: payload.UpdateStatus.LastChecked,
|
||
Error: payload.UpdateStatus.Error,
|
||
}
|
||
}
|
||
|
||
if len(payload.Ports) > 0 {
|
||
ports := make([]models.DockerContainerPort, len(payload.Ports))
|
||
for i, port := range payload.Ports {
|
||
ports[i] = models.DockerContainerPort{
|
||
PrivatePort: port.PrivatePort,
|
||
PublicPort: port.PublicPort,
|
||
Protocol: port.Protocol,
|
||
IP: port.IP,
|
||
}
|
||
}
|
||
container.Ports = ports
|
||
}
|
||
|
||
if len(payload.Labels) > 0 {
|
||
labels := make(map[string]string, len(payload.Labels))
|
||
for k, v := range payload.Labels {
|
||
labels[k] = v
|
||
}
|
||
container.Labels = labels
|
||
}
|
||
|
||
if len(payload.Networks) > 0 {
|
||
networks := make([]models.DockerContainerNetworkLink, len(payload.Networks))
|
||
for i, net := range payload.Networks {
|
||
networks[i] = models.DockerContainerNetworkLink{
|
||
Name: net.Name,
|
||
IPv4: net.IPv4,
|
||
IPv6: net.IPv6,
|
||
}
|
||
}
|
||
container.Networks = networks
|
||
}
|
||
|
||
container.WritableLayerBytes = payload.WritableLayerBytes
|
||
container.RootFilesystemBytes = payload.RootFilesystemBytes
|
||
|
||
if payload.BlockIO != nil {
|
||
container.BlockIO = &models.DockerContainerBlockIO{
|
||
ReadBytes: payload.BlockIO.ReadBytes,
|
||
WriteBytes: payload.BlockIO.WriteBytes,
|
||
}
|
||
|
||
containerIdentifier := payload.ID
|
||
if strings.TrimSpace(containerIdentifier) == "" {
|
||
containerIdentifier = payload.Name
|
||
}
|
||
if strings.TrimSpace(containerIdentifier) != "" {
|
||
metrics := types.IOMetrics{
|
||
DiskRead: clampUint64ToInt64(payload.BlockIO.ReadBytes),
|
||
DiskWrite: clampUint64ToInt64(payload.BlockIO.WriteBytes),
|
||
Timestamp: timestamp,
|
||
}
|
||
readRate, writeRate, _, _ := m.rateTracker.CalculateRates(fmt.Sprintf("docker:%s:%s", identifier, containerIdentifier), metrics)
|
||
if readRate >= 0 {
|
||
value := readRate
|
||
container.BlockIO.ReadRateBytesPerSecond = &value
|
||
}
|
||
if writeRate >= 0 {
|
||
value := writeRate
|
||
container.BlockIO.WriteRateBytesPerSecond = &value
|
||
}
|
||
}
|
||
}
|
||
|
||
if len(payload.Mounts) > 0 {
|
||
mounts := make([]models.DockerContainerMount, len(payload.Mounts))
|
||
for i, mount := range payload.Mounts {
|
||
mounts[i] = models.DockerContainerMount{
|
||
Type: mount.Type,
|
||
Source: mount.Source,
|
||
Destination: mount.Destination,
|
||
Mode: mount.Mode,
|
||
RW: mount.RW,
|
||
Propagation: mount.Propagation,
|
||
Name: mount.Name,
|
||
Driver: mount.Driver,
|
||
}
|
||
}
|
||
container.Mounts = mounts
|
||
}
|
||
|
||
containers = append(containers, container)
|
||
}
|
||
|
||
services := convertDockerServices(report.Services)
|
||
tasks := convertDockerTasks(report.Tasks)
|
||
swarmInfo := convertDockerSwarmInfo(report.Host.Swarm)
|
||
|
||
loadAverage := make([]float64, 0, len(report.Host.LoadAverage))
|
||
if len(report.Host.LoadAverage) > 0 {
|
||
loadAverage = append(loadAverage, report.Host.LoadAverage...)
|
||
}
|
||
|
||
var memory models.Memory
|
||
if report.Host.Memory.TotalBytes > 0 || report.Host.Memory.UsedBytes > 0 {
|
||
memory = models.Memory{
|
||
Total: report.Host.Memory.TotalBytes,
|
||
Used: report.Host.Memory.UsedBytes,
|
||
Free: report.Host.Memory.FreeBytes,
|
||
Usage: safeFloat(report.Host.Memory.Usage),
|
||
SwapTotal: report.Host.Memory.SwapTotal,
|
||
SwapUsed: report.Host.Memory.SwapUsed,
|
||
}
|
||
}
|
||
// Fallback: if gopsutil's memory reading failed but Docker's TotalMemoryBytes
|
||
// is valid (possibly already a fallback from the agent), use that for Total.
|
||
// This handles Docker-in-LXC scenarios where both Docker and gopsutil may
|
||
// fail to read memory stats, but the agent fix provides a valid fallback.
|
||
if memory.Total <= 0 && report.Host.TotalMemoryBytes > 0 {
|
||
memory.Total = report.Host.TotalMemoryBytes
|
||
}
|
||
|
||
// Additional fallback for Docker-in-LXC: gopsutil may read Total and Free
|
||
// correctly from cgroup limits but return 0 for Used. Calculate Used from
|
||
// Total - Free when this happens. This fixes the "0B / 7GB" display issue.
|
||
if memory.Used <= 0 && memory.Total > 0 && memory.Free > 0 {
|
||
memory.Used = memory.Total - memory.Free
|
||
if memory.Used < 0 {
|
||
memory.Used = 0
|
||
}
|
||
// Recalculate usage percentage
|
||
if memory.Total > 0 {
|
||
memory.Usage = safePercentage(float64(memory.Used), float64(memory.Total))
|
||
}
|
||
}
|
||
|
||
disks := make([]models.Disk, 0, len(report.Host.Disks))
|
||
for _, disk := range report.Host.Disks {
|
||
disks = append(disks, models.Disk{
|
||
Total: disk.TotalBytes,
|
||
Used: disk.UsedBytes,
|
||
Free: disk.FreeBytes,
|
||
Usage: safeFloat(disk.Usage),
|
||
Mountpoint: disk.Mountpoint,
|
||
Type: disk.Type,
|
||
Device: disk.Device,
|
||
})
|
||
}
|
||
|
||
networkIfaces := make([]models.HostNetworkInterface, 0, len(report.Host.Network))
|
||
for _, iface := range report.Host.Network {
|
||
addresses := append([]string(nil), iface.Addresses...)
|
||
networkIfaces = append(networkIfaces, models.HostNetworkInterface{
|
||
Name: iface.Name,
|
||
MAC: iface.MAC,
|
||
Addresses: addresses,
|
||
RXBytes: iface.RXBytes,
|
||
TXBytes: iface.TXBytes,
|
||
SpeedMbps: iface.SpeedMbps,
|
||
})
|
||
}
|
||
|
||
agentVersion := normalizeAgentVersion(report.Agent.Version)
|
||
if agentVersion == "" && hasPrevious {
|
||
agentVersion = normalizeAgentVersion(previous.AgentVersion)
|
||
}
|
||
|
||
host := models.DockerHost{
|
||
ID: identifier,
|
||
AgentID: agentID,
|
||
Hostname: hostname,
|
||
DisplayName: displayName,
|
||
MachineID: strings.TrimSpace(report.Host.MachineID),
|
||
OS: report.Host.OS,
|
||
KernelVersion: report.Host.KernelVersion,
|
||
Architecture: report.Host.Architecture,
|
||
Runtime: runtime,
|
||
RuntimeVersion: runtimeVersion,
|
||
DockerVersion: dockerVersion,
|
||
CPUs: report.Host.TotalCPU,
|
||
TotalMemoryBytes: report.Host.TotalMemoryBytes,
|
||
UptimeSeconds: report.Host.UptimeSeconds,
|
||
CPUUsage: safeFloat(report.Host.CPUUsagePercent),
|
||
LoadAverage: loadAverage,
|
||
Memory: memory,
|
||
Disks: disks,
|
||
NetworkInterfaces: networkIfaces,
|
||
Status: "online",
|
||
LastSeen: timestamp,
|
||
IntervalSeconds: report.Agent.IntervalSeconds,
|
||
AgentVersion: agentVersion,
|
||
Containers: containers,
|
||
Services: services,
|
||
Tasks: tasks,
|
||
Swarm: swarmInfo,
|
||
IsLegacy: isLegacyDockerAgent(report.Agent.Type),
|
||
}
|
||
|
||
if tokenRecord != nil {
|
||
host.TokenID = tokenRecord.ID
|
||
host.TokenName = tokenRecord.Name
|
||
host.TokenHint = tokenHintFromRecord(tokenRecord)
|
||
if tokenRecord.LastUsedAt != nil {
|
||
t := tokenRecord.LastUsedAt.UTC()
|
||
host.TokenLastUsedAt = &t
|
||
} else {
|
||
t := time.Now().UTC()
|
||
host.TokenLastUsedAt = &t
|
||
}
|
||
} else if hasPrevious {
|
||
host.TokenID = previous.TokenID
|
||
host.TokenName = previous.TokenName
|
||
host.TokenHint = previous.TokenHint
|
||
host.TokenLastUsedAt = previous.TokenLastUsedAt
|
||
}
|
||
|
||
// Load custom display name from metadata store if not already set
|
||
if host.CustomDisplayName == "" {
|
||
if hostMeta := m.dockerMetadataStore.GetHostMetadata(identifier); hostMeta != nil {
|
||
host.CustomDisplayName = hostMeta.CustomDisplayName
|
||
}
|
||
}
|
||
|
||
m.state.UpsertDockerHost(host)
|
||
m.state.SetConnectionHealth(dockerConnectionPrefix+host.ID, true)
|
||
|
||
// Check if the host was previously hidden and is now visible again
|
||
if hasPrevious && previous.Hidden && !host.Hidden {
|
||
log.Info().
|
||
Str("dockerHost", host.Hostname).
|
||
Str("dockerHostID", host.ID).
|
||
Msg("Docker host auto-unhidden after receiving report")
|
||
}
|
||
|
||
// Check if the host was pending uninstall - if so, log a warning that uninstall failed and clear the flag
|
||
if hasPrevious && previous.PendingUninstall {
|
||
log.Warn().
|
||
Str("dockerHost", host.Hostname).
|
||
Str("dockerHostID", host.ID).
|
||
Msg("Docker host reporting again after pending uninstall - uninstall may have failed")
|
||
|
||
// Clear the pending uninstall flag since the host is clearly still active
|
||
m.state.SetDockerHostPendingUninstall(host.ID, false)
|
||
}
|
||
|
||
if m.alertManager != nil {
|
||
m.alertManager.CheckDockerHost(host)
|
||
}
|
||
|
||
// Record Docker HOST metrics for sparkline charts
|
||
now := time.Now()
|
||
hostMetricKey := fmt.Sprintf("dockerHost:%s", host.ID)
|
||
|
||
// Record host CPU usage
|
||
m.metricsHistory.AddGuestMetric(hostMetricKey, "cpu", host.CPUUsage, now)
|
||
|
||
// Record host Memory usage
|
||
m.metricsHistory.AddGuestMetric(hostMetricKey, "memory", host.Memory.Usage, now)
|
||
|
||
// Record host Disk usage (use first disk or calculate total)
|
||
var hostDiskPercent float64
|
||
if len(host.Disks) > 0 {
|
||
hostDiskPercent = host.Disks[0].Usage
|
||
}
|
||
m.metricsHistory.AddGuestMetric(hostMetricKey, "disk", hostDiskPercent, now)
|
||
|
||
// Also write to persistent SQLite store
|
||
if m.metricsStore != nil {
|
||
m.metricsStore.Write("dockerHost", host.ID, "cpu", host.CPUUsage, now)
|
||
m.metricsStore.Write("dockerHost", host.ID, "memory", host.Memory.Usage, now)
|
||
m.metricsStore.Write("dockerHost", host.ID, "disk", hostDiskPercent, now)
|
||
}
|
||
|
||
// Record Docker CONTAINER metrics for sparkline charts
|
||
// Use a prefixed key (docker:containerID) to distinguish from Proxmox containers
|
||
for _, container := range containers {
|
||
if container.ID == "" {
|
||
continue
|
||
}
|
||
// Build a unique metric key for Docker containers
|
||
metricKey := fmt.Sprintf("docker:%s", container.ID)
|
||
|
||
// Record CPU (already a percentage 0-100)
|
||
m.metricsHistory.AddGuestMetric(metricKey, "cpu", container.CPUPercent, now)
|
||
|
||
// Record Memory (already a percentage 0-100)
|
||
m.metricsHistory.AddGuestMetric(metricKey, "memory", container.MemoryPercent, now)
|
||
|
||
// Record Disk usage as percentage of writable layer vs root filesystem
|
||
var diskPercent float64
|
||
if container.RootFilesystemBytes > 0 && container.WritableLayerBytes > 0 {
|
||
diskPercent = float64(container.WritableLayerBytes) / float64(container.RootFilesystemBytes) * 100
|
||
if diskPercent > 100 {
|
||
diskPercent = 100
|
||
}
|
||
}
|
||
m.metricsHistory.AddGuestMetric(metricKey, "disk", diskPercent, now)
|
||
|
||
// Also write to persistent SQLite store for long-term storage
|
||
if m.metricsStore != nil {
|
||
m.metricsStore.Write("dockerContainer", container.ID, "cpu", container.CPUPercent, now)
|
||
m.metricsStore.Write("dockerContainer", container.ID, "memory", container.MemoryPercent, now)
|
||
m.metricsStore.Write("dockerContainer", container.ID, "disk", diskPercent, now)
|
||
}
|
||
}
|
||
|
||
log.Debug().
|
||
Str("dockerHost", host.Hostname).
|
||
Int("containers", len(containers)).
|
||
Msg("Docker host report processed")
|
||
|
||
return host, nil
|
||
}
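
// Ingestion sketch (hypothetical HTTP handler; assumes the agent POSTs a JSON body
// that decodes into agentsdocker.Report): the monitor resolves the host identity,
// enforces token-to-agent binding, and records per-host and per-container metrics.
//
//	var report agentsdocker.Report
//	if err := json.NewDecoder(r.Body).Decode(&report); err != nil {
//		// reject malformed payloads
//	}
//	host, err := monitor.ApplyDockerReport(report, tokenRecord)
//	if err != nil {
//		// removed host, token already bound elsewhere, or missing identifier/hostname
//	}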
|
||
|
||
// ApplyHostReport ingests a host agent report into the shared state.
|
||
func (m *Monitor) ApplyHostReport(report agentshost.Report, tokenRecord *config.APITokenRecord) (models.Host, error) {
|
||
hostname := strings.TrimSpace(report.Host.Hostname)
|
||
if hostname == "" {
|
||
return models.Host{}, fmt.Errorf("host report missing hostname")
|
||
}
|
||
|
||
baseIdentifier := strings.TrimSpace(report.Host.ID)
|
||
if baseIdentifier != "" {
|
||
baseIdentifier = sanitizeDockerHostSuffix(baseIdentifier)
|
||
}
|
||
if baseIdentifier == "" {
|
||
if machine := sanitizeDockerHostSuffix(report.Host.MachineID); machine != "" {
|
||
baseIdentifier = machine
|
||
}
|
||
}
|
||
if baseIdentifier == "" {
|
||
if agentID := sanitizeDockerHostSuffix(report.Agent.ID); agentID != "" {
|
||
baseIdentifier = agentID
|
||
}
|
||
}
|
||
if baseIdentifier == "" {
|
||
if hostName := sanitizeDockerHostSuffix(hostname); hostName != "" {
|
||
baseIdentifier = hostName
|
||
}
|
||
}
|
||
if baseIdentifier == "" {
|
||
seedParts := uniqueNonEmptyStrings(
|
||
report.Host.MachineID,
|
||
report.Agent.ID,
|
||
report.Host.Hostname,
|
||
)
|
||
if len(seedParts) == 0 {
|
||
seedParts = []string{hostname}
|
||
}
|
||
seed := strings.Join(seedParts, "|")
|
||
sum := sha1.Sum([]byte(seed))
|
||
baseIdentifier = fmt.Sprintf("host-%s", hex.EncodeToString(sum[:6]))
|
||
}
|
||
|
||
existingHosts := m.state.GetHosts()
|
||
|
||
identifier := baseIdentifier
|
||
if tokenRecord != nil && strings.TrimSpace(tokenRecord.ID) != "" {
|
||
tokenID := strings.TrimSpace(tokenRecord.ID)
|
||
bindingKey := fmt.Sprintf("%s:%s", tokenID, hostname)
|
||
|
||
m.mu.Lock()
|
||
if m.hostTokenBindings == nil {
|
||
m.hostTokenBindings = make(map[string]string)
|
||
}
|
||
boundID := strings.TrimSpace(m.hostTokenBindings[bindingKey])
|
||
m.mu.Unlock()
|
||
|
||
// If we already have a binding for this token+hostname, use it to keep host IDs stable
|
||
// even if another colliding host disappears later.
|
||
if boundID != "" {
|
||
identifier = boundID
|
||
} else {
|
||
bindingID := baseIdentifier
|
||
for _, candidate := range existingHosts {
|
||
if candidate.ID != bindingID {
|
||
continue
|
||
}
|
||
if strings.TrimSpace(candidate.Hostname) == hostname && strings.TrimSpace(candidate.TokenID) == tokenID {
|
||
break
|
||
}
|
||
|
||
seed := strings.Join([]string{tokenID, hostname, bindingID}, "|")
|
||
sum := sha1.Sum([]byte(seed))
|
||
suffix := hex.EncodeToString(sum[:4])
|
||
|
||
base := bindingID
|
||
if base == "" {
|
||
base = "host"
|
||
}
|
||
if len(base) > 40 {
|
||
base = base[:40]
|
||
}
|
||
bindingID = fmt.Sprintf("%s-%s", base, suffix)
|
||
break
|
||
}
|
||
|
||
m.mu.Lock()
|
||
if m.hostTokenBindings == nil {
|
||
m.hostTokenBindings = make(map[string]string)
|
||
}
|
||
if existing := strings.TrimSpace(m.hostTokenBindings[bindingKey]); existing != "" {
|
||
identifier = existing
|
||
} else {
|
||
m.hostTokenBindings[bindingKey] = bindingID
|
||
log.Debug().
|
||
Str("tokenID", tokenID).
|
||
Str("hostID", bindingID).
|
||
Str("hostname", hostname).
|
||
Msg("Bound host agent token to hostname")
|
||
identifier = bindingID
|
||
}
|
||
m.mu.Unlock()
|
||
}
|
||
}
|
||
|
||
var previous models.Host
|
||
var hasPrevious bool
|
||
for _, candidate := range existingHosts {
|
||
if candidate.ID == identifier {
|
||
previous = candidate
|
||
hasPrevious = true
|
||
break
|
||
}
|
||
}
|
||
|
||
displayName := strings.TrimSpace(report.Host.DisplayName)
|
||
if displayName == "" {
|
||
displayName = hostname
|
||
}
|
||
|
||
timestamp := report.Timestamp
|
||
if timestamp.IsZero() {
|
||
timestamp = time.Now().UTC()
|
||
}
|
||
|
||
memory := models.Memory{
|
||
Total: report.Metrics.Memory.TotalBytes,
|
||
Used: report.Metrics.Memory.UsedBytes,
|
||
Free: report.Metrics.Memory.FreeBytes,
|
||
Usage: safeFloat(report.Metrics.Memory.Usage),
|
||
SwapTotal: report.Metrics.Memory.SwapTotal,
|
||
SwapUsed: report.Metrics.Memory.SwapUsed,
|
||
}
|
||
|
||
// Fallback for LXC environments: gopsutil may read Total and Free correctly
|
||
// from cgroup limits but return 0 for Used. Calculate Used from Total - Free.
|
||
if memory.Used <= 0 && memory.Total > 0 && memory.Free > 0 {
|
||
memory.Used = memory.Total - memory.Free
|
||
if memory.Used < 0 {
|
||
memory.Used = 0
|
||
}
|
||
}
|
||
|
||
if memory.Usage <= 0 && memory.Total > 0 {
|
||
memory.Usage = safePercentage(float64(memory.Used), float64(memory.Total))
|
||
}
|
||
|
||
disks := make([]models.Disk, 0, len(report.Disks))
|
||
for _, disk := range report.Disks {
|
||
// Filter virtual/system filesystems and read-only filesystems to avoid cluttering
|
||
// the UI with tmpfs, devtmpfs, /dev, /run, /sys, docker overlay mounts, snap mounts,
|
||
// immutable OS images, etc. (issues #505, #690, #790).
|
||
if shouldSkip, _ := fsfilters.ShouldSkipFilesystem(disk.Type, disk.Mountpoint, uint64(disk.TotalBytes), uint64(disk.UsedBytes)); shouldSkip {
|
||
continue
|
||
}
|
||
|
||
usage := safeFloat(disk.Usage)
|
||
if usage <= 0 && disk.TotalBytes > 0 {
|
||
usage = safePercentage(float64(disk.UsedBytes), float64(disk.TotalBytes))
|
||
}
|
||
disks = append(disks, models.Disk{
|
||
Total: disk.TotalBytes,
|
||
Used: disk.UsedBytes,
|
||
Free: disk.FreeBytes,
|
||
Usage: usage,
|
||
Mountpoint: disk.Mountpoint,
|
||
Type: disk.Type,
|
||
Device: disk.Device,
|
||
})
|
||
}
|
||
|
||
diskIO := make([]models.DiskIO, 0, len(report.DiskIO))
|
||
for _, io := range report.DiskIO {
|
||
diskIO = append(diskIO, models.DiskIO{
|
||
Device: io.Device,
|
||
ReadBytes: io.ReadBytes,
|
||
WriteBytes: io.WriteBytes,
|
||
ReadOps: io.ReadOps,
|
||
WriteOps: io.WriteOps,
|
||
ReadTime: io.ReadTime,
|
||
WriteTime: io.WriteTime,
|
||
IOTime: io.IOTime,
|
||
})
|
||
}
|
||
|
||
network := make([]models.HostNetworkInterface, 0, len(report.Network))
|
||
for _, nic := range report.Network {
|
||
network = append(network, models.HostNetworkInterface{
|
||
Name: nic.Name,
|
||
MAC: nic.MAC,
|
||
Addresses: append([]string(nil), nic.Addresses...),
|
||
RXBytes: nic.RXBytes,
|
||
TXBytes: nic.TXBytes,
|
||
SpeedMbps: nic.SpeedMbps,
|
||
})
|
||
}
|
||
|
||
raid := make([]models.HostRAIDArray, 0, len(report.RAID))
|
||
for _, array := range report.RAID {
|
||
devices := make([]models.HostRAIDDevice, 0, len(array.Devices))
|
||
for _, dev := range array.Devices {
|
||
devices = append(devices, models.HostRAIDDevice{
|
||
Device: dev.Device,
|
||
State: dev.State,
|
||
Slot: dev.Slot,
|
||
})
|
||
}
|
||
raid = append(raid, models.HostRAIDArray{
|
||
Device: array.Device,
|
||
Name: array.Name,
|
||
Level: array.Level,
|
||
State: array.State,
|
||
TotalDevices: array.TotalDevices,
|
||
ActiveDevices: array.ActiveDevices,
|
||
WorkingDevices: array.WorkingDevices,
|
||
FailedDevices: array.FailedDevices,
|
||
SpareDevices: array.SpareDevices,
|
||
UUID: array.UUID,
|
||
Devices: devices,
|
||
RebuildPercent: array.RebuildPercent,
|
||
RebuildSpeed: array.RebuildSpeed,
|
||
})
|
||
}
|
||
|
||
// Convert Ceph data from agent report
|
||
var cephData *models.HostCephCluster
|
||
if report.Ceph != nil {
|
||
cephData = convertAgentCephToModels(report.Ceph)
|
||
}
|
||
|
||
host := models.Host{
|
||
ID: identifier,
|
||
Hostname: hostname,
|
||
DisplayName: displayName,
|
||
Platform: strings.TrimSpace(strings.ToLower(report.Host.Platform)),
|
||
OSName: strings.TrimSpace(report.Host.OSName),
|
||
OSVersion: strings.TrimSpace(report.Host.OSVersion),
|
||
KernelVersion: strings.TrimSpace(report.Host.KernelVersion),
|
||
Architecture: strings.TrimSpace(report.Host.Architecture),
|
||
CPUCount: report.Host.CPUCount,
|
||
CPUUsage: safeFloat(report.Metrics.CPUUsagePercent),
|
||
LoadAverage: append([]float64(nil), report.Host.LoadAverage...),
|
||
Memory: memory,
|
||
Disks: disks,
|
||
DiskIO: diskIO,
|
||
NetworkInterfaces: network,
|
||
Sensors: models.HostSensorSummary{
|
||
TemperatureCelsius: cloneStringFloatMap(report.Sensors.TemperatureCelsius),
|
||
FanRPM: cloneStringFloatMap(report.Sensors.FanRPM),
|
||
Additional: cloneStringFloatMap(report.Sensors.Additional),
|
||
SMART: convertAgentSMARTToModels(report.Sensors.SMART),
|
||
},
|
||
RAID: raid,
|
||
Ceph: cephData,
|
||
Status: "online",
|
||
UptimeSeconds: report.Host.UptimeSeconds,
|
||
IntervalSeconds: report.Agent.IntervalSeconds,
|
||
LastSeen: timestamp,
|
||
AgentVersion: strings.TrimSpace(report.Agent.Version),
|
||
CommandsEnabled: report.Agent.CommandsEnabled,
|
||
ReportIP: strings.TrimSpace(report.Host.ReportIP),
|
||
Tags: append([]string(nil), report.Tags...),
|
||
IsLegacy: isLegacyHostAgent(report.Agent.Type),
|
||
}
|
||
|
||
// Apply any pending commands execution override from server config
|
||
// This ensures the UI remains stable when the user toggles this setting,
|
||
// even if the agent hasn't yet picked up the new config in this report cycle.
|
||
if cfg := m.GetHostAgentConfig(identifier); cfg.CommandsEnabled != nil {
|
||
host.CommandsEnabled = *cfg.CommandsEnabled
|
||
}
|
||
|
||
if len(host.LoadAverage) == 0 {
|
||
host.LoadAverage = nil
|
||
}
|
||
if len(host.Disks) == 0 {
|
||
host.Disks = nil
|
||
}
|
||
if len(host.DiskIO) == 0 {
|
||
host.DiskIO = nil
|
||
}
|
||
if len(host.NetworkInterfaces) == 0 {
|
||
host.NetworkInterfaces = nil
|
||
}
|
||
if len(host.RAID) == 0 {
|
||
host.RAID = nil
|
||
}
|
||
|
||
if tokenRecord != nil {
|
||
host.TokenID = tokenRecord.ID
|
||
host.TokenName = tokenRecord.Name
|
||
host.TokenHint = tokenHintFromRecord(tokenRecord)
|
||
if tokenRecord.LastUsedAt != nil {
|
||
t := tokenRecord.LastUsedAt.UTC()
|
||
host.TokenLastUsedAt = &t
|
||
} else {
|
||
now := time.Now().UTC()
|
||
host.TokenLastUsedAt = &now
|
||
}
|
||
} else if hasPrevious {
|
||
host.TokenID = previous.TokenID
|
||
host.TokenName = previous.TokenName
|
||
host.TokenHint = previous.TokenHint
|
||
host.TokenLastUsedAt = previous.TokenLastUsedAt
|
||
}
|
||
|
||
// Link host agent to matching PVE node/VM/container by hostname
|
||
// This prevents duplication when users install agents on PVE cluster nodes
|
||
linkedNodeID, linkedVMID, linkedContainerID := m.findLinkedProxmoxEntity(hostname)
|
||
if linkedNodeID != "" {
|
||
host.LinkedNodeID = linkedNodeID
|
||
log.Debug().
|
||
Str("hostId", identifier).
|
||
Str("hostname", hostname).
|
||
Str("linkedNodeId", linkedNodeID).
|
||
Msg("Linked host agent to PVE node")
|
||
}
|
||
if linkedVMID != "" {
|
||
host.LinkedVMID = linkedVMID
|
||
log.Debug().
|
||
Str("hostId", identifier).
|
||
Str("hostname", hostname).
|
||
Str("linkedVmId", linkedVMID).
|
||
Msg("Linked host agent to VM")
|
||
}
|
||
if linkedContainerID != "" {
|
||
host.LinkedContainerID = linkedContainerID
|
||
log.Debug().
|
||
Str("hostId", identifier).
|
||
Str("hostname", hostname).
|
||
Str("linkedContainerId", linkedContainerID).
|
||
Msg("Linked host agent to container")
|
||
}
|
||
|
||
m.state.UpsertHost(host)
|
||
m.state.SetConnectionHealth(hostConnectionPrefix+host.ID, true)
|
||
|
||
// Update the linked PVE node to point back to this host agent
|
||
if host.LinkedNodeID != "" {
|
||
m.linkNodeToHostAgent(host.LinkedNodeID, host.ID)
|
||
}
|
||
|
||
// If host reports Ceph data, also update the global CephClusters state
|
||
if report.Ceph != nil {
|
||
cephCluster := convertAgentCephToGlobalCluster(report.Ceph, hostname, identifier, timestamp)
|
||
m.state.UpsertCephCluster(cephCluster)
|
||
log.Debug().
|
||
Str("hostId", identifier).
|
||
Str("hostname", hostname).
|
||
Str("fsid", cephCluster.FSID).
|
||
Str("health", cephCluster.Health).
|
||
Int("osds", cephCluster.NumOSDs).
|
||
Msg("Updated Ceph cluster from host agent")
|
||
}
|
||
|
||
if m.alertManager != nil {
|
||
m.alertManager.CheckHost(host)
|
||
}
|
||
|
||
// Record Host metrics for sparkline charts
|
||
now := time.Now()
|
||
hostMetricKey := fmt.Sprintf("host:%s", host.ID)
|
||
|
||
var hostDiskPercent float64
|
||
if len(host.Disks) > 0 {
|
||
hostDiskPercent = host.Disks[0].Usage
|
||
}
|
||
|
||
// Record host Network I/O (sum across all interfaces)
|
||
// Network bytes from the agent are cumulative totals since boot, not rates.
|
||
// We need to use the RateTracker to convert to bytes/second, just like VMs and containers.
|
||
var totalRXBytes, totalTXBytes uint64
|
||
for _, nic := range host.NetworkInterfaces {
|
||
totalRXBytes += nic.RXBytes
|
||
totalTXBytes += nic.TXBytes
|
||
}
|
||
|
||
// Calculate network rates using the RateTracker (convert cumulative bytes to bytes/sec)
|
||
hostRateKey := fmt.Sprintf("host:%s", host.ID)
|
||
currentMetrics := IOMetrics{
|
||
NetworkIn: int64(totalRXBytes),
|
||
NetworkOut: int64(totalTXBytes),
|
||
Timestamp: now,
|
||
}
|
||
_, _, netInRate, netOutRate := m.rateTracker.CalculateRates(hostRateKey, currentMetrics)
|
||
|
||
if m.metricsHistory != nil {
|
||
// Record host CPU usage
|
||
m.metricsHistory.AddGuestMetric(hostMetricKey, "cpu", host.CPUUsage, now)
|
||
|
||
// Record host Memory usage
|
||
m.metricsHistory.AddGuestMetric(hostMetricKey, "memory", host.Memory.Usage, now)
|
||
|
||
m.metricsHistory.AddGuestMetric(hostMetricKey, "disk", hostDiskPercent, now)
|
||
|
||
// Only record network rates if we have valid data (rate >= 0 means we have enough samples)
|
||
if netInRate >= 0 {
|
||
m.metricsHistory.AddGuestMetric(hostMetricKey, "netin", netInRate, now)
|
||
}
|
||
if netOutRate >= 0 {
|
||
m.metricsHistory.AddGuestMetric(hostMetricKey, "netout", netOutRate, now)
|
||
}
|
||
}
|
||
|
||
// Also write to persistent SQLite store
|
||
if m.metricsStore != nil {
|
||
m.metricsStore.Write("host", host.ID, "cpu", host.CPUUsage, now)
|
||
m.metricsStore.Write("host", host.ID, "memory", host.Memory.Usage, now)
|
||
m.metricsStore.Write("host", host.ID, "disk", hostDiskPercent, now)
|
||
// Only write network rates if we have valid data
|
||
if netInRate >= 0 {
|
||
m.metricsStore.Write("host", host.ID, "netin", netInRate, now)
|
||
}
|
||
if netOutRate >= 0 {
|
||
m.metricsStore.Write("host", host.ID, "netout", netOutRate, now)
|
||
}
|
||
}
|
||
|
||
return host, nil
|
||
}
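
// Ingestion sketch (hypothetical handler): the host identifier is resolved from
// report.Host.ID, then MachineID, Agent.ID, the hostname, and finally a SHA-1-derived
// fallback, so repeated reports from the same agent land on a stable models.Host.
//
//	host, err := monitor.ApplyHostReport(report, tokenRecord)
//	if err == nil {
//		log.Debug().Str("hostId", host.ID).Str("status", host.Status).Msg("host report applied")
//	}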
|
||
|
||
// findLinkedProxmoxEntity searches for a PVE node, VM, or container with a matching hostname.
|
||
// Returns the IDs of matched entities (empty string if no match).
|
||
// When multiple entities match the same hostname (e.g., two PVE instances both have a node
|
||
// named "pve"), this function returns empty strings to avoid incorrect linking. Users should
|
||
// manually link agents to nodes via the UI in such cases.
|
||
func (m *Monitor) findLinkedProxmoxEntity(hostname string) (nodeID, vmID, containerID string) {
|
||
if hostname == "" {
|
||
return "", "", ""
|
||
}
|
||
|
||
// Normalize hostname for comparison (lowercase, strip domain)
|
||
normalizedHostname := strings.ToLower(hostname)
|
||
shortHostname := normalizedHostname
|
||
if idx := strings.Index(normalizedHostname, "."); idx > 0 {
|
||
shortHostname = normalizedHostname[:idx]
|
||
}
|
||
|
||
matchHostname := func(name string) bool {
|
||
normalized := strings.ToLower(name)
|
||
if normalized == normalizedHostname || normalized == shortHostname {
|
||
return true
|
||
}
|
||
// Also check short version of the candidate
|
||
if idx := strings.Index(normalized, "."); idx > 0 {
|
||
if normalized[:idx] == shortHostname {
|
||
return true
|
||
}
|
||
}
|
||
return false
|
||
}
|
||
|
||
state := m.GetState()
|
||
|
||
// Check PVE nodes first - but detect ambiguity when multiple nodes match
|
||
var matchingNodes []models.Node
|
||
for _, node := range state.Nodes {
|
||
if matchHostname(node.Name) {
|
||
matchingNodes = append(matchingNodes, node)
|
||
}
|
||
}
|
||
if len(matchingNodes) == 1 {
|
||
return matchingNodes[0].ID, "", ""
|
||
}
|
||
if len(matchingNodes) > 1 {
|
||
// Multiple nodes with the same hostname - can't auto-link, would cause data mixing
|
||
log.Warn().
|
||
Str("hostname", hostname).
|
||
Int("matchCount", len(matchingNodes)).
|
||
Strs("instances", func() []string {
|
||
instances := make([]string, len(matchingNodes))
|
||
for i, n := range matchingNodes {
|
||
instances[i] = n.Instance
|
||
}
|
||
return instances
|
||
}()).
|
||
Msg("Multiple PVE nodes match hostname - cannot auto-link host agent. Manual linking required via UI.")
|
||
return "", "", ""
|
||
}
|
||
|
||
// Check VMs - same pattern for ambiguity detection
|
||
var matchingVMs []models.VM
|
||
for _, vm := range state.VMs {
|
||
if matchHostname(vm.Name) {
|
||
matchingVMs = append(matchingVMs, vm)
|
||
}
|
||
}
|
||
if len(matchingVMs) == 1 {
|
||
return "", matchingVMs[0].ID, ""
|
||
}
|
||
if len(matchingVMs) > 1 {
|
||
log.Warn().
|
||
Str("hostname", hostname).
|
||
Int("matchCount", len(matchingVMs)).
|
||
Msg("Multiple VMs match hostname - cannot auto-link host agent. Manual linking required via UI.")
|
||
return "", "", ""
|
||
}
|
||
|
||
// Check containers - same pattern
|
||
var matchingCTs []models.Container
|
||
for _, ct := range state.Containers {
|
||
if matchHostname(ct.Name) {
|
||
matchingCTs = append(matchingCTs, ct)
|
||
}
|
||
}
|
||
if len(matchingCTs) == 1 {
|
||
return "", "", matchingCTs[0].ID
|
||
}
|
||
if len(matchingCTs) > 1 {
|
||
log.Warn().
|
||
Str("hostname", hostname).
|
||
Int("matchCount", len(matchingCTs)).
|
||
Msg("Multiple containers match hostname - cannot auto-link host agent. Manual linking required via UI.")
|
||
return "", "", ""
|
||
}
|
||
|
||
return "", "", ""
|
||
}
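
// Matching sketch: comparisons are case-insensitive and fall back to the short
// (pre-dot) form, so an agent reporting "web01.example.lan" links to a node named
// "web01"; if two instances both expose a node called "pve", nothing is linked and
// a warning is logged instead.
//
//	nodeID, vmID, ctID := m.findLinkedProxmoxEntity("web01.example.lan") // hostname is illustrative
//	// at most one of the three return values is non-empty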

// linkNodeToHostAgent updates a PVE node to link to its host agent.
func (m *Monitor) linkNodeToHostAgent(nodeID, hostAgentID string) {
	m.state.LinkNodeToHostAgent(nodeID, hostAgentID)
}

const (
	removedDockerHostsTTL = 24 * time.Hour // Clean up removed hosts tracking after 24 hours
)

// recoverFromPanic recovers from panics in monitoring goroutines and logs them.
// This prevents a panic in one component from crashing the entire monitoring system.
func recoverFromPanic(goroutineName string) {
	if r := recover(); r != nil {
		log.Error().
			Str("goroutine", goroutineName).
			Interface("panic", r).
			Stack().
			Msg("Recovered from panic in monitoring goroutine")
	}
}
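
// Usage sketch: defer at the top of each monitoring goroutine so a panic in one
// poller is logged with a stack trace instead of crashing the whole process.
//
//	go func() {
//		defer recoverFromPanic("dockerHealthLoop") // goroutine name is illustrative
//		// polling work here
//	}()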
|
||
|
||
// cleanupRemovedDockerHosts removes entries from the removed hosts map that are older than 24 hours.
|
||
func (m *Monitor) cleanupRemovedDockerHosts(now time.Time) {
|
||
// Collect IDs to remove first to avoid holding lock during state update
|
||
var toRemove []string
|
||
|
||
m.mu.Lock()
|
||
for hostID, removedAt := range m.removedDockerHosts {
|
||
if now.Sub(removedAt) > removedDockerHostsTTL {
|
||
toRemove = append(toRemove, hostID)
|
||
}
|
||
}
|
||
m.mu.Unlock()
|
||
|
||
// Remove from state and map without holding both locks
|
||
for _, hostID := range toRemove {
|
||
m.state.RemoveRemovedDockerHost(hostID)
|
||
|
||
m.mu.Lock()
|
||
removedAt := m.removedDockerHosts[hostID]
|
||
delete(m.removedDockerHosts, hostID)
|
||
m.mu.Unlock()
|
||
|
||
log.Debug().
|
||
Str("dockerHostID", hostID).
|
||
Time("removedAt", removedAt).
|
||
Msg("Cleaned up old removed Docker host entry")
|
||
}
|
||
}
|
||
|
||
// cleanupGuestMetadataCache removes stale guest metadata entries.
|
||
// Entries older than 2x the cache TTL (10 minutes) are removed to prevent unbounded growth
|
||
// when VMs are deleted or moved.
|
||
func (m *Monitor) cleanupGuestMetadataCache(now time.Time) {
|
||
const maxAge = 2 * guestMetadataCacheTTL // 10 minutes
|
||
|
||
m.guestMetadataMu.Lock()
|
||
defer m.guestMetadataMu.Unlock()
|
||
|
||
for key, entry := range m.guestMetadataCache {
|
||
if now.Sub(entry.fetchedAt) > maxAge {
|
||
delete(m.guestMetadataCache, key)
|
||
log.Debug().
|
||
Str("key", key).
|
||
Time("fetchedAt", entry.fetchedAt).
|
||
Msg("Cleaned up stale guest metadata cache entry")
|
||
}
|
||
}
|
||
}
|
||
|
||
// cleanupTrackingMaps removes stale entries from various tracking maps to prevent unbounded memory growth.
|
||
// This cleans up auth tracking, polling timestamps, and circuit breaker state for resources
|
||
// that haven't been accessed in over 24 hours.
|
||
func (m *Monitor) cleanupTrackingMaps(now time.Time) {
|
||
const staleThreshold = 24 * time.Hour
|
||
cutoff := now.Add(-staleThreshold)
|
||
cleaned := 0
|
||
|
||
m.mu.Lock()
|
||
defer m.mu.Unlock()
|
||
|
||
// Clean up auth tracking maps - entries older than 24 hours
|
||
for nodeID, ts := range m.lastAuthAttempt {
|
||
if ts.Before(cutoff) {
|
||
delete(m.lastAuthAttempt, nodeID)
|
||
delete(m.authFailures, nodeID)
|
||
cleaned++
|
||
}
|
||
}
|
||
|
||
// Clean up last cluster check timestamps
|
||
for instanceID, ts := range m.lastClusterCheck {
|
||
if ts.Before(cutoff) {
|
||
delete(m.lastClusterCheck, instanceID)
|
||
cleaned++
|
||
}
|
||
}
|
||
|
||
// Clean up last physical disk poll timestamps
|
||
for instanceID, ts := range m.lastPhysicalDiskPoll {
|
||
if ts.Before(cutoff) {
|
||
delete(m.lastPhysicalDiskPoll, instanceID)
|
||
cleaned++
|
||
}
|
||
}
|
||
|
||
// Clean up last PVE backup poll timestamps
|
||
for instanceID, ts := range m.lastPVEBackupPoll {
|
||
if ts.Before(cutoff) {
|
||
delete(m.lastPVEBackupPoll, instanceID)
|
||
cleaned++
|
||
}
|
||
}
|
||
|
||
// Clean up last PBS backup poll timestamps
|
||
for instanceID, ts := range m.lastPBSBackupPoll {
|
||
if ts.Before(cutoff) {
|
||
delete(m.lastPBSBackupPoll, instanceID)
|
||
cleaned++
|
||
}
|
||
}
|
||
|
||
// Clean up circuit breakers for keys not in active clients
|
||
// Build set of active keys from pveClients and pbsClients
|
||
activeKeys := make(map[string]struct{})
|
||
for key := range m.pveClients {
|
||
activeKeys[key] = struct{}{}
|
||
}
|
||
for key := range m.pbsClients {
|
||
activeKeys[key] = struct{}{}
|
||
}
|
||
for key := range m.pmgClients {
|
||
activeKeys[key] = struct{}{}
|
||
}
|
||
|
||
// Only clean up circuit breakers for inactive keys that have been idle
|
||
// for longer than the stale threshold
|
||
for key, breaker := range m.circuitBreakers {
|
||
if _, active := activeKeys[key]; !active {
|
||
// Key is not in active clients - check if breaker is stale
|
||
if breaker != nil {
|
||
_, _, _, _, lastTransition := breaker.stateDetails()
|
||
if now.Sub(lastTransition) > staleThreshold {
|
||
delete(m.circuitBreakers, key)
|
||
delete(m.failureCounts, key)
|
||
delete(m.lastOutcome, key)
|
||
cleaned++
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
if cleaned > 0 {
|
||
log.Debug().
|
||
Int("entriesCleaned", cleaned).
|
||
Msg("Cleaned stale entries from monitor tracking maps")
|
||
}
|
||
}
|
||
|
||
// cleanupDiagnosticSnapshots removes stale diagnostic snapshots.
|
||
// Snapshots older than 1 hour are removed to prevent unbounded growth
|
||
// when nodes/VMs are deleted or reconfigured.
|
||
func (m *Monitor) cleanupDiagnosticSnapshots(now time.Time) {
|
||
const maxAge = 1 * time.Hour
|
||
|
||
m.diagMu.Lock()
|
||
defer m.diagMu.Unlock()
|
||
|
||
for key, snapshot := range m.nodeSnapshots {
|
||
if now.Sub(snapshot.RetrievedAt) > maxAge {
|
||
delete(m.nodeSnapshots, key)
|
||
log.Debug().
|
||
Str("key", key).
|
||
Time("retrievedAt", snapshot.RetrievedAt).
|
||
Msg("Cleaned up stale node snapshot")
|
||
}
|
||
}
|
||
|
||
for key, snapshot := range m.guestSnapshots {
|
||
if now.Sub(snapshot.RetrievedAt) > maxAge {
|
||
delete(m.guestSnapshots, key)
|
||
log.Debug().
|
||
Str("key", key).
|
||
Time("retrievedAt", snapshot.RetrievedAt).
|
||
Msg("Cleaned up stale guest snapshot")
|
||
}
|
||
}
|
||
}
|
||
|
||
// cleanupRRDCache removes stale RRD memory cache entries.
|
||
// Entries older than 2x the cache TTL (1 minute) are removed to prevent unbounded growth
|
||
// when nodes are removed from the cluster.
|
||
func (m *Monitor) cleanupRRDCache(now time.Time) {
|
||
const maxAge = 2 * nodeRRDCacheTTL // 1 minute
|
||
|
||
m.rrdCacheMu.Lock()
|
||
defer m.rrdCacheMu.Unlock()
|
||
|
||
for key, entry := range m.nodeRRDMemCache {
|
||
if now.Sub(entry.fetchedAt) > maxAge {
|
||
delete(m.nodeRRDMemCache, key)
|
||
log.Debug().
|
||
Str("node", key).
|
||
Time("fetchedAt", entry.fetchedAt).
|
||
Msg("Cleaned up stale RRD cache entry")
|
||
}
|
||
}
|
||
}
|
||
|
||
// cleanupMetricsHistory removes stale entries from the metrics history.
|
||
// This prevents unbounded memory growth when containers/VMs are deleted.
|
||
func (m *Monitor) cleanupMetricsHistory() {
|
||
if m.metricsHistory != nil {
|
||
m.metricsHistory.Cleanup()
|
||
}
|
||
}

// cleanupRateTracker removes stale entries from the rate tracker.
// Entries older than 24 hours are removed to prevent unbounded memory growth.
func (m *Monitor) cleanupRateTracker(now time.Time) {
	const staleThreshold = 24 * time.Hour
	cutoff := now.Add(-staleThreshold)

	if m.rateTracker != nil {
		if removed := m.rateTracker.Cleanup(cutoff); removed > 0 {
			log.Debug().
				Int("entriesRemoved", removed).
				Msg("Cleaned up stale rate tracker entries")
		}
	}
}
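
// Maintenance sketch (hypothetical sweep; the actual scheduling lives elsewhere in
// the monitor loop): the cleanup helpers above are idempotent, so they can share a
// single periodic pass.
//
//	for now := range time.Tick(time.Hour) {
//		m.cleanupRemovedDockerHosts(now)
//		m.cleanupGuestMetadataCache(now)
//		m.cleanupTrackingMaps(now)
//		m.cleanupDiagnosticSnapshots(now)
//		m.cleanupRRDCache(now)
//		m.cleanupMetricsHistory()
//		m.cleanupRateTracker(now)
//	}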

// evaluateDockerAgents updates health for Docker hosts based on last report time.
func (m *Monitor) evaluateDockerAgents(now time.Time) {
	hosts := m.state.GetDockerHosts()
	for _, host := range hosts {
		interval := host.IntervalSeconds
		if interval <= 0 {
			interval = int(dockerMinimumHealthWindow / time.Second)
		}

		window := time.Duration(interval) * time.Second * dockerOfflineGraceMultiplier
		if window < dockerMinimumHealthWindow {
			window = dockerMinimumHealthWindow
		} else if window > dockerMaximumHealthWindow {
			window = dockerMaximumHealthWindow
		}

		healthy := !host.LastSeen.IsZero() && now.Sub(host.LastSeen) <= window
		key := dockerConnectionPrefix + host.ID
		m.state.SetConnectionHealth(key, healthy)
		hostCopy := host
		if healthy {
			hostCopy.Status = "online"
			m.state.SetDockerHostStatus(host.ID, "online")
			if m.alertManager != nil {
				m.alertManager.HandleDockerHostOnline(hostCopy)
			}
		} else {
			hostCopy.Status = "offline"
			m.state.SetDockerHostStatus(host.ID, "offline")
			if m.alertManager != nil {
				m.alertManager.HandleDockerHostOffline(hostCopy)
			}
		}
	}
}
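
// Worked example of the health window: a host reporting every 30s with an offline
// grace multiplier of 3 (value illustrative; see dockerOfflineGraceMultiplier) gets
// a 90s window, which is then clamped between dockerMinimumHealthWindow and
// dockerMaximumHealthWindow; a host whose LastSeen is older than that window is
// marked offline and the alert manager is notified.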
|
||
|
||
// evaluateHostAgents updates health for host agents based on last report time.
|
||
func (m *Monitor) evaluateHostAgents(now time.Time) {
|
||
hosts := m.state.GetHosts()
|
||
for _, host := range hosts {
|
||
interval := host.IntervalSeconds
|
||
if interval <= 0 {
|
||
interval = int(hostMinimumHealthWindow / time.Second)
|
||
}
|
||
|
||
window := time.Duration(interval) * time.Second * hostOfflineGraceMultiplier
|
||
if window < hostMinimumHealthWindow {
|
||
window = hostMinimumHealthWindow
|
||
} else if window > hostMaximumHealthWindow {
|
||
window = hostMaximumHealthWindow
|
||
}
|
||
|
||
age := now.Sub(host.LastSeen)
|
||
healthy := !host.LastSeen.IsZero() && age <= window
|
||
key := hostConnectionPrefix + host.ID
|
||
m.state.SetConnectionHealth(key, healthy)
|
||
|
||
hostCopy := host
|
||
if healthy {
|
||
hostCopy.Status = "online"
|
||
// Log status transition from offline to online
|
||
if host.Status == "offline" {
|
||
log.Debug().
|
||
Str("hostID", host.ID).
|
||
Str("hostname", host.Hostname).
|
||
Dur("age", age).
|
||
Dur("window", window).
|
||
Msg("Host agent back online")
|
||
}
|
||
m.state.SetHostStatus(host.ID, "online")
|
||
if m.alertManager != nil {
|
||
m.alertManager.HandleHostOnline(hostCopy)
|
||
}
|
||
} else {
|
||
hostCopy.Status = "offline"
|
||
// Log status transition from online to offline with diagnostic info
|
||
if host.Status == "online" || host.Status == "" {
|
||
log.Debug().
|
||
Str("hostID", host.ID).
|
||
Str("hostname", host.Hostname).
|
||
Time("lastSeen", host.LastSeen).
|
||
Dur("age", age).
|
||
Dur("window", window).
|
||
Int("intervalSeconds", host.IntervalSeconds).
|
||
Bool("lastSeenZero", host.LastSeen.IsZero()).
|
||
Msg("Host agent appears offline")
|
||
}
|
||
m.state.SetHostStatus(host.ID, "offline")
|
||
if m.alertManager != nil {
|
||
m.alertManager.HandleHostOffline(hostCopy)
|
||
}
|
||
}
|
||
}
|
||
}

// sortContent sorts comma-separated content values for consistent display
func sortContent(content string) string {
	if content == "" {
		return ""
	}
	parts := strings.Split(content, ",")
	sort.Strings(parts)
	return strings.Join(parts, ",")
}
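
// Example: sortContent("vztmpl,iso,backup") returns "backup,iso,vztmpl", giving
// storage content types a stable order regardless of how the API reports them.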
|
||
|
||
func (m *Monitor) enrichContainerMetadata(ctx context.Context, client PVEClientInterface, instanceName, nodeName string, container *models.Container) {
|
||
if container == nil {
|
||
return
|
||
}
|
||
|
||
ensureContainerRootDiskEntry(container)
|
||
|
||
if client == nil {
|
||
return
|
||
}
|
||
|
||
isRunning := container.Status == "running"
|
||
|
||
var status *proxmox.Container
|
||
if isRunning {
|
||
statusCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||
statusResp, err := client.GetContainerStatus(statusCtx, nodeName, container.VMID)
|
||
cancel()
|
||
if err != nil {
|
||
log.Debug().
|
||
Err(err).
|
||
Str("instance", instanceName).
|
||
Str("node", nodeName).
|
||
Str("container", container.Name).
|
||
Int("vmid", container.VMID).
|
||
Msg("Container status metadata unavailable")
|
||
} else {
|
||
status = statusResp
|
||
}
|
||
}
|
||
|
||
rootDeviceHint := ""
|
||
var mountMetadata map[string]containerMountMetadata
|
||
addressSet := make(map[string]struct{})
|
||
addressOrder := make([]string, 0, 4)
|
||
|
||
addAddress := func(addr string) {
|
||
addr = strings.TrimSpace(addr)
|
||
if addr == "" {
|
||
return
|
||
}
|
||
if _, exists := addressSet[addr]; exists {
|
||
return
|
||
}
|
||
addressSet[addr] = struct{}{}
|
||
addressOrder = append(addressOrder, addr)
|
||
}
|
||
|
||
if status != nil {
|
||
for _, addr := range sanitizeGuestAddressStrings(status.IP) {
|
||
addAddress(addr)
|
||
}
|
||
for _, addr := range sanitizeGuestAddressStrings(status.IP6) {
|
||
addAddress(addr)
|
||
}
|
||
for _, addr := range parseContainerRawIPs(status.IPv4) {
|
||
addAddress(addr)
|
||
}
|
||
for _, addr := range parseContainerRawIPs(status.IPv6) {
|
||
addAddress(addr)
|
||
}
|
||
}
|
||
|
||
networkIfaces := make([]models.GuestNetworkInterface, 0, 4)
|
||
if status != nil {
|
||
networkIfaces = make([]models.GuestNetworkInterface, 0, len(status.Network))
|
||
for rawName, cfg := range status.Network {
|
||
if cfg == (proxmox.ContainerNetworkConfig{}) {
|
||
continue
|
||
}
|
||
|
||
iface := models.GuestNetworkInterface{}
|
||
name := strings.TrimSpace(cfg.Name)
|
||
if name == "" {
|
||
name = strings.TrimSpace(rawName)
|
||
}
|
||
if name != "" {
|
||
iface.Name = name
|
||
}
|
||
if mac := strings.TrimSpace(cfg.HWAddr); mac != "" {
|
||
iface.MAC = mac
|
||
}
|
||
|
||
addrCandidates := make([]string, 0, 4)
|
||
addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IP)...)
|
||
addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IP6)...)
|
||
addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IPv4)...)
|
||
addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IPv6)...)
|
||
|
||
if len(addrCandidates) > 0 {
|
||
deduped := dedupeStringsPreserveOrder(addrCandidates)
|
||
if len(deduped) > 0 {
|
||
iface.Addresses = deduped
|
||
for _, addr := range deduped {
|
||
addAddress(addr)
|
||
}
|
||
}
|
||
}
|
||
|
||
if iface.Name != "" || iface.MAC != "" || len(iface.Addresses) > 0 {
|
||
networkIfaces = append(networkIfaces, iface)
|
||
}
|
||
}
|
||
}
|
||
|
||
	configCtx, cancelConfig := context.WithTimeout(ctx, 5*time.Second)
	configData, configErr := client.GetContainerConfig(configCtx, nodeName, container.VMID)
	cancelConfig()
	if configErr != nil {
		log.Debug().
			Err(configErr).
			Str("instance", instanceName).
			Str("node", nodeName).
			Str("container", container.Name).
			Int("vmid", container.VMID).
			Msg("Container config metadata unavailable")
	} else if len(configData) > 0 {
		mountMetadata = parseContainerMountMetadata(configData)
		if rootDeviceHint == "" {
			if meta, ok := mountMetadata["rootfs"]; ok && meta.Source != "" {
				rootDeviceHint = meta.Source
			}
		}
		if rootDeviceHint == "" {
			if hint := extractContainerRootDeviceFromConfig(configData); hint != "" {
				rootDeviceHint = hint
			}
		}
		for _, detail := range parseContainerConfigNetworks(configData) {
			if len(detail.Addresses) > 0 {
				for _, addr := range detail.Addresses {
					addAddress(addr)
				}
			}
			mergeContainerNetworkInterface(&networkIfaces, detail)
		}
		// Extract OS type from container config
		if osName := extractContainerOSType(configData); osName != "" {
			container.OSName = osName
		}
		// Detect OCI containers (Proxmox VE 9.1+)
		// Method 1: Check ostemplate for OCI registry patterns
		if osTemplate := extractContainerOSTemplate(configData); osTemplate != "" {
			container.OSTemplate = osTemplate
			if isOCITemplate(osTemplate) {
				container.IsOCI = true
				container.Type = "oci"
				log.Debug().
					Str("container", container.Name).
					Int("vmid", container.VMID).
					Str("osTemplate", osTemplate).
					Msg("Detected OCI container by template")
			}
		}
		// Method 2: Check config fields (entrypoint, ostype, cmode)
		// This is needed because Proxmox doesn't persist ostemplate after creation
		if !container.IsOCI && isOCIContainerByConfig(configData) {
			container.IsOCI = true
			container.Type = "oci"
			log.Debug().
				Str("container", container.Name).
				Int("vmid", container.VMID).
				Msg("Detected OCI container by config (entrypoint/ostype)")
		}
	}

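	// Fall back to querying the live container interfaces when neither the status
	// nor the config produced any addresses (running containers only).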
	if len(addressOrder) == 0 {
		if isRunning {
			interfacesCtx, cancelInterfaces := context.WithTimeout(ctx, 5*time.Second)
			ifaceDetails, ifaceErr := client.GetContainerInterfaces(interfacesCtx, nodeName, container.VMID)
			cancelInterfaces()
			if ifaceErr != nil {
				log.Debug().
					Err(ifaceErr).
					Str("instance", instanceName).
					Str("node", nodeName).
					Str("container", container.Name).
					Int("vmid", container.VMID).
					Msg("Container interface metadata unavailable")
			} else if len(ifaceDetails) > 0 {
				for _, detail := range ifaceDetails {
					parsed := containerNetworkDetails{}
					parsed.Name = strings.TrimSpace(detail.Name)
					parsed.MAC = strings.ToUpper(strings.TrimSpace(detail.HWAddr))

					for _, addr := range detail.IPAddresses {
						stripped := strings.TrimSpace(addr.Address)
						if stripped == "" {
							continue
						}
						if slash := strings.Index(stripped, "/"); slash > 0 {
							stripped = stripped[:slash]
						}
						parsed.Addresses = append(parsed.Addresses, sanitizeGuestAddressStrings(stripped)...)
					}

					if len(parsed.Addresses) == 0 && strings.TrimSpace(detail.Inet) != "" {
						parts := strings.Fields(detail.Inet)
						for _, part := range parts {
							stripped := strings.TrimSpace(part)
							if stripped == "" {
								continue
							}
							if slash := strings.Index(stripped, "/"); slash > 0 {
								stripped = stripped[:slash]
							}
							parsed.Addresses = append(parsed.Addresses, sanitizeGuestAddressStrings(stripped)...)
						}
					}

					parsed.Addresses = dedupeStringsPreserveOrder(parsed.Addresses)

					if len(parsed.Addresses) > 0 {
						for _, addr := range parsed.Addresses {
							addAddress(addr)
						}
					}

					if parsed.Name != "" || parsed.MAC != "" || len(parsed.Addresses) > 0 {
						mergeContainerNetworkInterface(&networkIfaces, parsed)
					}
				}
			}
		}
	}

	if len(networkIfaces) > 1 {
		sort.SliceStable(networkIfaces, func(i, j int) bool {
			left := strings.TrimSpace(networkIfaces[i].Name)
			right := strings.TrimSpace(networkIfaces[j].Name)
			return left < right
		})
	}

	if len(addressOrder) > 1 {
		sort.Strings(addressOrder)
	}

	if len(addressOrder) > 0 {
		container.IPAddresses = addressOrder
	}

	if len(networkIfaces) > 0 {
		container.NetworkInterfaces = networkIfaces
	}

	if disks := convertContainerDiskInfo(status, mountMetadata); len(disks) > 0 {
		container.Disks = disks
	}

	ensureContainerRootDiskEntry(container)

	if rootDeviceHint != "" && len(container.Disks) > 0 {
		for i := range container.Disks {
			if container.Disks[i].Mountpoint == "/" && container.Disks[i].Device == "" {
				container.Disks[i].Device = rootDeviceHint
			}
		}
	}
}

// GetConnectionStatuses returns the current connection status for all nodes
func (m *Monitor) GetConnectionStatuses() map[string]bool {
	if mock.IsMockEnabled() {
		statuses := make(map[string]bool)
		state := mock.GetMockState()
		for _, node := range state.Nodes {
			key := "pve-" + node.Name
			statuses[key] = strings.ToLower(node.Status) == "online"
			if node.Host != "" {
				statuses[node.Host] = strings.ToLower(node.Status) == "online"
			}
		}
		for _, pbsInst := range state.PBSInstances {
			key := "pbs-" + pbsInst.Name
			statuses[key] = strings.ToLower(pbsInst.Status) != "offline"
			if pbsInst.Host != "" {
				statuses[pbsInst.Host] = strings.ToLower(pbsInst.Status) != "offline"
			}
		}

		for _, dockerHost := range state.DockerHosts {
			key := dockerConnectionPrefix + dockerHost.ID
			statuses[key] = strings.ToLower(dockerHost.Status) == "online"
		}
		return statuses
	}

	m.mu.RLock()
	defer m.mu.RUnlock()

	statuses := make(map[string]bool)

	// Check all configured PVE nodes (not just ones with clients)
	for _, pve := range m.config.PVEInstances {
		key := "pve-" + pve.Name
		// Check if we have a client for this node
		if client, exists := m.pveClients[pve.Name]; exists && client != nil {
			// We have a client, check actual connection health from state
			if m.state != nil && m.state.ConnectionHealth != nil {
				statuses[key] = m.state.ConnectionHealth[pve.Name]
			} else {
				statuses[key] = true // Assume connected if we have a client
			}
		} else {
			// No client means disconnected
			statuses[key] = false
		}
	}

	// Check all configured PBS nodes (not just ones with clients)
	for _, pbs := range m.config.PBSInstances {
		key := "pbs-" + pbs.Name
		// Check if we have a client for this node
		if client, exists := m.pbsClients[pbs.Name]; exists && client != nil {
			// We have a client, check actual connection health from state
			if m.state != nil && m.state.ConnectionHealth != nil {
				statuses[key] = m.state.ConnectionHealth["pbs-"+pbs.Name]
			} else {
				statuses[key] = true // Assume connected if we have a client
			}
		} else {
			// No client means disconnected
			statuses[key] = false
		}
	}

	return statuses
}

// checkContainerizedTempMonitoring logs a security warning if Pulse is running
// in a container with SSH-based temperature monitoring enabled
func checkContainerizedTempMonitoring() {
	// Check if running in container
	isContainer := os.Getenv("PULSE_DOCKER") == "true" || system.InContainer()
	if !isContainer {
		return
	}

	// Check if SSH keys exist (indicates temperature monitoring is configured)
	homeDir := os.Getenv("HOME")
	if homeDir == "" {
		homeDir = "/home/pulse"
	}
	sshKeyPath := homeDir + "/.ssh/id_ed25519"
	if _, err := os.Stat(sshKeyPath); err != nil {
		// No SSH key found, temperature monitoring not configured
		return
	}

	// Log warning
	log.Warn().
		Msg("SECURITY NOTICE: Pulse is running in a container with SSH-based temperature monitoring enabled. " +
			"SSH private keys are stored inside the container, which could be a security risk if the container is compromised. " +
			"Future versions will use agent-based architecture for better security. " +
			"See documentation for hardening recommendations.")
}

// New creates a new Monitor instance
|
||
func New(cfg *config.Config) (*Monitor, error) {
|
||
if cfg == nil {
|
||
return nil, fmt.Errorf("config cannot be nil")
|
||
}
|
||
|
||
// Initialize temperature collector with sensors SSH key
|
||
// Will use root user for now - can be made configurable later
|
||
homeDir := os.Getenv("HOME")
|
||
if homeDir == "" {
|
||
homeDir = "/home/pulse"
|
||
}
|
||
sshKeyPath := filepath.Join(homeDir, ".ssh/id_ed25519_sensors")
|
||
tempCollector := NewTemperatureCollectorWithPort("root", sshKeyPath, cfg.SSHPort)
|
||
|
||
// Security warning if running in container with SSH temperature monitoring
|
||
checkContainerizedTempMonitoring()
|
||
|
||
stalenessTracker := NewStalenessTracker(getPollMetrics())
|
||
stalenessTracker.SetBounds(cfg.AdaptivePollingBaseInterval, cfg.AdaptivePollingMaxInterval)
|
||
taskQueue := NewTaskQueue()
|
||
deadLetterQueue := NewTaskQueue()
|
||
breakers := make(map[string]*circuitBreaker)
|
||
failureCounts := make(map[string]int)
|
||
lastOutcome := make(map[string]taskOutcome)
|
||
backoff := backoffConfig{
|
||
Initial: 5 * time.Second,
|
||
Multiplier: 2,
|
||
Jitter: 0.2,
|
||
Max: 5 * time.Minute,
|
||
}
|
||
|
||
if cfg.AdaptivePollingEnabled && cfg.AdaptivePollingMaxInterval > 0 && cfg.AdaptivePollingMaxInterval <= 15*time.Second {
|
||
backoff.Initial = 750 * time.Millisecond
|
||
backoff.Max = 6 * time.Second
|
||
}
|
||
|
||
var scheduler *AdaptiveScheduler
|
||
if cfg.AdaptivePollingEnabled {
|
||
scheduler = NewAdaptiveScheduler(SchedulerConfig{
|
||
BaseInterval: cfg.AdaptivePollingBaseInterval,
|
||
MinInterval: cfg.AdaptivePollingMinInterval,
|
||
MaxInterval: cfg.AdaptivePollingMaxInterval,
|
||
}, stalenessTracker, nil, nil)
|
||
}
|
||
|
||
minRefresh := cfg.GuestMetadataMinRefreshInterval
|
||
if minRefresh <= 0 {
|
||
minRefresh = config.DefaultGuestMetadataMinRefresh
|
||
}
|
||
jitter := cfg.GuestMetadataRefreshJitter
|
||
if jitter < 0 {
|
||
jitter = 0
|
||
}
|
||
retryBackoff := cfg.GuestMetadataRetryBackoff
|
||
if retryBackoff <= 0 {
|
||
retryBackoff = config.DefaultGuestMetadataRetryBackoff
|
||
}
|
||
concurrency := cfg.GuestMetadataMaxConcurrent
|
||
if concurrency <= 0 {
|
||
concurrency = config.DefaultGuestMetadataMaxConcurrent
|
||
}
|
||
holdDuration := defaultGuestMetadataHold
|
||
|
||
// Load guest agent timeout configuration from environment variables (refs #592)
|
||
guestAgentFSInfoTimeout := parseDurationEnv("GUEST_AGENT_FSINFO_TIMEOUT", defaultGuestAgentFSInfoTimeout)
|
||
guestAgentNetworkTimeout := parseDurationEnv("GUEST_AGENT_NETWORK_TIMEOUT", defaultGuestAgentNetworkTimeout)
|
||
guestAgentOSInfoTimeout := parseDurationEnv("GUEST_AGENT_OSINFO_TIMEOUT", defaultGuestAgentOSInfoTimeout)
|
||
guestAgentVersionTimeout := parseDurationEnv("GUEST_AGENT_VERSION_TIMEOUT", defaultGuestAgentVersionTimeout)
|
||
guestAgentRetries := parseIntEnv("GUEST_AGENT_RETRIES", defaultGuestAgentRetries)
|
||
|
||
// Initialize persistent metrics store (SQLite) with configurable retention
|
||
var metricsStore *metrics.Store
|
||
metricsStoreConfig := metrics.DefaultConfig(cfg.DataPath)
|
||
// Override retention settings from config (allows tier-based pricing in future)
|
||
if cfg.MetricsRetentionRawHours > 0 {
|
||
metricsStoreConfig.RetentionRaw = time.Duration(cfg.MetricsRetentionRawHours) * time.Hour
|
||
}
|
||
if cfg.MetricsRetentionMinuteHours > 0 {
|
||
metricsStoreConfig.RetentionMinute = time.Duration(cfg.MetricsRetentionMinuteHours) * time.Hour
|
||
}
|
||
if cfg.MetricsRetentionHourlyDays > 0 {
|
||
metricsStoreConfig.RetentionHourly = time.Duration(cfg.MetricsRetentionHourlyDays) * 24 * time.Hour
|
||
}
|
||
if cfg.MetricsRetentionDailyDays > 0 {
|
||
metricsStoreConfig.RetentionDaily = time.Duration(cfg.MetricsRetentionDailyDays) * 24 * time.Hour
|
||
}
|
||
|
||
// In mock mode, extend ALL tier retentions to 90 days to match the seeded
|
||
// data range. Different query ranges use different tiers, so all need coverage.
|
||
// Also increase buffer size to handle heavy initial seeding.
|
||
if mock.IsMockEnabled() {
|
||
metricsStoreConfig.WriteBufferSize = 2000
|
||
metricsStoreConfig.RetentionRaw = 90 * 24 * time.Hour
|
||
metricsStoreConfig.RetentionMinute = 90 * 24 * time.Hour
|
||
metricsStoreConfig.RetentionHourly = 90 * 24 * time.Hour
|
||
metricsStoreConfig.RetentionDaily = 90 * 24 * time.Hour
|
||
}
|
||
ms, err := metrics.NewStore(metricsStoreConfig)
|
||
if err != nil {
|
||
// Do not automatically delete the DB on error, as it causes data loss on transient errors (e.g. locks).
|
||
// If the DB is truly corrupted, the user should manually remove it.
|
||
log.Error().Err(err).Msg("Failed to initialize persistent metrics store - continuing without metrics persistence")
|
||
} else {
|
||
if mock.IsMockEnabled() {
|
||
ms.SetMaxOpenConns(10)
|
||
}
|
||
metricsStore = ms
|
||
log.Info().
|
||
Str("path", metricsStoreConfig.DBPath).
|
||
Dur("retentionRaw", metricsStoreConfig.RetentionRaw).
|
||
Dur("retentionMinute", metricsStoreConfig.RetentionMinute).
|
||
Dur("retentionHourly", metricsStoreConfig.RetentionHourly).
|
||
Dur("retentionDaily", metricsStoreConfig.RetentionDaily).
|
||
Msg("Persistent metrics store initialized with configurable retention")
|
||
}
|
||
|
||
incidentStore := memory.NewIncidentStore(memory.IncidentStoreConfig{
|
||
DataDir: cfg.DataPath,
|
||
})
|
||
|
||
m := &Monitor{
|
||
config: cfg,
|
||
state: models.NewState(),
|
||
pveClients: make(map[string]PVEClientInterface),
|
||
pbsClients: make(map[string]*pbs.Client),
|
||
pmgClients: make(map[string]*pmg.Client),
|
||
pollMetrics: getPollMetrics(),
|
||
scheduler: scheduler,
|
||
stalenessTracker: stalenessTracker,
|
||
taskQueue: taskQueue,
|
||
pollTimeout: derivePollTimeout(cfg),
|
||
deadLetterQueue: deadLetterQueue,
|
||
circuitBreakers: breakers,
|
||
failureCounts: failureCounts,
|
||
lastOutcome: lastOutcome,
|
||
backoffCfg: backoff,
|
||
rng: rand.New(rand.NewSource(time.Now().UnixNano())),
|
||
maxRetryAttempts: 5,
|
||
tempCollector: tempCollector,
|
||
guestMetadataStore: config.NewGuestMetadataStore(cfg.DataPath, nil),
|
||
dockerMetadataStore: config.NewDockerMetadataStore(cfg.DataPath, nil),
|
||
hostMetadataStore: config.NewHostMetadataStore(cfg.DataPath, nil),
|
||
startTime: time.Now(),
|
||
rateTracker: NewRateTracker(),
|
||
metricsHistory: NewMetricsHistory(1000, 24*time.Hour), // Keep up to 1000 points (~8h @ 30s)
|
||
metricsStore: metricsStore, // Persistent SQLite storage
|
||
alertManager: alerts.NewManagerWithDataDir(cfg.DataPath),
|
||
incidentStore: incidentStore,
|
||
notificationMgr: notifications.NewNotificationManagerWithDataDir(cfg.PublicURL, cfg.DataPath),
|
||
configPersist: config.NewConfigPersistence(cfg.DataPath),
|
||
discoveryService: nil, // Will be initialized in Start()
|
||
authFailures: make(map[string]int),
|
||
lastAuthAttempt: make(map[string]time.Time),
|
||
lastClusterCheck: make(map[string]time.Time),
|
||
lastPhysicalDiskPoll: make(map[string]time.Time),
|
||
lastPVEBackupPoll: make(map[string]time.Time),
|
||
lastPBSBackupPoll: make(map[string]time.Time),
|
||
backupPermissionWarnings: make(map[string]string),
|
||
persistence: config.NewConfigPersistence(cfg.DataPath),
|
||
pbsBackupPollers: make(map[string]bool),
|
||
pbsBackupCacheTime: make(map[string]map[pbsBackupGroupKey]time.Time),
|
||
nodeSnapshots: make(map[string]NodeMemorySnapshot),
|
||
guestSnapshots: make(map[string]GuestMemorySnapshot),
|
||
nodeRRDMemCache: make(map[string]rrdMemCacheEntry),
|
||
removedDockerHosts: make(map[string]time.Time),
|
||
dockerTokenBindings: make(map[string]string),
|
||
removedKubernetesClusters: make(map[string]time.Time),
|
||
kubernetesTokenBindings: make(map[string]string),
|
||
hostTokenBindings: make(map[string]string),
|
||
dockerCommands: make(map[string]*dockerHostCommand),
|
||
dockerCommandIndex: make(map[string]string),
|
||
guestMetadataCache: make(map[string]guestMetadataCacheEntry),
|
||
guestMetadataLimiter: make(map[string]time.Time),
|
||
guestMetadataMinRefresh: minRefresh,
|
||
guestMetadataRefreshJitter: jitter,
|
||
guestMetadataRetryBackoff: retryBackoff,
|
||
guestMetadataHoldDuration: holdDuration,
|
||
guestAgentFSInfoTimeout: guestAgentFSInfoTimeout,
|
||
guestAgentNetworkTimeout: guestAgentNetworkTimeout,
|
||
guestAgentOSInfoTimeout: guestAgentOSInfoTimeout,
|
||
guestAgentVersionTimeout: guestAgentVersionTimeout,
|
||
guestAgentRetries: guestAgentRetries,
|
||
instanceInfoCache: make(map[string]*instanceInfo),
|
||
pollStatusMap: make(map[string]*pollStatus),
|
||
dlqInsightMap: make(map[string]*dlqInsight),
|
||
nodeLastOnline: make(map[string]time.Time),
|
||
nodePendingUpdatesCache: make(map[string]pendingUpdatesCache),
|
||
}
|
||
|
||
m.breakerBaseRetry = 5 * time.Second
|
||
m.breakerMaxDelay = 5 * time.Minute
|
||
m.breakerHalfOpenWindow = 30 * time.Second
|
||
|
||
if cfg.AdaptivePollingEnabled && cfg.AdaptivePollingMaxInterval > 0 && cfg.AdaptivePollingMaxInterval <= 15*time.Second {
|
||
m.breakerBaseRetry = 2 * time.Second
|
||
m.breakerMaxDelay = 10 * time.Second
|
||
m.breakerHalfOpenWindow = 2 * time.Second
|
||
}
|
||
|
||
m.executor = newRealExecutor(m)
|
||
m.buildInstanceInfoCache(cfg)
|
||
|
||
// Initialize state with config values
|
||
m.state.TemperatureMonitoringEnabled = cfg.TemperatureMonitoringEnabled
|
||
|
||
if m.pollMetrics != nil {
|
||
m.pollMetrics.ResetQueueDepth(0)
|
||
}
|
||
|
||
// Load saved configurations
|
||
if alertConfig, err := m.configPersist.LoadAlertConfig(); err == nil {
|
||
m.alertManager.UpdateConfig(*alertConfig)
|
||
// Apply schedule settings to notification manager
|
||
m.notificationMgr.SetCooldown(alertConfig.Schedule.Cooldown)
|
||
groupWindow := alertConfig.Schedule.Grouping.Window
|
||
if groupWindow == 0 && alertConfig.Schedule.GroupingWindow != 0 {
|
||
groupWindow = alertConfig.Schedule.GroupingWindow
|
||
}
|
||
m.notificationMgr.SetGroupingWindow(groupWindow)
|
||
m.notificationMgr.SetGroupingOptions(
|
||
alertConfig.Schedule.Grouping.ByNode,
|
||
alertConfig.Schedule.Grouping.ByGuest,
|
||
)
|
||
m.notificationMgr.SetNotifyOnResolve(alertConfig.Schedule.NotifyOnResolve)
|
||
} else {
|
||
log.Warn().Err(err).Msg("Failed to load alert configuration")
|
||
}
|
||
|
||
if emailConfig, err := m.configPersist.LoadEmailConfig(); err == nil {
|
||
m.notificationMgr.SetEmailConfig(*emailConfig)
|
||
} else {
|
||
log.Warn().Err(err).Msg("Failed to load email configuration")
|
||
}
|
||
|
||
if concurrency > 0 {
|
||
m.guestMetadataSlots = make(chan struct{}, concurrency)
|
||
}
|
||
|
||
if appriseConfig, err := m.configPersist.LoadAppriseConfig(); err == nil {
|
||
m.notificationMgr.SetAppriseConfig(*appriseConfig)
|
||
} else {
|
||
log.Warn().Err(err).Msg("Failed to load Apprise configuration")
|
||
}
|
||
|
||
// Migrate webhooks if needed (from unencrypted to encrypted)
|
||
if err := m.configPersist.MigrateWebhooksIfNeeded(); err != nil {
|
||
log.Warn().Err(err).Msg("Failed to migrate webhooks")
|
||
}
|
||
|
||
if webhooks, err := m.configPersist.LoadWebhooks(); err == nil {
|
||
for _, webhook := range webhooks {
|
||
m.notificationMgr.AddWebhook(webhook)
|
||
}
|
||
} else {
|
||
log.Warn().Err(err).Msg("Failed to load webhook configuration")
|
||
}
|
||
|
||
// Check if mock mode is enabled before initializing clients
|
||
mockEnabled := mock.IsMockEnabled()
|
||
|
||
if mockEnabled {
|
||
log.Info().Msg("Mock mode enabled - skipping PVE/PBS client initialization")
|
||
} else {
|
||
// Initialize PVE clients
|
||
log.Info().Int("count", len(cfg.PVEInstances)).Msg("Initializing PVE clients")
|
||
for _, pve := range cfg.PVEInstances {
|
||
log.Info().
|
||
Str("name", pve.Name).
|
||
Str("host", pve.Host).
|
||
Str("user", pve.User).
|
||
Bool("hasToken", pve.TokenName != "").
|
||
Msg("Configuring PVE instance")
|
||
|
||
// Check if this is a cluster
|
||
if pve.IsCluster && len(pve.ClusterEndpoints) > 0 {
|
||
// For clusters, check if endpoints have IPs/resolvable hosts
|
||
// If not, use the main host for all connections (Proxmox will route cluster API calls)
|
||
hasValidEndpoints := false
|
||
endpoints := make([]string, 0, len(pve.ClusterEndpoints))
|
||
endpointFingerprints := make(map[string]string)
|
||
|
||
for _, ep := range pve.ClusterEndpoints {
|
||
hasFingerprint := pve.Fingerprint != ""
|
||
effectiveURL := clusterEndpointEffectiveURL(ep, pve.VerifySSL, hasFingerprint)
|
||
if effectiveURL == "" {
|
||
log.Warn().
|
||
Str("node", ep.NodeName).
|
||
Msg("Skipping cluster endpoint with no host/IP")
|
||
continue
|
||
}
|
||
|
||
if parsed, err := url.Parse(effectiveURL); err == nil {
|
||
hostname := parsed.Hostname()
|
||
if hostname != "" && (strings.Contains(hostname, ".") || net.ParseIP(hostname) != nil) {
|
||
hasValidEndpoints = true
|
||
}
|
||
} else {
|
||
hostname := normalizeEndpointHost(effectiveURL)
|
||
if hostname != "" && (strings.Contains(hostname, ".") || net.ParseIP(hostname) != nil) {
|
||
hasValidEndpoints = true
|
||
}
|
||
}
|
||
|
||
endpoints = append(endpoints, effectiveURL)
|
||
// Store per-endpoint fingerprint for TOFU (Trust On First Use)
|
||
if ep.Fingerprint != "" {
|
||
endpointFingerprints[effectiveURL] = ep.Fingerprint
|
||
}
|
||
}
|
||
|
||
// If endpoints are just node names (not FQDNs or IPs), use main host only
|
||
// This is common when cluster nodes are discovered but not directly reachable
|
||
if !hasValidEndpoints || len(endpoints) == 0 {
|
||
log.Info().
|
||
Str("instance", pve.Name).
|
||
Str("mainHost", pve.Host).
|
||
Msg("Cluster endpoints are not resolvable, using main host for all cluster operations")
|
||
					fallback := ensureClusterEndpointURL(pve.Host)
					endpoints = []string{fallback}
|
||
} else {
|
||
// Always include the main host URL as a fallback endpoint.
|
||
// This handles remote cluster scenarios where Proxmox reports internal IPs
|
||
// that aren't reachable from Pulse's network. The user-provided URL is
|
||
// reachable, so include it as a fallback for cluster API routing.
|
||
mainHostURL := ensureClusterEndpointURL(pve.Host)
|
||
mainHostAlreadyIncluded := false
|
||
for _, ep := range endpoints {
|
||
if ep == mainHostURL {
|
||
mainHostAlreadyIncluded = true
|
||
break
|
||
}
|
||
}
|
||
if !mainHostAlreadyIncluded && mainHostURL != "" {
|
||
log.Info().
|
||
Str("instance", pve.Name).
|
||
Str("mainHost", mainHostURL).
|
||
Int("clusterEndpoints", len(endpoints)).
|
||
Msg("Adding main host as fallback for remote cluster access")
|
||
endpoints = append(endpoints, mainHostURL)
|
||
}
|
||
}
|
||
|
||
log.Info().
|
||
Str("cluster", pve.ClusterName).
|
||
Strs("endpoints", endpoints).
|
||
Int("fingerprints", len(endpointFingerprints)).
|
||
Msg("Creating cluster-aware client")
|
||
|
||
clientConfig := config.CreateProxmoxConfig(&pve)
|
||
clientConfig.Timeout = cfg.ConnectionTimeout
|
||
clusterClient := proxmox.NewClusterClient(
|
||
pve.Name,
|
||
clientConfig,
|
||
endpoints,
|
||
endpointFingerprints,
|
||
)
|
||
m.pveClients[pve.Name] = clusterClient
|
||
log.Info().
|
||
Str("instance", pve.Name).
|
||
Str("cluster", pve.ClusterName).
|
||
Int("endpoints", len(endpoints)).
|
||
Msg("Cluster client created successfully")
|
||
// Set initial connection health to true for cluster
|
||
m.state.SetConnectionHealth(pve.Name, true)
|
||
} else {
|
||
// Create regular client
|
||
clientConfig := config.CreateProxmoxConfig(&pve)
|
||
clientConfig.Timeout = cfg.ConnectionTimeout
|
||
client, err := newProxmoxClientFunc(clientConfig)
|
||
if err != nil {
|
||
monErr := errors.WrapConnectionError("create_pve_client", pve.Name, err)
|
||
log.Error().
|
||
Err(monErr).
|
||
Str("instance", pve.Name).
|
||
Str("host", pve.Host).
|
||
Str("user", pve.User).
|
||
Bool("hasPassword", pve.Password != "").
|
||
Bool("hasToken", pve.TokenValue != "").
|
||
Msg("Failed to create PVE client - node will show as disconnected")
|
||
// Set initial connection health to false for this node
|
||
m.state.SetConnectionHealth(pve.Name, false)
|
||
continue
|
||
}
|
||
m.pveClients[pve.Name] = client
|
||
log.Info().Str("instance", pve.Name).Msg("PVE client created successfully")
|
||
// Set initial connection health to true
|
||
m.state.SetConnectionHealth(pve.Name, true)
|
||
}
|
||
}
|
||
|
||
// Initialize PBS clients
|
||
log.Info().Int("count", len(cfg.PBSInstances)).Msg("Initializing PBS clients")
|
||
for _, pbsInst := range cfg.PBSInstances {
|
||
log.Info().
|
||
Str("name", pbsInst.Name).
|
||
Str("host", pbsInst.Host).
|
||
Str("user", pbsInst.User).
|
||
Bool("hasToken", pbsInst.TokenName != "").
|
||
Msg("Configuring PBS instance")
|
||
|
||
clientConfig := config.CreatePBSConfig(&pbsInst)
|
||
clientConfig.Timeout = 60 * time.Second // Very generous timeout for slow PBS servers
|
||
client, err := pbs.NewClient(clientConfig)
|
||
if err != nil {
|
||
monErr := errors.WrapConnectionError("create_pbs_client", pbsInst.Name, err)
|
||
log.Error().
|
||
Err(monErr).
|
||
Str("instance", pbsInst.Name).
|
||
Str("host", pbsInst.Host).
|
||
Str("user", pbsInst.User).
|
||
Bool("hasPassword", pbsInst.Password != "").
|
||
Bool("hasToken", pbsInst.TokenValue != "").
|
||
Msg("Failed to create PBS client - node will show as disconnected")
|
||
// Set initial connection health to false for this node
|
||
m.state.SetConnectionHealth("pbs-"+pbsInst.Name, false)
|
||
continue
|
||
}
|
||
m.pbsClients[pbsInst.Name] = client
|
||
log.Info().Str("instance", pbsInst.Name).Msg("PBS client created successfully")
|
||
// Set initial connection health to true
|
||
m.state.SetConnectionHealth("pbs-"+pbsInst.Name, true)
|
||
}
|
||
|
||
// Initialize PMG clients
|
||
log.Info().Int("count", len(cfg.PMGInstances)).Msg("Initializing PMG clients")
|
||
for _, pmgInst := range cfg.PMGInstances {
|
||
log.Info().
|
||
Str("name", pmgInst.Name).
|
||
Str("host", pmgInst.Host).
|
||
Str("user", pmgInst.User).
|
||
Bool("hasToken", pmgInst.TokenName != "").
|
||
Msg("Configuring PMG instance")
|
||
|
||
clientConfig := config.CreatePMGConfig(&pmgInst)
|
||
if clientConfig.Timeout <= 0 {
|
||
clientConfig.Timeout = 45 * time.Second
|
||
}
|
||
|
||
client, err := pmg.NewClient(clientConfig)
|
||
if err != nil {
|
||
monErr := errors.WrapConnectionError("create_pmg_client", pmgInst.Name, err)
|
||
log.Error().
|
||
Err(monErr).
|
||
Str("instance", pmgInst.Name).
|
||
Str("host", pmgInst.Host).
|
||
Str("user", pmgInst.User).
|
||
Bool("hasPassword", pmgInst.Password != "").
|
||
Bool("hasToken", pmgInst.TokenValue != "").
|
||
Msg("Failed to create PMG client - gateway will show as disconnected")
|
||
m.state.SetConnectionHealth("pmg-"+pmgInst.Name, false)
|
||
continue
|
||
}
|
||
|
||
m.pmgClients[pmgInst.Name] = client
|
||
log.Info().Str("instance", pmgInst.Name).Msg("PMG client created successfully")
|
||
m.state.SetConnectionHealth("pmg-"+pmgInst.Name, true)
|
||
}
|
||
} // End of else block for mock mode check
|
||
|
||
// Initialize state stats
|
||
m.state.Stats = models.Stats{
|
||
StartTime: m.startTime,
|
||
Version: "2.0.0-go",
|
||
}
|
||
|
||
return m, nil
|
||
}
|
||
|
||
// SetExecutor allows tests to override the poll executor; passing nil restores the default executor.
func (m *Monitor) SetExecutor(exec PollExecutor) {
	if m == nil {
		return
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	if exec == nil {
		m.executor = newRealExecutor(m)
		return
	}

	m.executor = exec
}

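// buildInstanceInfoCache seeds the scheduler-keyed instance metadata cache from the
// configured PVE, PBS, and PMG instances. It acquires m.mu internally.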
func (m *Monitor) buildInstanceInfoCache(cfg *config.Config) {
	if m == nil || cfg == nil {
		return
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	if m.instanceInfoCache == nil {
		m.instanceInfoCache = make(map[string]*instanceInfo)
	}

	add := func(instType InstanceType, name string, displayName string, connection string, metadata map[string]string) {
		key := schedulerKey(instType, name)
		m.instanceInfoCache[key] = &instanceInfo{
			Key:         key,
			Type:        instType,
			DisplayName: displayName,
			Connection:  connection,
			Metadata:    metadata,
		}
	}

	// PVE instances
	for _, inst := range cfg.PVEInstances {
		name := strings.TrimSpace(inst.Name)
		if name == "" {
			name = strings.TrimSpace(inst.Host)
		}
		if name == "" {
			name = "pve-instance"
		}
		display := name
		if display == "" {
			display = strings.TrimSpace(inst.Host)
		}
		connection := strings.TrimSpace(inst.Host)
		add(InstanceTypePVE, name, display, connection, nil)
	}

	// PBS instances
	for _, inst := range cfg.PBSInstances {
		name := strings.TrimSpace(inst.Name)
		if name == "" {
			name = strings.TrimSpace(inst.Host)
		}
		if name == "" {
			name = "pbs-instance"
		}
		display := name
		if display == "" {
			display = strings.TrimSpace(inst.Host)
		}
		connection := strings.TrimSpace(inst.Host)
		add(InstanceTypePBS, name, display, connection, nil)
	}

	// PMG instances
	for _, inst := range cfg.PMGInstances {
		name := strings.TrimSpace(inst.Name)
		if name == "" {
			name = strings.TrimSpace(inst.Host)
		}
		if name == "" {
			name = "pmg-instance"
		}
		display := name
		if display == "" {
			display = strings.TrimSpace(inst.Host)
		}
		connection := strings.TrimSpace(inst.Host)
		add(InstanceTypePMG, name, display, connection, nil)
	}
}

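// getExecutor returns the currently configured poll executor under a read lock.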
func (m *Monitor) getExecutor() PollExecutor {
	m.mu.RLock()
	exec := m.executor
	m.mu.RUnlock()
	return exec
}

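// clampInterval bounds value to [min, max]; non-positive values fall back to min.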
func clampInterval(value, min, max time.Duration) time.Duration {
	if value <= 0 {
		return min
	}
	if min > 0 && value < min {
		return min
	}
	if max > 0 && value > max {
		return max
	}
	return value
}

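// effectivePVEPollingInterval returns the configured PVE polling interval clamped
// to between 10 seconds and one hour.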
func (m *Monitor) effectivePVEPollingInterval() time.Duration {
	const minInterval = 10 * time.Second
	const maxInterval = time.Hour

	interval := minInterval
	if m != nil && m.config != nil && m.config.PVEPollingInterval > 0 {
		interval = m.config.PVEPollingInterval
	}
	if interval < minInterval {
		interval = minInterval
	}
	if interval > maxInterval {
		interval = maxInterval
	}
	return interval
}

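// baseIntervalForInstanceType resolves the base polling interval for the given
// instance type, clamping configured values to sensible bounds.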
func (m *Monitor) baseIntervalForInstanceType(instanceType InstanceType) time.Duration {
	if m == nil || m.config == nil {
		return DefaultSchedulerConfig().BaseInterval
	}

	switch instanceType {
	case InstanceTypePVE:
		return m.effectivePVEPollingInterval()
	case InstanceTypePBS:
		return clampInterval(m.config.PBSPollingInterval, 10*time.Second, time.Hour)
	case InstanceTypePMG:
		return clampInterval(m.config.PMGPollingInterval, 10*time.Second, time.Hour)
	default:
		base := m.config.AdaptivePollingBaseInterval
		if base <= 0 {
			base = DefaultSchedulerConfig().BaseInterval
		}
		return clampInterval(base, time.Second, 0)
	}
}

// getConfiguredHostIPs returns a list of IP addresses from all configured Proxmox hosts.
// This is used to prevent discovery from probing hosts we already know about.
// Caller must hold m.mu.RLock or m.mu.Lock.
func (m *Monitor) getConfiguredHostIPs() []string {
	if m.config == nil {
		return nil
	}

	seen := make(map[string]struct{})
	var ips []string

	addHost := func(host string) {
		// Parse the host to extract IP/hostname
		host = strings.TrimSpace(host)
		if host == "" {
			return
		}
		// Remove scheme if present
		if strings.HasPrefix(host, "https://") {
			host = strings.TrimPrefix(host, "https://")
		} else if strings.HasPrefix(host, "http://") {
			host = strings.TrimPrefix(host, "http://")
		}
		// Remove port if present
		if colonIdx := strings.LastIndex(host, ":"); colonIdx != -1 {
			// Check if it's an IPv6 address
			if !strings.Contains(host[colonIdx:], "]") {
				host = host[:colonIdx]
			}
		}
		// Remove trailing path
		if slashIdx := strings.Index(host, "/"); slashIdx != -1 {
			host = host[:slashIdx]
		}
		host = strings.TrimSpace(host)
		if host == "" {
			return
		}
		// Check if it's already an IP
		if ip := net.ParseIP(host); ip != nil {
			if _, exists := seen[host]; !exists {
				seen[host] = struct{}{}
				ips = append(ips, host)
			}
			return
		}
		// Try to resolve hostname to IP
		if addrs, err := net.LookupIP(host); err == nil && len(addrs) > 0 {
			for _, addr := range addrs {
				// Prefer IPv4
				if v4 := addr.To4(); v4 != nil {
					ipStr := v4.String()
					if _, exists := seen[ipStr]; !exists {
						seen[ipStr] = struct{}{}
						ips = append(ips, ipStr)
					}
					break
				}
			}
		}
	}

	// Add PVE hosts
	for _, pve := range m.config.PVEInstances {
		addHost(pve.Host)
		// Also add cluster endpoints (include both auto-discovered IP and override if set)
		for _, ep := range pve.ClusterEndpoints {
			addHost(ep.Host)
			addHost(ep.IP)
			if ep.IPOverride != "" && ep.IPOverride != ep.IP {
				addHost(ep.IPOverride)
			}
		}
	}

	// Add PBS hosts
	for _, pbs := range m.config.PBSInstances {
		addHost(pbs.Host)
	}

	// Add PMG hosts
	for _, pmg := range m.config.PMGInstances {
		addHost(pmg.Host)
	}

	return ips
}

// consolidateDuplicateClusters detects and merges duplicate cluster instances.
// When multiple PVE instances belong to the same Proxmox cluster (determined by ClusterName),
// they should be merged into a single instance with all endpoints combined.
// This prevents duplicate VMs/containers in the UI.
func (m *Monitor) consolidateDuplicateClusters() {
	if m == nil || m.config == nil || len(m.config.PVEInstances) < 2 {
		return
	}

	// Group instances by cluster name
	clusterGroups := make(map[string][]int) // clusterName -> indices of instances
	for i, instance := range m.config.PVEInstances {
		if instance.IsCluster && instance.ClusterName != "" {
			clusterGroups[instance.ClusterName] = append(clusterGroups[instance.ClusterName], i)
		}
	}

	// Find clusters that have duplicates
	var mergedAny bool
	for clusterName, indices := range clusterGroups {
		if len(indices) < 2 {
			continue // No duplicates for this cluster
		}

		log.Warn().
			Str("cluster", clusterName).
			Int("duplicates", len(indices)).
			Msg("Detected duplicate cluster instances - consolidating")

		// Keep the first instance and merge all others into it
		primaryIdx := indices[0]
		primary := &m.config.PVEInstances[primaryIdx]

		// Build a set of existing endpoint node names
		existingEndpoints := make(map[string]bool)
		for _, ep := range primary.ClusterEndpoints {
			existingEndpoints[ep.NodeName] = true
		}

		// Merge endpoints from all duplicate instances
		for _, dupIdx := range indices[1:] {
			duplicate := m.config.PVEInstances[dupIdx]
			log.Info().
				Str("cluster", clusterName).
				Str("primary", primary.Name).
				Str("duplicate", duplicate.Name).
				Msg("Merging duplicate cluster instance")

			for _, ep := range duplicate.ClusterEndpoints {
				if !existingEndpoints[ep.NodeName] {
					primary.ClusterEndpoints = append(primary.ClusterEndpoints, ep)
					existingEndpoints[ep.NodeName] = true
					log.Info().
						Str("cluster", clusterName).
						Str("endpoint", ep.NodeName).
						Msg("Added endpoint from duplicate instance")
				}
			}
		}

		mergedAny = true
	}

	if !mergedAny {
		return
	}

	// Remove duplicate instances (keeping only the primary for each cluster)
	var newInstances []config.PVEInstance
	seenClusters := make(map[string]bool)

	for _, instance := range m.config.PVEInstances {
		if instance.IsCluster && instance.ClusterName != "" {
			if seenClusters[instance.ClusterName] {
				log.Info().
					Str("cluster", instance.ClusterName).
					Str("instance", instance.Name).
					Msg("Removing duplicate cluster instance")
				continue // Skip duplicates
			}
			seenClusters[instance.ClusterName] = true
		}
		newInstances = append(newInstances, instance)
	}

	m.config.PVEInstances = newInstances

	// Persist the consolidated configuration
	if m.persistence != nil {
		if err := m.persistence.SaveNodesConfig(m.config.PVEInstances, m.config.PBSInstances, m.config.PMGInstances); err != nil {
			log.Error().Err(err).Msg("Failed to persist cluster consolidation")
		} else {
			log.Info().Msg("Persisted consolidated cluster configuration")
		}
	}
}

// Start begins the monitoring loop
|
||
func (m *Monitor) Start(ctx context.Context, wsHub *websocket.Hub) {
|
||
// Consolidate any duplicate cluster instances before starting
|
||
// This fixes the case where multiple agents registered from the same cluster
|
||
m.consolidateDuplicateClusters()
|
||
|
||
pollingInterval := m.effectivePVEPollingInterval()
|
||
log.Info().
|
||
Dur("pollingInterval", pollingInterval).
|
||
Msg("Starting monitoring loop")
|
||
|
||
m.mu.Lock()
|
||
m.runtimeCtx = ctx
|
||
m.wsHub = wsHub
|
||
m.mu.Unlock()
|
||
defer m.stopMockMetricsSampler()
|
||
|
||
if mock.IsMockEnabled() {
|
||
m.startMockMetricsSampler(ctx)
|
||
}
|
||
|
||
// Initialize and start discovery service if enabled
|
||
if mock.IsMockEnabled() {
|
||
log.Info().Msg("Mock mode enabled - skipping discovery service")
|
||
m.discoveryService = nil
|
||
} else if m.config.DiscoveryEnabled {
|
||
discoverySubnet := m.config.DiscoverySubnet
|
||
if discoverySubnet == "" {
|
||
discoverySubnet = "auto"
|
||
}
|
||
cfgProvider := func() config.DiscoveryConfig {
|
||
m.mu.RLock()
|
||
defer m.mu.RUnlock()
|
||
if m.config == nil {
|
||
return config.DefaultDiscoveryConfig()
|
||
}
|
||
cfg := config.CloneDiscoveryConfig(m.config.Discovery)
|
||
// Auto-populate IPBlocklist with configured Proxmox host IPs to avoid
|
||
// probing hosts we already know about (reduces PBS auth failure log spam)
|
||
cfg.IPBlocklist = m.getConfiguredHostIPs()
|
||
return cfg
|
||
}
|
||
m.discoveryService = discovery.NewService(wsHub, 5*time.Minute, discoverySubnet, cfgProvider)
|
||
if m.discoveryService != nil {
|
||
m.discoveryService.Start(ctx)
|
||
log.Info().Msg("Discovery service initialized and started")
|
||
} else {
|
||
log.Error().Msg("Failed to initialize discovery service")
|
||
}
|
||
} else {
|
||
log.Info().Msg("Discovery service disabled by configuration")
|
||
m.discoveryService = nil
|
||
}
|
||
|
||
// Set up alert callbacks
|
||
m.alertManager.SetAlertCallback(func(alert *alerts.Alert) {
|
||
m.handleAlertFired(alert)
|
||
})
|
||
// Set up AI analysis callback - this bypasses activation state and other notification suppression
|
||
// so AI can analyze alerts even during pending_review setup phase
|
||
m.alertManager.SetAlertForAICallback(func(alert *alerts.Alert) {
|
||
log.Debug().Str("alertID", alert.ID).Msg("AI alert callback invoked (bypassing notification suppression)")
|
||
if m.alertTriggeredAICallback != nil {
|
||
m.alertTriggeredAICallback(alert)
|
||
}
|
||
})
|
||
m.alertManager.SetResolvedCallback(func(alertID string) {
|
||
m.handleAlertResolved(alertID)
|
||
// Don't broadcast full state here - it causes a cascade with many guests.
|
||
// The frontend will get the updated alerts through the regular broadcast ticker.
|
||
})
|
||
m.alertManager.SetAcknowledgedCallback(func(alert *alerts.Alert, user string) {
|
||
m.handleAlertAcknowledged(alert, user)
|
||
})
|
||
m.alertManager.SetUnacknowledgedCallback(func(alert *alerts.Alert, user string) {
|
||
m.handleAlertUnacknowledged(alert, user)
|
||
})
|
||
m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {
|
||
log.Info().
|
||
Str("alertID", alert.ID).
|
||
Int("level", level).
|
||
Msg("Alert escalated - sending notifications")
|
||
|
||
// Get escalation config
|
||
config := m.alertManager.GetConfig()
|
||
if level <= 0 || level > len(config.Schedule.Escalation.Levels) {
|
||
return
|
||
}
|
||
|
||
escalationLevel := config.Schedule.Escalation.Levels[level-1]
|
||
|
||
// Send notifications based on escalation level
|
||
switch escalationLevel.Notify {
|
||
case "email":
|
||
// Only send email
|
||
if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled {
|
||
m.notificationMgr.SendAlert(alert)
|
||
}
|
||
case "webhook":
|
||
// Only send webhooks
|
||
for _, webhook := range m.notificationMgr.GetWebhooks() {
|
||
if webhook.Enabled {
|
||
m.notificationMgr.SendAlert(alert)
|
||
break
|
||
}
|
||
}
|
||
case "all":
|
||
// Send all notifications
|
||
m.notificationMgr.SendAlert(alert)
|
||
}
|
||
|
||
// Update WebSocket with escalation
|
||
wsHub.BroadcastAlert(alert)
|
||
})
|
||
|
||
// Create separate tickers for polling and broadcasting using the configured cadence
|
||
|
||
workerCount := len(m.pveClients) + len(m.pbsClients) + len(m.pmgClients)
|
||
m.startTaskWorkers(ctx, workerCount)
|
||
|
||
pollTicker := time.NewTicker(pollingInterval)
|
||
defer pollTicker.Stop()
|
||
|
||
broadcastTicker := time.NewTicker(pollingInterval)
|
||
defer broadcastTicker.Stop()
|
||
|
||
// Start connection retry mechanism for failed clients
|
||
// This handles cases where network/Proxmox isn't ready on initial startup
|
||
if !mock.IsMockEnabled() {
|
||
go m.retryFailedConnections(ctx)
|
||
}
|
||
|
||
// Do an immediate poll on start (only if not in mock mode)
|
||
if mock.IsMockEnabled() {
|
||
log.Info().Msg("Mock mode enabled - skipping real node polling")
|
||
go m.checkMockAlerts()
|
||
} else {
|
||
go m.poll(ctx, wsHub)
|
||
}
|
||
|
||
for {
|
||
select {
|
||
case <-pollTicker.C:
|
||
now := time.Now()
|
||
m.evaluateDockerAgents(now)
|
||
m.evaluateKubernetesAgents(now)
|
||
m.evaluateHostAgents(now)
|
||
m.cleanupRemovedDockerHosts(now)
|
||
m.cleanupRemovedKubernetesClusters(now)
|
||
m.cleanupGuestMetadataCache(now)
|
||
m.cleanupDiagnosticSnapshots(now)
|
||
m.cleanupRRDCache(now)
|
||
m.cleanupTrackingMaps(now)
|
||
m.cleanupMetricsHistory()
|
||
m.cleanupRateTracker(now)
|
||
if mock.IsMockEnabled() {
|
||
// In mock mode, keep synthetic alerts fresh
|
||
go m.checkMockAlerts()
|
||
} else {
|
||
// Poll real infrastructure
|
||
go m.poll(ctx, wsHub)
|
||
}
|
||
|
||
case <-broadcastTicker.C:
|
||
// Broadcast current state regardless of polling status
|
||
// Use GetState() instead of m.state.GetSnapshot() to respect mock mode
|
||
state := m.GetState()
|
||
log.Info().
|
||
Int("nodes", len(state.Nodes)).
|
||
Int("vms", len(state.VMs)).
|
||
Int("containers", len(state.Containers)).
|
||
Int("hosts", len(state.Hosts)).
|
||
Int("pbs", len(state.PBSInstances)).
|
||
Int("pbsBackups", len(state.Backups.PBS)).
|
||
Int("physicalDisks", len(state.PhysicalDisks)).
|
||
Msg("Broadcasting state update (ticker)")
|
||
// Convert to frontend format before broadcasting (converts time.Time to int64, etc.)
|
||
frontendState := state.ToFrontend()
|
||
// Update and inject unified resources if resource store is available
|
||
m.updateResourceStore(state)
|
||
frontendState.Resources = m.getResourcesForBroadcast()
|
||
// Use tenant-aware broadcast method
|
||
m.broadcastState(wsHub, frontendState)
|
||
|
||
case <-ctx.Done():
|
||
log.Info().Msg("Monitoring loop stopped")
|
||
return
|
||
}
|
||
}
|
||
}
|
||
|
||
var connRetryDelays = []time.Duration{
|
||
5 * time.Second,
|
||
10 * time.Second,
|
||
20 * time.Second,
|
||
40 * time.Second,
|
||
60 * time.Second,
|
||
}
|
||
|
||
// retryFailedConnections attempts to recreate clients that failed during initialization
|
||
// This handles cases where Proxmox/network isn't ready when Pulse starts
|
||
func (m *Monitor) retryFailedConnections(ctx context.Context) {
|
||
defer recoverFromPanic("retryFailedConnections")
|
||
|
||
// Retry schedule: 5s, 10s, 20s, 40s, 60s, then every 60s for up to 5 minutes total
|
||
retryDelays := connRetryDelays
|
||
|
||
maxRetryDuration := 5 * time.Minute
|
||
startTime := time.Now()
|
||
retryIndex := 0
|
||
|
||
for {
|
||
// Stop retrying after max duration or if context is cancelled
|
||
select {
|
||
case <-ctx.Done():
|
||
return
|
||
default:
|
||
}
|
||
|
||
if time.Since(startTime) > maxRetryDuration {
|
||
log.Info().Msg("Connection retry period expired")
|
||
return
|
||
}
|
||
|
||
// Calculate next retry delay
|
||
var delay time.Duration
|
||
if retryIndex < len(retryDelays) {
|
||
delay = retryDelays[retryIndex]
|
||
retryIndex++
|
||
} else {
|
||
delay = 60 * time.Second // Continue retrying every 60s
|
||
}
|
||
|
||
// Wait before retry
|
||
select {
|
||
case <-time.After(delay):
|
||
case <-ctx.Done():
|
||
return
|
||
}
|
||
|
||
// Check for missing clients and try to recreate them
|
||
m.mu.Lock()
|
||
missingPVE := []config.PVEInstance{}
|
||
missingPBS := []config.PBSInstance{}
|
||
|
||
// Find PVE instances without clients
|
||
for _, pve := range m.config.PVEInstances {
|
||
if _, exists := m.pveClients[pve.Name]; !exists {
|
||
missingPVE = append(missingPVE, pve)
|
||
}
|
||
}
|
||
|
||
// Find PBS instances without clients
|
||
for _, pbs := range m.config.PBSInstances {
|
||
if _, exists := m.pbsClients[pbs.Name]; !exists {
|
||
missingPBS = append(missingPBS, pbs)
|
||
}
|
||
}
|
||
m.mu.Unlock()
|
||
|
||
// If no missing clients, we're done
|
||
if len(missingPVE) == 0 && len(missingPBS) == 0 {
|
||
log.Info().Msg("All client connections established successfully")
|
||
return
|
||
}
|
||
|
||
log.Info().
|
||
Int("missingPVE", len(missingPVE)).
|
||
Int("missingPBS", len(missingPBS)).
|
||
Dur("nextRetry", delay).
|
||
Msg("Attempting to reconnect failed clients")
|
||
|
||
// Try to recreate PVE clients
|
||
for _, pve := range missingPVE {
|
||
if pve.IsCluster && len(pve.ClusterEndpoints) > 0 {
|
||
// Create cluster client
|
||
hasValidEndpoints := false
|
||
endpoints := make([]string, 0, len(pve.ClusterEndpoints))
|
||
endpointFingerprints := make(map[string]string)
|
||
|
||
for _, ep := range pve.ClusterEndpoints {
|
||
// Use EffectiveIP() which prefers IPOverride over auto-discovered IP
|
||
host := ep.EffectiveIP()
|
||
if host == "" {
|
||
host = ep.Host
|
||
}
|
||
if host == "" {
|
||
continue
|
||
}
|
||
if strings.Contains(host, ".") || net.ParseIP(host) != nil {
|
||
hasValidEndpoints = true
|
||
}
|
||
if !strings.HasPrefix(host, "http") {
|
||
host = fmt.Sprintf("https://%s:8006", host)
|
||
}
|
||
endpoints = append(endpoints, host)
|
||
// Store per-endpoint fingerprint for TOFU
|
||
if ep.Fingerprint != "" {
|
||
endpointFingerprints[host] = ep.Fingerprint
|
||
}
|
||
}
|
||
|
||
if !hasValidEndpoints || len(endpoints) == 0 {
|
||
endpoints = []string{pve.Host}
|
||
if !strings.HasPrefix(endpoints[0], "http") {
|
||
endpoints[0] = fmt.Sprintf("https://%s:8006", endpoints[0])
|
||
}
|
||
}
|
||
|
||
clientConfig := config.CreateProxmoxConfig(&pve)
|
||
clientConfig.Timeout = m.config.ConnectionTimeout
|
||
clusterClient := proxmox.NewClusterClient(pve.Name, clientConfig, endpoints, endpointFingerprints)
|
||
|
||
m.mu.Lock()
|
||
m.pveClients[pve.Name] = clusterClient
|
||
m.state.SetConnectionHealth(pve.Name, true)
|
||
m.mu.Unlock()
|
||
|
||
log.Info().
|
||
Str("instance", pve.Name).
|
||
Str("cluster", pve.ClusterName).
|
||
Msg("Successfully reconnected cluster client")
|
||
} else {
|
||
// Create regular client
|
||
clientConfig := config.CreateProxmoxConfig(&pve)
|
||
clientConfig.Timeout = m.config.ConnectionTimeout
|
||
client, err := newProxmoxClientFunc(clientConfig)
|
||
if err != nil {
|
||
log.Warn().
|
||
Err(err).
|
||
Str("instance", pve.Name).
|
||
Msg("Failed to reconnect PVE client, will retry")
|
||
continue
|
||
}
|
||
|
||
m.mu.Lock()
|
||
m.pveClients[pve.Name] = client
|
||
m.state.SetConnectionHealth(pve.Name, true)
|
||
m.mu.Unlock()
|
||
|
||
log.Info().
|
||
Str("instance", pve.Name).
|
||
Msg("Successfully reconnected PVE client")
|
||
}
|
||
}
|
||
|
||
// Try to recreate PBS clients
|
||
for _, pbsInst := range missingPBS {
|
||
clientConfig := config.CreatePBSConfig(&pbsInst)
|
||
clientConfig.Timeout = 60 * time.Second
|
||
client, err := pbs.NewClient(clientConfig)
|
||
if err != nil {
|
||
log.Warn().
|
||
Err(err).
|
||
Str("instance", pbsInst.Name).
|
||
Msg("Failed to reconnect PBS client, will retry")
|
||
continue
|
||
}
|
||
|
||
m.mu.Lock()
|
||
m.pbsClients[pbsInst.Name] = client
|
||
m.state.SetConnectionHealth("pbs-"+pbsInst.Name, true)
|
||
m.mu.Unlock()
|
||
|
||
log.Info().
|
||
Str("instance", pbsInst.Name).
|
||
Msg("Successfully reconnected PBS client")
|
||
}
|
||
}
|
||
}
|
||
|
||
// poll fetches data from all configured instances
func (m *Monitor) poll(_ context.Context, wsHub *websocket.Hub) {
	defer recoverFromPanic("poll")

	// Limit concurrent polls to 2 to prevent resource exhaustion
	currentCount := atomic.AddInt32(&m.activePollCount, 1)
	if currentCount > 2 {
		atomic.AddInt32(&m.activePollCount, -1)
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().Int32("activePolls", currentCount-1).Msg("Too many concurrent polls, skipping")
		}
		return
	}
	defer atomic.AddInt32(&m.activePollCount, -1)

	if logging.IsLevelEnabled(zerolog.DebugLevel) {
		log.Debug().Msg("Starting polling cycle")
	}
	startTime := time.Now()
	now := startTime

	plannedTasks := m.buildScheduledTasks(now)
	for _, task := range plannedTasks {
		m.taskQueue.Upsert(task)
	}
	m.updateQueueDepthMetric()

	// Update performance metrics
	m.state.Performance.LastPollDuration = time.Since(startTime).Seconds()
	m.state.Stats.PollingCycles++
	m.state.Stats.Uptime = int64(time.Since(m.startTime).Seconds())
	m.state.Stats.WebSocketClients = wsHub.GetClientCount()

	// Sync alert state so broadcasts include the latest acknowledgement data
	m.syncAlertsToState()

	// Increment poll counter
	m.mu.Lock()
	m.pollCounter++
	m.mu.Unlock()

	if logging.IsLevelEnabled(zerolog.DebugLevel) {
		log.Debug().Dur("duration", time.Since(startTime)).Msg("Polling cycle completed")
	}

	// Broadcasting is now handled by the timer in Start()
}

// syncAlertsToState copies the latest alert manager data into the shared state snapshot.
// This keeps WebSocket broadcasts aligned with in-memory acknowledgement updates.
func (m *Monitor) syncAlertsToState() {
	if m.pruneStaleDockerAlerts() {
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().Msg("Pruned stale docker alerts during sync")
		}
	}

	activeAlerts := m.alertManager.GetActiveAlerts()
	modelAlerts := make([]models.Alert, 0, len(activeAlerts))
	for _, alert := range activeAlerts {
		modelAlerts = append(modelAlerts, models.Alert{
			ID:              alert.ID,
			Type:            alert.Type,
			Level:           string(alert.Level),
			ResourceID:      alert.ResourceID,
			ResourceName:    alert.ResourceName,
			Node:            alert.Node,
			NodeDisplayName: alert.NodeDisplayName,
			Instance:        alert.Instance,
			Message:         alert.Message,
			Value:           alert.Value,
			Threshold:       alert.Threshold,
			StartTime:       alert.StartTime,
			Acknowledged:    alert.Acknowledged,
			AckTime:         alert.AckTime,
			AckUser:         alert.AckUser,
		})
		if alert.Acknowledged && logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().Str("alertID", alert.ID).Interface("ackTime", alert.AckTime).Msg("Syncing acknowledged alert")
		}
	}
	m.state.UpdateActiveAlerts(modelAlerts)

	recentlyResolved := m.alertManager.GetRecentlyResolved()
	if len(recentlyResolved) > 0 {
		log.Info().Int("count", len(recentlyResolved)).Msg("Syncing recently resolved alerts")
	}
	m.state.UpdateRecentlyResolved(recentlyResolved)
}

// SyncAlertState is the exported wrapper used by APIs that mutate alerts outside the poll loop.
func (m *Monitor) SyncAlertState() {
	m.syncAlertsToState()
}

// pruneStaleDockerAlerts removes docker alerts that reference hosts no longer present in state.
func (m *Monitor) pruneStaleDockerAlerts() bool {
	if m.alertManager == nil {
		return false
	}

	hosts := m.state.GetDockerHosts()
	knownHosts := make(map[string]struct{}, len(hosts))
	for _, host := range hosts {
		id := strings.TrimSpace(host.ID)
		if id != "" {
			knownHosts[id] = struct{}{}
		}
	}

	if len(knownHosts) == 0 {
		// Still allow stale entries to be cleared if no hosts remain.
	}

	active := m.alertManager.GetActiveAlerts()
	processed := make(map[string]struct{})
	cleared := false

	for _, alert := range active {
		var hostID string

		switch {
		case alert.Type == "docker-host-offline":
			hostID = strings.TrimPrefix(alert.ID, "docker-host-offline-")
		case strings.HasPrefix(alert.ResourceID, "docker:"):
			resource := strings.TrimPrefix(alert.ResourceID, "docker:")
			if idx := strings.Index(resource, "/"); idx >= 0 {
				hostID = resource[:idx]
			} else {
				hostID = resource
			}
		default:
			continue
		}

		hostID = strings.TrimSpace(hostID)
		if hostID == "" {
			continue
		}

		if _, known := knownHosts[hostID]; known {
			continue
		}
		if _, alreadyCleared := processed[hostID]; alreadyCleared {
			continue
		}

		host := models.DockerHost{
			ID:          hostID,
			DisplayName: alert.ResourceName,
			Hostname:    alert.Node,
		}
		if host.DisplayName == "" {
			host.DisplayName = hostID
		}
		if host.Hostname == "" {
			host.Hostname = hostID
		}

		m.alertManager.HandleDockerHostRemoved(host)
		processed[hostID] = struct{}{}
		cleared = true
	}

	return cleared
}

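// startTaskWorkers launches between 1 and 10 workers that drain the shared task queue.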
func (m *Monitor) startTaskWorkers(ctx context.Context, workers int) {
	if m.taskQueue == nil {
		return
	}
	if workers < 1 {
		workers = 1
	}
	if workers > 10 {
		workers = 10
	}
	for i := 0; i < workers; i++ {
		go m.taskWorker(ctx, i)
	}
}

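// taskWorker pulls scheduled tasks from the queue, executes them, and reschedules
// them until the context is cancelled.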
func (m *Monitor) taskWorker(ctx context.Context, id int) {
	defer recoverFromPanic(fmt.Sprintf("taskWorker-%d", id))

	if logging.IsLevelEnabled(zerolog.DebugLevel) {
		log.Debug().Int("worker", id).Msg("Task worker started")
	}
	for {
		task, ok := m.taskQueue.WaitNext(ctx)
		if !ok {
			if logging.IsLevelEnabled(zerolog.DebugLevel) {
				log.Debug().Int("worker", id).Msg("Task worker stopping")
			}
			return
		}

		m.executeScheduledTask(ctx, task)

		m.rescheduleTask(task)
		m.updateQueueDepthMetric()
	}
}

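// derivePollTimeout computes the per-task polling timeout from the connection
// timeout, clamped between minTaskTimeout and the configured (or default) maximum.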
func derivePollTimeout(cfg *config.Config) time.Duration {
	timeout := defaultTaskTimeout
	if cfg != nil && cfg.ConnectionTimeout > 0 {
		timeout = cfg.ConnectionTimeout * 2
	}
	if timeout < minTaskTimeout {
		timeout = minTaskTimeout
	}
	// Use configurable max timeout from config (set via MAX_POLL_TIMEOUT env var)
	// Falls back to hardcoded maxTaskTimeout if config is nil or MaxPollTimeout not set
	maxTimeout := maxTaskTimeout
	if cfg != nil && cfg.MaxPollTimeout > 0 {
		maxTimeout = cfg.MaxPollTimeout
	}
	if timeout > maxTimeout {
		timeout = maxTimeout
	}
	return timeout
}

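// taskExecutionTimeout returns the timeout applied to a single scheduled poll task.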
func (m *Monitor) taskExecutionTimeout(_ InstanceType) time.Duration {
	if m == nil {
		return defaultTaskTimeout
	}
	timeout := m.pollTimeout
	if timeout <= 0 {
		timeout = defaultTaskTimeout
	}
	return timeout
}

func (m *Monitor) executeScheduledTask(ctx context.Context, task ScheduledTask) {
|
||
if !m.allowExecution(task) {
|
||
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
||
log.Debug().
|
||
Str("instance", task.InstanceName).
|
||
Str("type", string(task.InstanceType)).
|
||
Msg("Task blocked by circuit breaker")
|
||
}
|
||
return
|
||
}
|
||
|
||
if m.pollMetrics != nil {
|
||
wait := time.Duration(0)
|
||
if !task.NextRun.IsZero() {
|
||
wait = time.Since(task.NextRun)
|
||
if wait < 0 {
|
||
wait = 0
|
||
}
|
||
}
|
||
instanceType := string(task.InstanceType)
|
||
if strings.TrimSpace(instanceType) == "" {
|
||
instanceType = "unknown"
|
||
}
|
||
m.pollMetrics.RecordQueueWait(instanceType, wait)
|
||
}
|
||
|
||
executor := m.getExecutor()
|
||
if executor == nil {
|
||
log.Error().
|
||
Str("instance", task.InstanceName).
|
||
Str("type", string(task.InstanceType)).
|
||
Msg("No poll executor configured; skipping task")
|
||
return
|
||
}
|
||
|
||
pollTask := PollTask{
|
||
InstanceName: task.InstanceName,
|
||
InstanceType: string(task.InstanceType),
|
||
}
|
||
|
||
switch task.InstanceType {
|
||
case InstanceTypePVE:
|
||
client, ok := m.pveClients[task.InstanceName]
|
||
if !ok || client == nil {
|
||
log.Warn().Str("instance", task.InstanceName).Msg("PVE client missing for scheduled task")
|
||
return
|
||
}
|
||
pollTask.PVEClient = client
|
||
case InstanceTypePBS:
|
||
client, ok := m.pbsClients[task.InstanceName]
|
||
if !ok || client == nil {
|
||
log.Warn().Str("instance", task.InstanceName).Msg("PBS client missing for scheduled task")
|
||
return
|
||
}
|
||
pollTask.PBSClient = client
|
||
case InstanceTypePMG:
|
||
client, ok := m.pmgClients[task.InstanceName]
|
||
if !ok || client == nil {
|
||
log.Warn().Str("instance", task.InstanceName).Msg("PMG client missing for scheduled task")
|
||
return
|
||
}
|
||
pollTask.PMGClient = client
|
||
default:
|
||
log.Debug().
|
||
Str("instance", task.InstanceName).
|
||
Str("type", string(task.InstanceType)).
|
||
Msg("Skipping unsupported task type")
|
||
return
|
||
}
|
||
|
||
taskCtx := ctx
|
||
var cancel context.CancelFunc
|
||
timeout := m.taskExecutionTimeout(task.InstanceType)
|
||
if timeout > 0 {
|
||
taskCtx, cancel = context.WithTimeout(ctx, timeout)
|
||
defer cancel()
|
||
}
|
||
|
||
executor.Execute(taskCtx, pollTask)
|
||
|
||
if timeout > 0 && stderrors.Is(taskCtx.Err(), context.DeadlineExceeded) {
|
||
log.Warn().
|
||
Str("instance", task.InstanceName).
|
||
Str("type", string(task.InstanceType)).
|
||
Dur("timeout", timeout).
|
||
Msg("Polling task timed out; rescheduling with fresh worker")
|
||
}
|
||
}
|
||
|
||
func (m *Monitor) rescheduleTask(task ScheduledTask) {
|
||
if m.taskQueue == nil {
|
||
return
|
||
}
|
||
|
||
key := schedulerKey(task.InstanceType, task.InstanceName)
|
||
m.mu.Lock()
|
||
outcome, hasOutcome := m.lastOutcome[key]
|
||
failureCount := m.failureCounts[key]
|
||
m.mu.Unlock()
|
||
|
||
if hasOutcome && !outcome.success {
|
||
if !outcome.transient || failureCount >= m.maxRetryAttempts {
|
||
m.sendToDeadLetter(task, outcome.err)
|
||
return
|
||
}
|
||
delay := m.backoffCfg.nextDelay(failureCount-1, m.randomFloat())
|
||
if delay <= 0 {
|
||
delay = 5 * time.Second
|
||
}
|
||
if m.config != nil && m.config.AdaptivePollingEnabled && m.config.AdaptivePollingMaxInterval > 0 && m.config.AdaptivePollingMaxInterval <= 15*time.Second {
|
||
maxDelay := 4 * time.Second
|
||
if delay > maxDelay {
|
||
delay = maxDelay
|
||
}
|
||
}
|
||
next := task
|
||
next.Interval = delay
|
||
next.NextRun = time.Now().Add(delay)
|
||
m.taskQueue.Upsert(next)
|
||
return
|
||
}
|
||
|
||
if m.scheduler == nil {
|
||
baseInterval := m.baseIntervalForInstanceType(task.InstanceType)
|
||
nextInterval := task.Interval
|
||
if nextInterval <= 0 {
|
||
nextInterval = baseInterval
|
||
}
|
||
if nextInterval <= 0 {
|
||
nextInterval = DefaultSchedulerConfig().BaseInterval
|
||
}
|
||
next := task
|
||
next.NextRun = time.Now().Add(nextInterval)
|
||
next.Interval = nextInterval
|
||
m.taskQueue.Upsert(next)
|
||
return
|
||
}
|
||
|
||
desc := InstanceDescriptor{
|
||
Name: task.InstanceName,
|
||
Type: task.InstanceType,
|
||
LastInterval: task.Interval,
|
||
LastScheduled: task.NextRun,
|
||
}
|
||
if m.stalenessTracker != nil {
|
||
if snap, ok := m.stalenessTracker.snapshot(task.InstanceType, task.InstanceName); ok {
|
||
desc.LastSuccess = snap.LastSuccess
|
||
desc.LastFailure = snap.LastError
|
||
if snap.ChangeHash != "" {
|
||
desc.Metadata = map[string]any{"changeHash": snap.ChangeHash}
|
||
}
|
||
}
|
||
}
|
||
|
||
tasks := m.scheduler.BuildPlan(time.Now(), []InstanceDescriptor{desc}, m.taskQueue.Size())
|
||
if len(tasks) == 0 {
|
||
next := task
|
||
nextInterval := task.Interval
|
||
if nextInterval <= 0 && m.config != nil {
|
||
nextInterval = m.config.AdaptivePollingBaseInterval
|
||
}
|
||
if nextInterval <= 0 {
|
||
nextInterval = DefaultSchedulerConfig().BaseInterval
|
||
}
|
||
next.Interval = nextInterval
|
||
next.NextRun = time.Now().Add(nextInterval)
|
||
m.taskQueue.Upsert(next)
|
||
return
|
||
}
|
||
for _, next := range tasks {
|
||
m.taskQueue.Upsert(next)
|
||
}
|
||
}
|
||
|
||
func (m *Monitor) sendToDeadLetter(task ScheduledTask, err error) {
|
||
if m.deadLetterQueue == nil {
|
||
log.Error().
|
||
Str("instance", task.InstanceName).
|
||
Str("type", string(task.InstanceType)).
|
||
Err(err).
|
||
Msg("Dead-letter queue unavailable; dropping task")
|
||
return
|
||
}
|
||
|
||
log.Error().
|
||
Str("instance", task.InstanceName).
|
||
Str("type", string(task.InstanceType)).
|
||
Err(err).
|
||
Msg("Routing task to dead-letter queue after repeated failures")
|
||
|
||
next := task
|
||
next.Interval = 30 * time.Minute
|
||
next.NextRun = time.Now().Add(next.Interval)
|
||
m.deadLetterQueue.Upsert(next)
|
||
m.updateDeadLetterMetrics()
|
||
|
||
key := schedulerKey(task.InstanceType, task.InstanceName)
|
||
now := time.Now()
|
||
|
||
m.mu.Lock()
|
||
if m.dlqInsightMap == nil {
|
||
m.dlqInsightMap = make(map[string]*dlqInsight)
|
||
}
|
||
info, ok := m.dlqInsightMap[key]
|
||
if !ok {
|
||
info = &dlqInsight{}
|
||
m.dlqInsightMap[key] = info
|
||
}
|
||
if info.FirstAttempt.IsZero() {
|
||
info.FirstAttempt = now
|
||
}
|
||
info.LastAttempt = now
|
||
info.RetryCount++
|
||
info.NextRetry = next.NextRun
|
||
if err != nil {
|
||
info.Reason = classifyDLQReason(err)
|
||
}
|
||
m.mu.Unlock()
|
||
}
|
||
|
||
func classifyDLQReason(err error) string {
|
||
if err == nil {
|
||
return ""
|
||
}
|
||
if errors.IsRetryableError(err) {
|
||
return "max_retry_attempts"
|
||
}
|
||
return "permanent_failure"
|
||
}
|
||
|
||
func (m *Monitor) updateDeadLetterMetrics() {
|
||
if m.pollMetrics == nil || m.deadLetterQueue == nil {
|
||
return
|
||
}
|
||
|
||
size := m.deadLetterQueue.Size()
|
||
if size <= 0 {
|
||
m.pollMetrics.UpdateDeadLetterCounts(nil)
|
||
return
|
||
}
|
||
|
||
tasks := m.deadLetterQueue.PeekAll(size)
|
||
m.pollMetrics.UpdateDeadLetterCounts(tasks)
|
||
}
|
||
|
||
func (m *Monitor) updateBreakerMetric(instanceType InstanceType, instance string, breaker *circuitBreaker) {
|
||
if m.pollMetrics == nil || breaker == nil {
|
||
return
|
||
}
|
||
|
||
state, failures, retryAt, _, _ := breaker.stateDetails()
|
||
m.pollMetrics.SetBreakerState(string(instanceType), instance, state, failures, retryAt)
|
||
}
|
||
|
||
func (m *Monitor) randomFloat() float64 {
|
||
if m.rng == nil {
|
||
m.rng = rand.New(rand.NewSource(time.Now().UnixNano()))
|
||
}
|
||
return m.rng.Float64()
|
||
}
|
||
|
||
func (m *Monitor) updateQueueDepthMetric() {
|
||
if m.pollMetrics == nil || m.taskQueue == nil {
|
||
return
|
||
}
|
||
snapshot := m.taskQueue.Snapshot()
|
||
m.pollMetrics.SetQueueDepth(snapshot.Depth)
|
||
m.pollMetrics.UpdateQueueSnapshot(snapshot)
|
||
}
|
||
|
||
func (m *Monitor) allowExecution(task ScheduledTask) bool {
|
||
if m.circuitBreakers == nil {
|
||
return true
|
||
}
|
||
key := schedulerKey(task.InstanceType, task.InstanceName)
|
||
breaker := m.ensureBreaker(key)
|
||
allowed := breaker.allow(time.Now())
|
||
m.updateBreakerMetric(task.InstanceType, task.InstanceName, breaker)
|
||
return allowed
|
||
}
|
||
|
||
func (m *Monitor) ensureBreaker(key string) *circuitBreaker {
|
||
m.mu.Lock()
|
||
defer m.mu.Unlock()
|
||
|
||
if m.circuitBreakers == nil {
|
||
m.circuitBreakers = make(map[string]*circuitBreaker)
|
||
}
|
||
if breaker, ok := m.circuitBreakers[key]; ok {
|
||
return breaker
|
||
}
|
||
baseRetry := m.breakerBaseRetry
|
||
if baseRetry <= 0 {
|
||
baseRetry = 5 * time.Second
|
||
}
|
||
maxDelay := m.breakerMaxDelay
|
||
if maxDelay <= 0 {
|
||
maxDelay = 5 * time.Minute
|
||
}
|
||
halfOpen := m.breakerHalfOpenWindow
|
||
if halfOpen <= 0 {
|
||
halfOpen = 30 * time.Second
|
||
}
|
||
breaker := newCircuitBreaker(3, baseRetry, maxDelay, halfOpen)
|
||
m.circuitBreakers[key] = breaker
|
||
return breaker
|
||
}
|
||
|
||
func (m *Monitor) recordTaskResult(instanceType InstanceType, instance string, pollErr error) {
|
||
if m == nil {
|
||
return
|
||
}
|
||
|
||
key := schedulerKey(instanceType, instance)
|
||
now := time.Now()
|
||
|
||
breaker := m.ensureBreaker(key)
|
||
|
||
m.mu.Lock()
|
||
status, ok := m.pollStatusMap[key]
|
||
if !ok {
|
||
status = &pollStatus{}
|
||
m.pollStatusMap[key] = status
|
||
}
|
||
|
||
if pollErr == nil {
|
||
if m.failureCounts != nil {
|
||
m.failureCounts[key] = 0
|
||
}
|
||
if m.lastOutcome != nil {
|
||
m.lastOutcome[key] = taskOutcome{
|
||
success: true,
|
||
transient: true,
|
||
err: nil,
|
||
recordedAt: now,
|
||
}
|
||
}
|
||
status.LastSuccess = now
|
||
status.ConsecutiveFailures = 0
|
||
status.FirstFailureAt = time.Time{}
|
||
m.mu.Unlock()
|
||
if breaker != nil {
|
||
breaker.recordSuccess()
|
||
m.updateBreakerMetric(instanceType, instance, breaker)
|
||
}
|
||
return
|
||
}
|
||
|
||
transient := isTransientError(pollErr)
|
||
category := "permanent"
|
||
if transient {
|
||
category = "transient"
|
||
}
|
||
if m.failureCounts != nil {
|
||
m.failureCounts[key] = m.failureCounts[key] + 1
|
||
}
|
||
if m.lastOutcome != nil {
|
||
m.lastOutcome[key] = taskOutcome{
|
||
success: false,
|
||
transient: transient,
|
||
err: pollErr,
|
||
recordedAt: now,
|
||
}
|
||
}
|
||
status.LastErrorAt = now
|
||
status.LastErrorMessage = pollErr.Error()
|
||
status.LastErrorCategory = category
|
||
status.ConsecutiveFailures++
|
||
if status.ConsecutiveFailures == 1 {
|
||
status.FirstFailureAt = now
|
||
}
|
||
m.mu.Unlock()
|
||
if breaker != nil {
|
||
breaker.recordFailure(now)
|
||
m.updateBreakerMetric(instanceType, instance, breaker)
|
||
}
|
||
}
|
||
|
||
// SchedulerHealthResponse contains complete scheduler health data for API exposure.
|
||
type SchedulerHealthResponse struct {
|
||
UpdatedAt time.Time `json:"updatedAt"`
|
||
Enabled bool `json:"enabled"`
|
||
Queue QueueSnapshot `json:"queue"`
|
||
DeadLetter DeadLetterSnapshot `json:"deadLetter"`
|
||
Breakers []BreakerSnapshot `json:"breakers,omitempty"`
|
||
Staleness []StalenessSnapshot `json:"staleness,omitempty"`
|
||
Instances []InstanceHealth `json:"instances"`
|
||
}
|
||
|
||
// DeadLetterSnapshot contains dead-letter queue data.
|
||
type DeadLetterSnapshot struct {
|
||
Count int `json:"count"`
|
||
Tasks []DeadLetterTask `json:"tasks"`
|
||
}
|
||
|
||
// SchedulerHealth returns a complete snapshot of scheduler health for API exposure.
|
||
func (m *Monitor) SchedulerHealth() SchedulerHealthResponse {
|
||
response := SchedulerHealthResponse{
|
||
UpdatedAt: time.Now(),
|
||
Enabled: m.config != nil && m.config.AdaptivePollingEnabled,
|
||
}
|
||
|
||
// Queue snapshot
|
||
if m.taskQueue != nil {
|
||
response.Queue = m.taskQueue.Snapshot()
|
||
if m.pollMetrics != nil {
|
||
m.pollMetrics.UpdateQueueSnapshot(response.Queue)
|
||
}
|
||
}
|
||
|
||
// Dead-letter queue snapshot
|
||
if m.deadLetterQueue != nil {
|
||
deadLetterTasks := m.deadLetterQueue.PeekAll(25) // limit to top 25
|
||
m.mu.RLock()
|
||
for i := range deadLetterTasks {
|
||
key := schedulerKey(InstanceType(deadLetterTasks[i].Type), deadLetterTasks[i].Instance)
|
||
if outcome, ok := m.lastOutcome[key]; ok && outcome.err != nil {
|
||
deadLetterTasks[i].LastError = outcome.err.Error()
|
||
}
|
||
if count, ok := m.failureCounts[key]; ok {
|
||
deadLetterTasks[i].Failures = count
|
||
}
|
||
}
|
||
m.mu.RUnlock()
|
||
response.DeadLetter = DeadLetterSnapshot{
|
||
Count: m.deadLetterQueue.Size(),
|
||
Tasks: deadLetterTasks,
|
||
}
|
||
m.updateDeadLetterMetrics()
|
||
}
|
||
|
||
// Circuit breaker snapshots
|
||
m.mu.RLock()
|
||
breakerSnapshots := make([]BreakerSnapshot, 0, len(m.circuitBreakers))
|
||
for key, breaker := range m.circuitBreakers {
|
||
state, failures, retryAt := breaker.State()
|
||
// Only include breakers that are not in default closed state with 0 failures
|
||
if state != "closed" || failures > 0 {
|
||
// Parse instance type and name from key
|
||
parts := strings.SplitN(key, "::", 2)
|
||
instanceType, instanceName := "unknown", key
|
||
if len(parts) == 2 {
|
||
instanceType, instanceName = parts[0], parts[1]
|
||
}
|
||
breakerSnapshots = append(breakerSnapshots, BreakerSnapshot{
|
||
Instance: instanceName,
|
||
Type: instanceType,
|
||
State: state,
|
||
Failures: failures,
|
||
RetryAt: retryAt,
|
||
})
|
||
}
|
||
}
|
||
m.mu.RUnlock()
|
||
response.Breakers = breakerSnapshots
|
||
|
||
// Staleness snapshots
|
||
if m.stalenessTracker != nil {
|
||
response.Staleness = m.stalenessTracker.Snapshot()
|
||
}
|
||
|
||
instanceInfos := make(map[string]*instanceInfo)
|
||
pollStatuses := make(map[string]pollStatus)
|
||
dlqInsights := make(map[string]dlqInsight)
|
||
breakerRefs := make(map[string]*circuitBreaker)
|
||
|
||
m.mu.RLock()
|
||
for k, v := range m.instanceInfoCache {
|
||
if v == nil {
|
||
continue
|
||
}
|
||
copyVal := *v
|
||
instanceInfos[k] = ©Val
|
||
}
|
||
for k, v := range m.pollStatusMap {
|
||
if v == nil {
|
||
continue
|
||
}
|
||
pollStatuses[k] = *v
|
||
}
|
||
for k, v := range m.dlqInsightMap {
|
||
if v == nil {
|
||
continue
|
||
}
|
||
dlqInsights[k] = *v
|
||
}
|
||
for k, v := range m.circuitBreakers {
|
||
if v != nil {
|
||
breakerRefs[k] = v
|
||
}
|
||
}
|
||
m.mu.RUnlock()
|
||
for key, breaker := range breakerRefs {
|
||
instanceType := InstanceType("unknown")
|
||
instanceName := key
|
||
if parts := strings.SplitN(key, "::", 2); len(parts) == 2 {
|
||
if parts[0] != "" {
|
||
instanceType = InstanceType(parts[0])
|
||
}
|
||
if parts[1] != "" {
|
||
instanceName = parts[1]
|
||
}
|
||
}
|
||
m.updateBreakerMetric(instanceType, instanceName, breaker)
|
||
}
|
||
|
||
keySet := make(map[string]struct{})
|
||
for k := range instanceInfos {
|
||
if k != "" {
|
||
keySet[k] = struct{}{}
|
||
}
|
||
}
|
||
for k := range pollStatuses {
|
||
if k != "" {
|
||
keySet[k] = struct{}{}
|
||
}
|
||
}
|
||
for k := range dlqInsights {
|
||
if k != "" {
|
||
keySet[k] = struct{}{}
|
||
}
|
||
}
|
||
for k := range breakerRefs {
|
||
if k != "" {
|
||
keySet[k] = struct{}{}
|
||
}
|
||
}
|
||
for _, task := range response.DeadLetter.Tasks {
|
||
if task.Instance == "" {
|
||
continue
|
||
}
|
||
keySet[schedulerKey(InstanceType(task.Type), task.Instance)] = struct{}{}
|
||
}
|
||
for _, snap := range response.Staleness {
|
||
if snap.Instance == "" {
|
||
continue
|
||
}
|
||
keySet[schedulerKey(InstanceType(snap.Type), snap.Instance)] = struct{}{}
|
||
}
|
||
|
||
if len(keySet) > 0 {
|
||
keys := make([]string, 0, len(keySet))
|
||
for k := range keySet {
|
||
keys = append(keys, k)
|
||
}
|
||
sort.Strings(keys)
|
||
|
||
instances := make([]InstanceHealth, 0, len(keys))
|
||
for _, key := range keys {
|
||
instType := "unknown"
|
||
instName := key
|
||
if parts := strings.SplitN(key, "::", 2); len(parts) == 2 {
|
||
if parts[0] != "" {
|
||
instType = parts[0]
|
||
}
|
||
if parts[1] != "" {
|
||
instName = parts[1]
|
||
}
|
||
}
|
||
instType = strings.TrimSpace(instType)
|
||
instName = strings.TrimSpace(instName)
|
||
|
||
info := instanceInfos[key]
|
||
display := instName
|
||
connection := ""
|
||
if info != nil {
|
||
if instType == "unknown" || instType == "" {
|
||
if info.Type != "" {
|
||
instType = string(info.Type)
|
||
}
|
||
}
|
||
if strings.Contains(info.Key, "::") {
|
||
if parts := strings.SplitN(info.Key, "::", 2); len(parts) == 2 {
|
||
if instName == key {
|
||
instName = parts[1]
|
||
}
|
||
if (instType == "" || instType == "unknown") && parts[0] != "" {
|
||
instType = parts[0]
|
||
}
|
||
}
|
||
}
|
||
if info.DisplayName != "" {
|
||
display = info.DisplayName
|
||
}
|
||
if info.Connection != "" {
|
||
connection = info.Connection
|
||
}
|
||
}
|
||
display = strings.TrimSpace(display)
|
||
connection = strings.TrimSpace(connection)
|
||
if display == "" {
|
||
display = instName
|
||
}
|
||
if display == "" {
|
||
display = connection
|
||
}
|
||
if instType == "" {
|
||
instType = "unknown"
|
||
}
|
||
if instName == "" {
|
||
instName = key
|
||
}
|
||
|
||
status, hasStatus := pollStatuses[key]
|
||
instanceStatus := InstancePollStatus{}
|
||
if hasStatus {
|
||
instanceStatus.ConsecutiveFailures = status.ConsecutiveFailures
|
||
instanceStatus.LastSuccess = timePtr(status.LastSuccess)
|
||
if !status.FirstFailureAt.IsZero() {
|
||
instanceStatus.FirstFailureAt = timePtr(status.FirstFailureAt)
|
||
}
|
||
if !status.LastErrorAt.IsZero() && status.LastErrorMessage != "" {
|
||
instanceStatus.LastError = &ErrorDetail{
|
||
At: status.LastErrorAt,
|
||
Message: status.LastErrorMessage,
|
||
Category: status.LastErrorCategory,
|
||
}
|
||
}
|
||
}
|
||
|
||
breakerInfo := InstanceBreaker{
|
||
State: "closed",
|
||
FailureCount: 0,
|
||
}
|
||
if br, ok := breakerRefs[key]; ok && br != nil {
|
||
state, failures, retryAt, since, lastTransition := br.stateDetails()
|
||
if state != "" {
|
||
breakerInfo.State = state
|
||
}
|
||
breakerInfo.FailureCount = failures
|
||
breakerInfo.RetryAt = timePtr(retryAt)
|
||
breakerInfo.Since = timePtr(since)
|
||
breakerInfo.LastTransition = timePtr(lastTransition)
|
||
}
|
||
|
||
dlqInfo := InstanceDLQ{Present: false}
|
||
if dlq, ok := dlqInsights[key]; ok {
|
||
dlqInfo.Present = true
|
||
dlqInfo.Reason = dlq.Reason
|
||
dlqInfo.FirstAttempt = timePtr(dlq.FirstAttempt)
|
||
dlqInfo.LastAttempt = timePtr(dlq.LastAttempt)
|
||
dlqInfo.RetryCount = dlq.RetryCount
|
||
dlqInfo.NextRetry = timePtr(dlq.NextRetry)
|
||
}
|
||
|
||
// Collect any warnings for this instance
|
||
var warnings []string
|
||
if instType == "pve" {
|
||
if warning, ok := m.backupPermissionWarnings[instName]; ok {
|
||
warnings = append(warnings, warning)
|
||
}
|
||
}
|
||
|
||
instances = append(instances, InstanceHealth{
|
||
Key: key,
|
||
Type: instType,
|
||
DisplayName: display,
|
||
Instance: instName,
|
||
Connection: connection,
|
||
PollStatus: instanceStatus,
|
||
Breaker: breakerInfo,
|
||
DeadLetter: dlqInfo,
|
||
Warnings: warnings,
|
||
})
|
||
}
|
||
|
||
response.Instances = instances
|
||
} else {
|
||
response.Instances = []InstanceHealth{}
|
||
}
|
||
|
||
return response
|
||
}
|
||
|
||
func isTransientError(err error) bool {
|
||
if err == nil {
|
||
return true
|
||
}
|
||
if errors.IsRetryableError(err) {
|
||
return true
|
||
}
|
||
if stderrors.Is(err, context.Canceled) || stderrors.Is(err, context.DeadlineExceeded) {
|
||
return true
|
||
}
|
||
return false
|
||
}
|
||
|
||
func shouldTryPortlessFallback(err error) bool {
|
||
if err == nil {
|
||
return false
|
||
}
|
||
msg := strings.ToLower(err.Error())
|
||
if strings.Contains(msg, "connection refused") ||
|
||
strings.Contains(msg, "connection reset") ||
|
||
strings.Contains(msg, "no such host") ||
|
||
strings.Contains(msg, "client.timeout exceeded") ||
|
||
strings.Contains(msg, "i/o timeout") ||
|
||
strings.Contains(msg, "context deadline exceeded") {
|
||
return true
|
||
}
|
||
return false
|
||
}
|
||
|
||
// retryPVEPortFallback handles the case where a normalized :8006 host is unreachable
|
||
// because the actual endpoint is fronted by a reverse proxy on 443. If the initial
|
||
// GetNodes call fails with a connection error and the host has the default PVE port,
|
||
// retry without the default port to hit the proxy. On success, swap the client so
|
||
// subsequent polls reuse the working endpoint.
|
||
func (m *Monitor) retryPVEPortFallback(ctx context.Context, instanceName string, instanceCfg *config.PVEInstance, currentClient PVEClientInterface, cause error) ([]proxmox.Node, PVEClientInterface, error) {
|
||
if instanceCfg == nil || !shouldTryPortlessFallback(cause) {
|
||
return nil, currentClient, cause
|
||
}
|
||
|
||
fallbackHost := config.StripDefaultPort(instanceCfg.Host, config.DefaultPVEPort)
|
||
if fallbackHost == "" || fallbackHost == instanceCfg.Host {
|
||
return nil, currentClient, cause
|
||
}
|
||
|
||
clientCfg := config.CreateProxmoxConfigWithHost(instanceCfg, fallbackHost, false)
|
||
if clientCfg.Timeout <= 0 {
|
||
clientCfg.Timeout = m.config.ConnectionTimeout
|
||
}
|
||
|
||
fallbackClient, err := newProxmoxClientFunc(clientCfg)
|
||
if err != nil {
|
||
return nil, currentClient, cause
|
||
}
|
||
|
||
fallbackNodes, err := fallbackClient.GetNodes(ctx)
|
||
if err != nil {
|
||
return nil, currentClient, cause
|
||
}
|
||
|
||
// Switch to the working host for the remainder of the poll (and future polls)
|
||
primaryHost := instanceCfg.Host
|
||
|
||
// Persist with an explicit port to avoid re-normalization back to :8006 on reloads.
|
||
persistHost := fallbackHost
|
||
if parsed, err := url.Parse(fallbackHost); err == nil && parsed.Host != "" && parsed.Port() == "" {
|
||
port := "443"
|
||
if strings.EqualFold(parsed.Scheme, "http") {
|
||
port = "80"
|
||
}
|
||
parsed.Host = net.JoinHostPort(parsed.Hostname(), port)
|
||
persistHost = parsed.Scheme + "://" + parsed.Host
|
||
}
|
||
|
||
instanceCfg.Host = persistHost
|
||
m.pveClients[instanceName] = fallbackClient
|
||
|
||
// Update in-memory config so subsequent polls build clients against the working port.
|
||
for i := range m.config.PVEInstances {
|
||
if m.config.PVEInstances[i].Name == instanceName {
|
||
m.config.PVEInstances[i].Host = persistHost
|
||
break
|
||
}
|
||
}
|
||
|
||
// Persist to disk so restarts keep the working endpoint.
|
||
if m.persistence != nil {
|
||
if err := m.persistence.SaveNodesConfig(m.config.PVEInstances, m.config.PBSInstances, m.config.PMGInstances); err != nil {
|
||
log.Warn().Err(err).Str("instance", instanceName).Msg("Failed to persist fallback PVE host")
|
||
}
|
||
}
|
||
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Str("primary", primaryHost).
|
||
Str("fallback", persistHost).
|
||
Msg("Primary PVE host failed; using fallback without default port")
|
||
|
||
return fallbackNodes, fallbackClient, nil
|
||
}
|
||
|
||
// pollPVEInstance polls a single PVE instance
|
||
func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, client PVEClientInterface) {
|
||
defer recoverFromPanic(fmt.Sprintf("pollPVEInstance-%s", instanceName))
|
||
|
||
start := time.Now()
|
||
debugEnabled := logging.IsLevelEnabled(zerolog.DebugLevel)
|
||
var pollErr error
|
||
if m.pollMetrics != nil {
|
||
m.pollMetrics.IncInFlight("pve")
|
||
defer m.pollMetrics.DecInFlight("pve")
|
||
defer func() {
|
||
m.pollMetrics.RecordResult(PollResult{
|
||
InstanceName: instanceName,
|
||
InstanceType: "pve",
|
||
Success: pollErr == nil,
|
||
Error: pollErr,
|
||
StartTime: start,
|
||
EndTime: time.Now(),
|
||
})
|
||
}()
|
||
}
|
||
if m.stalenessTracker != nil {
|
||
defer func() {
|
||
if pollErr == nil {
|
||
m.stalenessTracker.UpdateSuccess(InstanceTypePVE, instanceName, nil)
|
||
} else {
|
||
m.stalenessTracker.UpdateError(InstanceTypePVE, instanceName)
|
||
}
|
||
}()
|
||
}
|
||
defer m.recordTaskResult(InstanceTypePVE, instanceName, pollErr)
|
||
|
||
// Check if context is cancelled
|
||
select {
|
||
case <-ctx.Done():
|
||
pollErr = ctx.Err()
|
||
if debugEnabled {
|
||
log.Debug().Str("instance", instanceName).Msg("Polling cancelled")
|
||
}
|
||
return
|
||
default:
|
||
}
|
||
|
||
if debugEnabled {
|
||
log.Debug().Str("instance", instanceName).Msg("Polling PVE instance")
|
||
}
|
||
|
||
// Get instance config
|
||
var instanceCfg *config.PVEInstance
|
||
for _, cfg := range m.config.PVEInstances {
|
||
if cfg.Name == instanceName {
|
||
instanceCfg = &cfg
|
||
break
|
||
}
|
||
}
|
||
if instanceCfg == nil {
|
||
pollErr = fmt.Errorf("pve instance config not found for %s", instanceName)
|
||
return
|
||
}
|
||
|
||
// Poll nodes
|
||
nodes, err := client.GetNodes(ctx)
|
||
if err != nil {
|
||
if fallbackNodes, fallbackClient, fallbackErr := m.retryPVEPortFallback(ctx, instanceName, instanceCfg, client, err); fallbackErr == nil {
|
||
client = fallbackClient
|
||
nodes = fallbackNodes
|
||
} else {
|
||
monErr := errors.WrapConnectionError("poll_nodes", instanceName, err)
|
||
pollErr = monErr
|
||
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to get nodes")
|
||
m.state.SetConnectionHealth(instanceName, false)
|
||
|
||
// Track auth failure if it's an authentication error
|
||
if errors.IsAuthError(err) {
|
||
m.recordAuthFailure(instanceName, "pve")
|
||
}
|
||
return
|
||
}
|
||
}
|
||
|
||
// Reset auth failures on successful connection
|
||
m.resetAuthFailures(instanceName, "pve")
|
||
|
||
// Check if client is a ClusterClient to determine health status
|
||
connectionHealthStr := "healthy"
|
||
if clusterClient, ok := client.(*proxmox.ClusterClient); ok {
|
||
// For cluster clients, check if all endpoints are healthy
|
||
healthStatus := clusterClient.GetHealthStatus()
|
||
healthyCount := 0
|
||
totalCount := len(healthStatus)
|
||
|
||
for _, isHealthy := range healthStatus {
|
||
if isHealthy {
|
||
healthyCount++
|
||
}
|
||
}
|
||
|
||
if healthyCount == 0 {
|
||
// All endpoints are down
|
||
connectionHealthStr = "error"
|
||
m.state.SetConnectionHealth(instanceName, false)
|
||
} else if healthyCount < totalCount {
|
||
// Some endpoints are down - check if cluster still has quorum
|
||
// A cluster with quorum is healthy even if some nodes are intentionally offline
|
||
// (e.g., backup nodes not running). Only mark as degraded if no quorum.
|
||
isQuorate, err := clusterClient.IsQuorate(ctx)
|
||
if err != nil {
|
||
// Couldn't check quorum - log but continue (assume healthy if we have connectivity)
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Err(err).
|
||
Msg("Could not check cluster quorum status")
|
||
isQuorate = true // Assume healthy if we can't check
|
||
}
|
||
|
||
if isQuorate {
|
||
// Cluster has quorum - healthy even with some nodes offline
|
||
connectionHealthStr = "healthy"
|
||
m.state.SetConnectionHealth(instanceName, true)
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Int("healthy", healthyCount).
|
||
Int("total", totalCount).
|
||
Msg("Cluster has quorum - some API endpoints unreachable but cluster is healthy")
|
||
} else {
|
||
// Cluster lost quorum - this is actually degraded/critical
|
||
connectionHealthStr = "degraded"
|
||
m.state.SetConnectionHealth(instanceName, true) // Still functional but degraded
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Int("healthy", healthyCount).
|
||
Int("total", totalCount).
|
||
Msg("Cluster lost quorum - degraded state")
|
||
}
|
||
} else {
|
||
// All endpoints are healthy
|
||
connectionHealthStr = "healthy"
|
||
m.state.SetConnectionHealth(instanceName, true)
|
||
}
|
||
} else {
|
||
// Regular client - simple healthy/unhealthy
|
||
m.state.SetConnectionHealth(instanceName, true)
|
||
}
|
||
|
||
// Capture previous memory metrics so we can preserve them if detailed status fails
|
||
prevState := m.GetState()
|
||
prevNodeMemory := make(map[string]models.Memory)
|
||
prevInstanceNodes := make([]models.Node, 0)
|
||
for _, existingNode := range prevState.Nodes {
|
||
if existingNode.Instance != instanceName {
|
||
continue
|
||
}
|
||
prevNodeMemory[existingNode.ID] = existingNode.Memory
|
||
prevInstanceNodes = append(prevInstanceNodes, existingNode)
|
||
}
|
||
|
||
// Convert to models
|
||
var modelNodes []models.Node
|
||
nodeEffectiveStatus := make(map[string]string) // Track effective status (with grace period) for each node
|
||
// Parallel node polling
|
||
type nodePollResult struct {
|
||
node models.Node
|
||
effectiveStatus string
|
||
}
|
||
|
||
resultChan := make(chan nodePollResult, len(nodes))
|
||
var wg sync.WaitGroup
|
||
|
||
if debugEnabled {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Int("nodes", len(nodes)).
|
||
Msg("Starting parallel node polling")
|
||
}
|
||
|
||
for _, node := range nodes {
|
||
wg.Add(1)
|
||
go func(node proxmox.Node) {
|
||
defer wg.Done()
|
||
|
||
modelNode, effectiveStatus, _ := m.pollPVENode(ctx, instanceName, instanceCfg, client, node, connectionHealthStr, prevNodeMemory, prevInstanceNodes)
|
||
|
||
resultChan <- nodePollResult{
|
||
node: modelNode,
|
||
effectiveStatus: effectiveStatus,
|
||
}
|
||
}(node)
|
||
}
|
||
|
||
wg.Wait()
|
||
close(resultChan)
|
||
|
||
for res := range resultChan {
|
||
modelNodes = append(modelNodes, res.node)
|
||
nodeEffectiveStatus[res.node.Name] = res.effectiveStatus
|
||
}
|
||
|
||
if len(modelNodes) == 0 && len(prevInstanceNodes) > 0 {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Int("previousCount", len(prevInstanceNodes)).
|
||
Msg("No Proxmox nodes returned this cycle - preserving previous state")
|
||
|
||
// Mark connection health as degraded to reflect polling failure
|
||
m.state.SetConnectionHealth(instanceName, false)
|
||
|
||
preserved := make([]models.Node, 0, len(prevInstanceNodes))
|
||
for _, prevNode := range prevInstanceNodes {
|
||
nodeCopy := prevNode
|
||
nodeCopy.Status = "offline"
|
||
nodeCopy.ConnectionHealth = "error"
|
||
nodeCopy.Uptime = 0
|
||
nodeCopy.CPU = 0
|
||
preserved = append(preserved, nodeCopy)
|
||
}
|
||
modelNodes = preserved
|
||
}
|
||
|
||
// Update state first so we have nodes available
|
||
m.state.UpdateNodesForInstance(instanceName, modelNodes)
|
||
|
||
// Storage fallback is used to provide disk metrics when rootfs is not available.
|
||
// We run this asynchronously with a short timeout so it doesn't block VM/container polling.
|
||
// This addresses the issue where slow storage APIs (e.g., NFS mounts) can cause the entire
|
||
// polling task to timeout before reaching VM/container polling.
|
||
storageByNode := make(map[string]models.Disk)
|
||
var storageByNodeMu sync.Mutex
|
||
storageFallbackDone := make(chan struct{})
|
||
|
||
if instanceCfg.MonitorStorage {
|
||
go func() {
|
||
defer close(storageFallbackDone)
|
||
|
||
// Use a short timeout for storage fallback - it's an optimization, not critical
|
||
storageFallbackTimeout := 10 * time.Second
|
||
storageCtx, storageCancel := context.WithTimeout(context.Background(), storageFallbackTimeout)
|
||
defer storageCancel()
|
||
|
||
_, err := client.GetAllStorage(storageCtx)
|
||
if err != nil {
|
||
if storageCtx.Err() != nil {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Dur("timeout", storageFallbackTimeout).
|
||
Msg("Storage fallback timed out - continuing without disk fallback data")
|
||
}
|
||
return
|
||
}
|
||
|
||
for _, node := range nodes {
|
||
// Check if context was cancelled
|
||
select {
|
||
case <-storageCtx.Done():
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Msg("Storage fallback cancelled - partial data collected")
|
||
return
|
||
default:
|
||
}
|
||
|
||
// Skip offline nodes to avoid 595 errors
|
||
if nodeEffectiveStatus[node.Node] != "online" {
|
||
continue
|
||
}
|
||
|
||
nodeStorages, err := client.GetStorage(storageCtx, node.Node)
|
||
if err != nil {
|
||
continue
|
||
}
|
||
|
||
// Look for local or local-lvm storage as most stable disk metric
|
||
for _, storage := range nodeStorages {
|
||
if reason, skip := readOnlyFilesystemReason(storage.Type, storage.Total, storage.Used); skip {
|
||
log.Debug().
|
||
Str("node", node.Node).
|
||
Str("storage", storage.Storage).
|
||
Str("type", storage.Type).
|
||
Str("skipReason", reason).
|
||
Uint64("total", storage.Total).
|
||
Uint64("used", storage.Used).
|
||
Msg("Skipping read-only storage while building disk fallback")
|
||
continue
|
||
}
|
||
if storage.Storage == "local" || storage.Storage == "local-lvm" {
|
||
disk := models.Disk{
|
||
Total: int64(storage.Total),
|
||
Used: int64(storage.Used),
|
||
Free: int64(storage.Available),
|
||
Usage: safePercentage(float64(storage.Used), float64(storage.Total)),
|
||
}
|
||
// Prefer "local" over "local-lvm"
|
||
storageByNodeMu.Lock()
|
||
if _, exists := storageByNode[node.Node]; !exists || storage.Storage == "local" {
|
||
storageByNode[node.Node] = disk
|
||
log.Debug().
|
||
Str("node", node.Node).
|
||
Str("storage", storage.Storage).
|
||
Float64("usage", disk.Usage).
|
||
Msg("Using storage for disk metrics fallback")
|
||
}
|
||
storageByNodeMu.Unlock()
|
||
}
|
||
}
|
||
}
|
||
}()
|
||
} else {
|
||
// No storage monitoring, close channel immediately
|
||
close(storageFallbackDone)
|
||
}
|
||
|
||
// Pre-populate node display name cache so guest alerts created below
|
||
// can resolve friendly names. CheckNode() also does this, but it runs
|
||
// after guest polling — without this, the first alert notification for
|
||
// a guest would show the raw Proxmox node name.
|
||
for i := range modelNodes {
|
||
if modelNodes[i].DisplayName != "" {
|
||
m.alertManager.UpdateNodeDisplayName(modelNodes[i].Name, modelNodes[i].DisplayName)
|
||
}
|
||
}
|
||
|
||
// Poll VMs and containers FIRST - this is the most critical data.
|
||
// This happens immediately after starting the storage fallback goroutine,
|
||
// so VM/container polling runs in parallel with (and is not blocked by) storage operations.
|
||
if instanceCfg.MonitorVMs || instanceCfg.MonitorContainers {
|
||
select {
|
||
case <-ctx.Done():
|
||
pollErr = ctx.Err()
|
||
return
|
||
default:
|
||
// Always try the efficient cluster/resources endpoint first
|
||
// This endpoint works on both clustered and standalone nodes
|
||
// Testing confirmed it works on standalone nodes like pimox
|
||
useClusterEndpoint := m.pollVMsAndContainersEfficient(ctx, instanceName, instanceCfg.ClusterName, instanceCfg.IsCluster, client, nodeEffectiveStatus)
|
||
|
||
if !useClusterEndpoint {
|
||
// Fall back to traditional polling only if cluster/resources not available
|
||
// This should be rare - only for very old Proxmox versions
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Msg("cluster/resources endpoint not available, using traditional polling")
|
||
|
||
// Check if configuration needs updating
|
||
if instanceCfg.IsCluster {
|
||
isActuallyCluster, checkErr := client.IsClusterMember(ctx)
|
||
if checkErr == nil && !isActuallyCluster {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Msg("Instance marked as cluster but is actually standalone - consider updating configuration")
|
||
instanceCfg.IsCluster = false
|
||
}
|
||
}
|
||
|
||
// Use optimized parallel polling for better performance
|
||
if instanceCfg.MonitorVMs {
|
||
m.pollVMsWithNodes(ctx, instanceName, instanceCfg.ClusterName, instanceCfg.IsCluster, client, nodes, nodeEffectiveStatus)
|
||
}
|
||
if instanceCfg.MonitorContainers {
|
||
m.pollContainersWithNodes(ctx, instanceName, instanceCfg.ClusterName, instanceCfg.IsCluster, client, nodes, nodeEffectiveStatus)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Poll physical disks for health monitoring (enabled by default unless explicitly disabled)
|
||
// Skip if MonitorPhysicalDisks is explicitly set to false
|
||
// Physical disk polling runs in a background goroutine since GetDisks can be slow
|
||
// and we don't want it to cause task timeouts. It has its own 5-minute interval anyway.
|
||
if instanceCfg.MonitorPhysicalDisks != nil && !*instanceCfg.MonitorPhysicalDisks {
|
||
log.Debug().Str("instance", instanceName).Msg("Physical disk monitoring explicitly disabled")
|
||
// Keep any existing disk data visible (don't clear it)
|
||
} else {
|
||
// Enabled by default (when nil or true)
|
||
// Determine polling interval (default 5 minutes to avoid spinning up HDDs too frequently)
|
||
pollingInterval := 5 * time.Minute
|
||
if instanceCfg.PhysicalDiskPollingMinutes > 0 {
|
||
pollingInterval = time.Duration(instanceCfg.PhysicalDiskPollingMinutes) * time.Minute
|
||
}
|
||
|
||
// Check if enough time has elapsed since last poll
|
||
m.mu.Lock()
|
||
lastPoll, exists := m.lastPhysicalDiskPoll[instanceName]
|
||
shouldPoll := !exists || time.Since(lastPoll) >= pollingInterval
|
||
if shouldPoll {
|
||
m.lastPhysicalDiskPoll[instanceName] = time.Now()
|
||
}
|
||
m.mu.Unlock()
|
||
|
||
if !shouldPoll {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Dur("sinceLastPoll", time.Since(lastPoll)).
|
||
Dur("interval", pollingInterval).
|
||
Msg("Skipping physical disk poll - interval not elapsed")
|
||
// Refresh NVMe temperatures using the latest sensor data even when we skip the disk poll
|
||
currentState := m.state.GetSnapshot()
|
||
existing := make([]models.PhysicalDisk, 0)
|
||
for _, disk := range currentState.PhysicalDisks {
|
||
if disk.Instance == instanceName {
|
||
existing = append(existing, disk)
|
||
}
|
||
}
|
||
if len(existing) > 0 {
|
||
// Use nodes from state snapshot - they have LinkedHostAgentID populated
|
||
// (the local modelNodes variable doesn't have this field set)
|
||
updated := mergeNVMeTempsIntoDisks(existing, currentState.Nodes)
|
||
// Also merge SMART data from linked host agents
|
||
updated = mergeHostAgentSMARTIntoDisks(updated, currentState.Nodes, currentState.Hosts)
|
||
m.state.UpdatePhysicalDisks(instanceName, updated)
|
||
}
|
||
} else {
|
||
// Run physical disk polling in background to avoid blocking the main task
|
||
go func(inst string, pveClient PVEClientInterface, nodeList []proxmox.Node, nodeStatus map[string]string, modelNodesCopy []models.Node) {
|
||
defer recoverFromPanic(fmt.Sprintf("pollPhysicalDisks-%s", inst))
|
||
|
||
// Use a generous timeout for disk polling
|
||
diskTimeout := 60 * time.Second
|
||
diskCtx, diskCancel := context.WithTimeout(context.Background(), diskTimeout)
|
||
defer diskCancel()
|
||
|
||
log.Debug().
|
||
Int("nodeCount", len(nodeList)).
|
||
Dur("interval", pollingInterval).
|
||
Msg("Starting disk health polling")
|
||
|
||
// Get existing disks from state to preserve data for offline nodes
|
||
currentState := m.state.GetSnapshot()
|
||
existingDisksMap := make(map[string]models.PhysicalDisk)
|
||
for _, disk := range currentState.PhysicalDisks {
|
||
if disk.Instance == inst {
|
||
existingDisksMap[disk.ID] = disk
|
||
}
|
||
}
|
||
|
||
var allDisks []models.PhysicalDisk
|
||
polledNodes := make(map[string]bool) // Track which nodes we successfully polled
|
||
|
||
for _, node := range nodeList {
|
||
// Check if context timed out
|
||
select {
|
||
case <-diskCtx.Done():
|
||
log.Debug().
|
||
Str("instance", inst).
|
||
Msg("Physical disk polling timed out - preserving existing data")
|
||
return
|
||
default:
|
||
}
|
||
|
||
// Skip offline nodes but preserve their existing disk data
|
||
if nodeStatus[node.Node] != "online" {
|
||
log.Debug().Str("node", node.Node).Msg("Skipping disk poll for offline node - preserving existing data")
|
||
continue
|
||
}
|
||
|
||
// Get disk list for this node
|
||
log.Debug().Str("node", node.Node).Msg("Getting disk list for node")
|
||
disks, err := pveClient.GetDisks(diskCtx, node.Node)
|
||
if err != nil {
|
||
// Check if it's a permission error or if the endpoint doesn't exist
|
||
errStr := err.Error()
|
||
if strings.Contains(errStr, "401") || strings.Contains(errStr, "403") {
|
||
log.Warn().
|
||
Str("node", node.Node).
|
||
Err(err).
|
||
Msg("Insufficient permissions to access disk information - check API token permissions")
|
||
} else if strings.Contains(errStr, "404") || strings.Contains(errStr, "501") {
|
||
log.Info().
|
||
Str("node", node.Node).
|
||
Msg("Disk monitoring not available on this node (may be using non-standard storage)")
|
||
} else {
|
||
log.Warn().
|
||
Str("node", node.Node).
|
||
Err(err).
|
||
Msg("Failed to get disk list")
|
||
}
|
||
continue
|
||
}
|
||
|
||
// Mark this node as successfully polled
|
||
polledNodes[node.Node] = true
|
||
|
||
// Check each disk for health issues and add to state
|
||
for _, disk := range disks {
|
||
// Create PhysicalDisk model
|
||
diskID := fmt.Sprintf("%s-%s-%s", inst, node.Node, strings.ReplaceAll(disk.DevPath, "/", "-"))
|
||
physicalDisk := models.PhysicalDisk{
|
||
ID: diskID,
|
||
Node: node.Node,
|
||
Instance: inst,
|
||
DevPath: disk.DevPath,
|
||
Model: disk.Model,
|
||
Serial: disk.Serial,
|
||
WWN: disk.WWN,
|
||
Type: disk.Type,
|
||
Size: disk.Size,
|
||
Health: disk.Health,
|
||
Wearout: disk.Wearout,
|
||
RPM: disk.RPM,
|
||
Used: disk.Used,
|
||
LastChecked: time.Now(),
|
||
}
|
||
|
||
allDisks = append(allDisks, physicalDisk)
|
||
|
||
log.Debug().
|
||
Str("node", node.Node).
|
||
Str("disk", disk.DevPath).
|
||
Str("model", disk.Model).
|
||
Str("health", disk.Health).
|
||
Int("wearout", disk.Wearout).
|
||
Msg("Checking disk health")
|
||
|
||
normalizedHealth := strings.ToUpper(strings.TrimSpace(disk.Health))
|
||
if normalizedHealth != "" && normalizedHealth != "UNKNOWN" && normalizedHealth != "PASSED" && normalizedHealth != "OK" {
|
||
// Disk has failed or is failing - alert manager will handle this
|
||
log.Warn().
|
||
Str("node", node.Node).
|
||
Str("disk", disk.DevPath).
|
||
Str("model", disk.Model).
|
||
Str("health", disk.Health).
|
||
Int("wearout", disk.Wearout).
|
||
Msg("Disk health issue detected")
|
||
|
||
// Pass disk info to alert manager
|
||
m.alertManager.CheckDiskHealth(inst, node.Node, disk)
|
||
} else if disk.Wearout > 0 && disk.Wearout < 10 {
|
||
// Low wearout warning (less than 10% life remaining)
|
||
log.Warn().
|
||
Str("node", node.Node).
|
||
Str("disk", disk.DevPath).
|
||
Str("model", disk.Model).
|
||
Int("wearout", disk.Wearout).
|
||
Msg("SSD wearout critical - less than 10% life remaining")
|
||
|
||
// Pass to alert manager for wearout alert
|
||
m.alertManager.CheckDiskHealth(inst, node.Node, disk)
|
||
}
|
||
}
|
||
}
|
||
|
||
// Preserve existing disk data for nodes that weren't polled (offline or error)
|
||
for _, existingDisk := range existingDisksMap {
|
||
// Only preserve if we didn't poll this node
|
||
if !polledNodes[existingDisk.Node] {
|
||
// Keep the existing disk data but update the LastChecked to indicate it's stale
|
||
allDisks = append(allDisks, existingDisk)
|
||
log.Debug().
|
||
Str("node", existingDisk.Node).
|
||
Str("disk", existingDisk.DevPath).
|
||
Msg("Preserving existing disk data for unpolled node")
|
||
}
|
||
}
|
||
|
||
// Use nodes from state snapshot - they have LinkedHostAgentID populated
|
||
// (modelNodesCopy passed to this goroutine doesn't have this field set)
|
||
allDisks = mergeNVMeTempsIntoDisks(allDisks, currentState.Nodes)
|
||
// Also merge SMART data from linked host agents
|
||
allDisks = mergeHostAgentSMARTIntoDisks(allDisks, currentState.Nodes, currentState.Hosts)
|
||
|
||
// Write SMART metrics to persistent store
|
||
if m.metricsStore != nil {
|
||
now := time.Now()
|
||
for _, disk := range allDisks {
|
||
m.writeSMARTMetrics(disk, now)
|
||
}
|
||
}
|
||
|
||
// Update physical disks in state
|
||
log.Debug().
|
||
Str("instance", inst).
|
||
Int("diskCount", len(allDisks)).
|
||
Int("preservedCount", len(existingDisksMap)-len(polledNodes)).
|
||
Msg("Updating physical disks in state")
|
||
m.state.UpdatePhysicalDisks(inst, allDisks)
|
||
}(instanceName, client, nodes, nodeEffectiveStatus, modelNodes)
|
||
}
|
||
}
|
||
// Note: Physical disk monitoring is now enabled by default with a 5-minute polling interval.
|
||
// Users can explicitly disable it in node settings. Disk data is preserved between polls.
|
||
|
||
// Wait for storage fallback to complete (with a short timeout) before using the data.
|
||
// This is non-blocking in the sense that VM/container polling has already completed by now.
|
||
// We give the storage fallback goroutine up to 2 additional seconds to finish if it's still running.
|
||
select {
|
||
case <-storageFallbackDone:
|
||
// Storage fallback completed normally
|
||
case <-time.After(2 * time.Second):
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Msg("Storage fallback still running - proceeding without waiting (disk fallback may be unavailable)")
|
||
}
|
||
|
||
// Update nodes with storage fallback if rootfs was not available
|
||
// Copy storageByNode under lock, then release to avoid holding during metric updates
|
||
storageByNodeMu.Lock()
|
||
localStorageByNode := make(map[string]models.Disk, len(storageByNode))
|
||
for k, v := range storageByNode {
|
||
localStorageByNode[k] = v
|
||
}
|
||
storageByNodeMu.Unlock()
|
||
|
||
for i := range modelNodes {
|
||
if modelNodes[i].Disk.Total == 0 {
|
||
if disk, exists := localStorageByNode[modelNodes[i].Name]; exists {
|
||
modelNodes[i].Disk = disk
|
||
log.Debug().
|
||
Str("node", modelNodes[i].Name).
|
||
Float64("usage", disk.Usage).
|
||
Msg("Applied storage fallback for disk metrics")
|
||
}
|
||
}
|
||
|
||
if modelNodes[i].Status == "online" {
|
||
// Record node metrics history only for online nodes
|
||
now := time.Now()
|
||
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "cpu", modelNodes[i].CPU*100, now)
|
||
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "memory", modelNodes[i].Memory.Usage, now)
|
||
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "disk", modelNodes[i].Disk.Usage, now)
|
||
// Also write to persistent store
|
||
if m.metricsStore != nil {
|
||
m.metricsStore.Write("node", modelNodes[i].ID, "cpu", modelNodes[i].CPU*100, now)
|
||
m.metricsStore.Write("node", modelNodes[i].ID, "memory", modelNodes[i].Memory.Usage, now)
|
||
m.metricsStore.Write("node", modelNodes[i].ID, "disk", modelNodes[i].Disk.Usage, now)
|
||
}
|
||
}
|
||
|
||
// Check thresholds for alerts
|
||
m.alertManager.CheckNode(modelNodes[i])
|
||
}
|
||
|
||
// Update state again with corrected disk metrics
|
||
m.state.UpdateNodesForInstance(instanceName, modelNodes)
|
||
|
||
// Clean up alerts for nodes that no longer exist
|
||
// Get all nodes from the global state (includes all instances)
|
||
existingNodes := make(map[string]bool)
|
||
allState := m.state.GetSnapshot()
|
||
for _, node := range allState.Nodes {
|
||
existingNodes[node.Name] = true
|
||
}
|
||
m.alertManager.CleanupAlertsForNodes(existingNodes)
|
||
|
||
// Periodically re-check cluster status for nodes marked as standalone
|
||
// This addresses issue #437 where clusters aren't detected on first attempt
|
||
if !instanceCfg.IsCluster {
|
||
// Check every 5 minutes if this is actually a cluster
|
||
if time.Since(m.lastClusterCheck[instanceName]) > 5*time.Minute {
|
||
m.lastClusterCheck[instanceName] = time.Now()
|
||
|
||
// Try to detect if this is actually a cluster
|
||
isActuallyCluster, checkErr := client.IsClusterMember(ctx)
|
||
if checkErr == nil && isActuallyCluster {
|
||
// This node is actually part of a cluster!
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Msg("Detected that standalone node is actually part of a cluster - updating configuration")
|
||
|
||
// Update the configuration
|
||
for i := range m.config.PVEInstances {
|
||
if m.config.PVEInstances[i].Name == instanceName {
|
||
m.config.PVEInstances[i].IsCluster = true
|
||
// Note: We can't get the cluster name here without direct client access
|
||
// It will be detected on the next configuration update
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Msg("Marked node as cluster member - cluster name will be detected on next update")
|
||
|
||
// Save the updated configuration
|
||
if m.persistence != nil {
|
||
if err := m.persistence.SaveNodesConfig(m.config.PVEInstances, m.config.PBSInstances, m.config.PMGInstances); err != nil {
|
||
log.Warn().Err(err).Msg("Failed to persist updated node configuration")
|
||
}
|
||
}
|
||
break
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Update cluster endpoint online status if this is a cluster
|
||
if instanceCfg.IsCluster && len(instanceCfg.ClusterEndpoints) > 0 {
|
||
// Create a map of online nodes from our polling results
|
||
onlineNodes := make(map[string]bool)
|
||
for _, node := range modelNodes {
|
||
// Node is online if we successfully got its data
|
||
onlineNodes[node.Name] = node.Status == "online"
|
||
}
|
||
|
||
// Get Pulse connectivity status from ClusterClient if available
|
||
var pulseHealth map[string]proxmox.EndpointHealth
|
||
if clusterClient, ok := client.(*proxmox.ClusterClient); ok {
|
||
pulseHealth = clusterClient.GetHealthStatusWithErrors()
|
||
}
|
||
|
||
// Update the online status for each cluster endpoint
|
||
hasFingerprint := instanceCfg.Fingerprint != ""
|
||
for i := range instanceCfg.ClusterEndpoints {
|
||
if online, exists := onlineNodes[instanceCfg.ClusterEndpoints[i].NodeName]; exists {
|
||
instanceCfg.ClusterEndpoints[i].Online = online
|
||
if online {
|
||
instanceCfg.ClusterEndpoints[i].LastSeen = time.Now()
|
||
}
|
||
}
|
||
|
||
// Update Pulse connectivity status
|
||
if pulseHealth != nil {
|
||
// Try to find the endpoint in the health map by matching the effective URL
|
||
endpointURL := clusterEndpointEffectiveURL(instanceCfg.ClusterEndpoints[i], instanceCfg.VerifySSL, hasFingerprint)
|
||
if health, exists := pulseHealth[endpointURL]; exists {
|
||
reachable := health.Healthy
|
||
instanceCfg.ClusterEndpoints[i].PulseReachable = &reachable
|
||
if !health.LastCheck.IsZero() {
|
||
instanceCfg.ClusterEndpoints[i].LastPulseCheck = &health.LastCheck
|
||
}
|
||
instanceCfg.ClusterEndpoints[i].PulseError = health.LastError
|
||
}
|
||
}
|
||
}
|
||
|
||
// Update the config with the new online status
|
||
// This is needed so the UI can reflect the current status
|
||
for idx, cfg := range m.config.PVEInstances {
|
||
if cfg.Name == instanceName {
|
||
m.config.PVEInstances[idx].ClusterEndpoints = instanceCfg.ClusterEndpoints
|
||
break
|
||
}
|
||
}
|
||
}
|
||
|
||
// Poll storage in background if enabled - storage APIs can be slow (NFS mounts, etc.)
|
||
// so we run this asynchronously to prevent it from causing task timeouts.
|
||
// This is similar to how backup polling runs in the background.
|
||
if instanceCfg.MonitorStorage {
|
||
select {
|
||
case <-ctx.Done():
|
||
pollErr = ctx.Err()
|
||
return
|
||
default:
|
||
go func(inst string, pveClient PVEClientInterface, nodeList []proxmox.Node) {
|
||
defer recoverFromPanic(fmt.Sprintf("pollStorageWithNodes-%s", inst))
|
||
|
||
// Use a generous timeout for storage polling - it's not blocking the main task
|
||
storageTimeout := 60 * time.Second
|
||
storageCtx, storageCancel := context.WithTimeout(context.Background(), storageTimeout)
|
||
defer storageCancel()
|
||
|
||
m.pollStorageWithNodes(storageCtx, inst, pveClient, nodeList)
|
||
}(instanceName, client, nodes)
|
||
}
|
||
}
|
||
|
||
// Poll backups if enabled - respect configured interval or cycle gating
|
||
if instanceCfg.MonitorBackups {
|
||
if !m.config.EnableBackupPolling {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Msg("Skipping backup polling - globally disabled")
|
||
} else {
|
||
now := time.Now()
|
||
|
||
m.mu.RLock()
|
||
lastPoll := m.lastPVEBackupPoll[instanceName]
|
||
m.mu.RUnlock()
|
||
|
||
shouldPoll, reason, newLast := m.shouldRunBackupPoll(lastPoll, now)
|
||
if !shouldPoll {
|
||
if reason != "" {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("reason", reason).
|
||
Msg("Skipping PVE backup polling this cycle")
|
||
}
|
||
} else {
|
||
select {
|
||
case <-ctx.Done():
|
||
pollErr = ctx.Err()
|
||
return
|
||
default:
|
||
// Set initial timestamp before starting goroutine (prevents concurrent starts)
|
||
m.mu.Lock()
|
||
m.lastPVEBackupPoll[instanceName] = newLast
|
||
m.mu.Unlock()
|
||
|
||
// Run backup polling in a separate goroutine to avoid blocking real-time stats
|
||
go func(startTime time.Time, inst string, pveClient PVEClientInterface) {
|
||
defer recoverFromPanic(fmt.Sprintf("pollPVEBackups-%s", inst))
|
||
timeout := m.calculateBackupOperationTimeout(inst)
|
||
log.Info().
|
||
Str("instance", inst).
|
||
Dur("timeout", timeout).
|
||
Msg("Starting background backup/snapshot polling")
|
||
|
||
// The per-cycle ctx is canceled as soon as the main polling loop finishes,
|
||
// so derive the backup poll context from the long-lived runtime context instead.
|
||
parentCtx := m.runtimeCtx
|
||
if parentCtx == nil {
|
||
parentCtx = context.Background()
|
||
}
|
||
|
||
backupCtx, cancel := context.WithTimeout(parentCtx, timeout)
|
||
defer cancel()
|
||
|
||
// Poll backup tasks
|
||
m.pollBackupTasks(backupCtx, inst, pveClient)
|
||
|
||
// Poll storage backups - pass nodes to avoid duplicate API calls
|
||
m.pollStorageBackupsWithNodes(backupCtx, inst, pveClient, nodes, nodeEffectiveStatus)
|
||
|
||
// Poll guest snapshots
|
||
m.pollGuestSnapshots(backupCtx, inst, pveClient)
|
||
|
||
duration := time.Since(startTime)
|
||
log.Info().
|
||
Str("instance", inst).
|
||
Dur("duration", duration).
|
||
Msg("Completed background backup/snapshot polling")
|
||
|
||
// Update timestamp after completion for accurate interval scheduling
|
||
m.mu.Lock()
|
||
m.lastPVEBackupPoll[inst] = time.Now()
|
||
m.mu.Unlock()
|
||
}(now, instanceName, client)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// pollVMsAndContainersEfficient uses the cluster/resources endpoint to get all VMs and containers in one call
|
||
// This works on both clustered and standalone nodes for efficient polling
|
||
// When the instance is part of a cluster, the cluster name is used for guest IDs to prevent duplicates
|
||
// when multiple cluster nodes are configured as separate PVE instances.
|
||
func (m *Monitor) pollVMsAndContainersEfficient(ctx context.Context, instanceName string, clusterName string, isCluster bool, client PVEClientInterface, nodeEffectiveStatus map[string]string) bool {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("clusterName", clusterName).
|
||
Bool("isCluster", isCluster).
|
||
Msg("Polling VMs and containers using efficient cluster/resources endpoint")
|
||
|
||
// Get all resources in a single API call
|
||
resources, err := client.GetClusterResources(ctx, "vm")
|
||
if err != nil {
|
||
log.Debug().Err(err).Str("instance", instanceName).Msg("cluster/resources not available, falling back to traditional polling")
|
||
return false
|
||
}
|
||
|
||
// Seed OCI classification from previous state so we never "downgrade" to LXC
|
||
// if container config fetching intermittently fails (permissions or transient API errors).
|
||
prevState := m.GetState()
|
||
prevContainerIsOCI := make(map[int]bool)
|
||
for _, ct := range prevState.Containers {
|
||
if ct.Instance != instanceName {
|
||
continue
|
||
}
|
||
if ct.VMID <= 0 {
|
||
continue
|
||
}
|
||
if ct.Type == "oci" || ct.IsOCI {
|
||
prevContainerIsOCI[ct.VMID] = true
|
||
}
|
||
}
|
||
|
||
var allVMs []models.VM
|
||
var allContainers []models.Container
|
||
|
||
for _, res := range resources {
|
||
// Generate canonical guest ID: instance:node:vmid
|
||
guestID := makeGuestID(instanceName, res.Node, res.VMID)
|
||
|
||
// Debug log the resource type
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("name", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Str("type", res.Type).
|
||
Msg("Processing cluster resource")
|
||
|
||
// Initialize I/O metrics from cluster resources (may be 0 for VMs)
|
||
diskReadBytes := int64(res.DiskRead)
|
||
diskWriteBytes := int64(res.DiskWrite)
|
||
networkInBytes := int64(res.NetIn)
|
||
networkOutBytes := int64(res.NetOut)
|
||
var individualDisks []models.Disk // Store individual filesystems for multi-disk monitoring
|
||
var ipAddresses []string
|
||
var networkInterfaces []models.GuestNetworkInterface
|
||
var osName, osVersion, agentVersion string
|
||
|
||
if res.Type == "qemu" {
|
||
// Skip templates if configured
|
||
if res.Template == 1 {
|
||
continue
|
||
}
|
||
|
||
memTotal := res.MaxMem
|
||
memUsed := res.Mem
|
||
memorySource := "cluster-resources"
|
||
guestRaw := VMMemoryRaw{
|
||
ListingMem: res.Mem,
|
||
ListingMaxMem: res.MaxMem,
|
||
}
|
||
var detailedStatus *proxmox.VMStatus
|
||
|
||
// Try to get actual disk usage from guest agent if VM is running
|
||
diskUsed := res.Disk
|
||
diskTotal := res.MaxDisk
|
||
diskFree := diskTotal - diskUsed
|
||
diskUsage := safePercentage(float64(diskUsed), float64(diskTotal))
|
||
|
||
// If VM shows 0 disk usage but has allocated disk, it's likely guest agent issue
|
||
// Set to -1 to indicate "unknown" rather than showing misleading 0%
|
||
if res.Type == "qemu" && diskUsed == 0 && diskTotal > 0 && res.Status == "running" {
|
||
diskUsage = -1
|
||
}
|
||
|
||
// For running VMs, always try to get filesystem info from guest agent
|
||
// The cluster/resources endpoint often returns 0 or incorrect values for disk usage
|
||
// We should prefer guest agent data when available for accurate metrics
|
||
if res.Status == "running" && res.Type == "qemu" {
|
||
// First check if agent is enabled by getting VM status
|
||
status, err := client.GetVMStatus(ctx, res.Node, res.VMID)
|
||
if err != nil {
|
||
log.Debug().
|
||
Err(err).
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Msg("Could not get VM status to check guest agent availability")
|
||
} else if status != nil {
|
||
detailedStatus = status
|
||
guestRaw.StatusMaxMem = detailedStatus.MaxMem
|
||
guestRaw.StatusMem = detailedStatus.Mem
|
||
guestRaw.StatusFreeMem = detailedStatus.FreeMem
|
||
guestRaw.Balloon = detailedStatus.Balloon
|
||
guestRaw.BalloonMin = detailedStatus.BalloonMin
|
||
guestRaw.Agent = detailedStatus.Agent.Value
|
||
memAvailable := uint64(0)
|
||
if detailedStatus.MemInfo != nil {
|
||
guestRaw.MemInfoUsed = detailedStatus.MemInfo.Used
|
||
guestRaw.MemInfoFree = detailedStatus.MemInfo.Free
|
||
guestRaw.MemInfoTotal = detailedStatus.MemInfo.Total
|
||
guestRaw.MemInfoAvailable = detailedStatus.MemInfo.Available
|
||
guestRaw.MemInfoBuffers = detailedStatus.MemInfo.Buffers
|
||
guestRaw.MemInfoCached = detailedStatus.MemInfo.Cached
|
||
guestRaw.MemInfoShared = detailedStatus.MemInfo.Shared
|
||
|
||
switch {
|
||
case detailedStatus.MemInfo.Available > 0:
|
||
memAvailable = detailedStatus.MemInfo.Available
|
||
memorySource = "meminfo-available"
|
||
case detailedStatus.MemInfo.Free > 0 ||
|
||
detailedStatus.MemInfo.Buffers > 0 ||
|
||
detailedStatus.MemInfo.Cached > 0:
|
||
memAvailable = detailedStatus.MemInfo.Free +
|
||
detailedStatus.MemInfo.Buffers +
|
||
detailedStatus.MemInfo.Cached
|
||
memorySource = "meminfo-derived"
|
||
}
|
||
}
|
||
|
||
// Use actual disk I/O values from detailed status
|
||
diskReadBytes = int64(detailedStatus.DiskRead)
|
||
diskWriteBytes = int64(detailedStatus.DiskWrite)
|
||
networkInBytes = int64(detailedStatus.NetIn)
|
||
networkOutBytes = int64(detailedStatus.NetOut)
|
||
|
||
// Note: We intentionally do NOT override memTotal with balloon.
|
||
// The balloon value is tracked separately in memory.balloon for
|
||
// visualization purposes. Using balloon as total causes user
|
||
// confusion (showing 1GB/1GB at 100% when VM is configured for 4GB)
|
||
// and makes the frontend's balloon marker logic ineffective.
|
||
// Refs: #1070
|
||
if detailedStatus.MaxMem > 0 {
|
||
memTotal = detailedStatus.MaxMem
|
||
}
|
||
|
||
switch {
|
||
case memAvailable > 0:
|
||
if memAvailable > memTotal {
|
||
memAvailable = memTotal
|
||
}
|
||
memUsed = memTotal - memAvailable
|
||
case detailedStatus.Mem > 0:
|
||
// Prefer Mem over FreeMem: Proxmox calculates Mem as
|
||
// (total_mem - free_mem) using the balloon's guest-visible
|
||
// total, which is correct even when ballooning is active.
|
||
// FreeMem is relative to the balloon allocation (not MaxMem),
|
||
// so subtracting it from MaxMem produces wildly inflated
|
||
// usage when the balloon has reduced the VM's memory.
|
||
// Refs: #1185
|
||
memUsed = detailedStatus.Mem
|
||
memorySource = "status-mem"
|
||
case detailedStatus.FreeMem > 0 && memTotal >= detailedStatus.FreeMem:
|
||
memUsed = memTotal - detailedStatus.FreeMem
|
||
memorySource = "status-freemem"
|
||
}
|
||
if memUsed > memTotal {
|
||
memUsed = memTotal
|
||
}
|
||
|
||
// Gather guest metadata from the agent when available
|
||
guestIPs, guestIfaces, guestOSName, guestOSVersion, guestAgentVersion := m.fetchGuestAgentMetadata(ctx, client, instanceName, res.Node, res.Name, res.VMID, detailedStatus)
|
||
if len(guestIPs) > 0 {
|
||
ipAddresses = guestIPs
|
||
}
|
||
if len(guestIfaces) > 0 {
|
||
networkInterfaces = guestIfaces
|
||
}
|
||
if guestOSName != "" {
|
||
osName = guestOSName
|
||
}
|
||
if guestOSVersion != "" {
|
||
osVersion = guestOSVersion
|
||
}
|
||
if guestAgentVersion != "" {
|
||
agentVersion = guestAgentVersion
|
||
}
|
||
|
||
// Always try to get filesystem info if agent is enabled
|
||
// Prefer guest agent data over cluster/resources data for accuracy
|
||
if detailedStatus.Agent.Value > 0 {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Int("agent", detailedStatus.Agent.Value).
|
||
Uint64("current_disk", diskUsed).
|
||
Uint64("current_maxdisk", diskTotal).
|
||
Msg("Guest agent enabled, querying filesystem info for accurate disk usage")
|
||
|
||
// Use retry logic for guest agent calls to handle transient timeouts (refs #630)
|
||
fsInfoRaw, err := m.retryGuestAgentCall(ctx, m.guestAgentFSInfoTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
|
||
return client.GetVMFSInfo(ctx, res.Node, res.VMID)
|
||
})
|
||
var fsInfo []proxmox.VMFileSystem
|
||
if err == nil {
|
||
if fs, ok := fsInfoRaw.([]proxmox.VMFileSystem); ok {
|
||
fsInfo = fs
|
||
}
|
||
}
|
||
if err != nil {
|
||
// Log more helpful error messages based on the error type
|
||
errMsg := err.Error()
|
||
if strings.Contains(errMsg, "500") || strings.Contains(errMsg, "QEMU guest agent is not running") {
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Msg("Guest agent enabled in VM config but not running inside guest OS. Install and start qemu-guest-agent in the VM")
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Msg("To verify: ssh into VM and run 'systemctl status qemu-guest-agent' or 'ps aux | grep qemu-ga'")
|
||
} else if strings.Contains(errMsg, "timeout") {
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Msg("Guest agent timeout - agent may be installed but not responding")
|
||
} else if strings.Contains(errMsg, "403") || strings.Contains(errMsg, "401") || strings.Contains(errMsg, "authentication error") {
|
||
// Permission error - user/token lacks required permissions
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Msg("VM disk monitoring permission denied. Check permissions:")
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Msg("• Proxmox 9: Ensure token/user has VM.GuestAgent.Audit privilege (Pulse setup adds this via PulseMonitor role)")
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Msg("• Proxmox 8: Ensure token/user has VM.Monitor privilege (Pulse setup adds this via PulseMonitor role)")
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Msg("• All versions: Sys.Audit is recommended for Ceph metrics and applied when available")
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Msg("• Re-run Pulse setup script if node was added before v4.7")
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Msg("• Verify guest agent is installed and running inside the VM")
|
||
} else {
|
||
log.Debug().
|
||
Err(err).
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Msg("Failed to get filesystem info from guest agent")
|
||
}
|
||
} else if len(fsInfo) == 0 {
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Msg("Guest agent returned no filesystem info - agent may need restart or VM may have no mounted filesystems")
|
||
} else {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("filesystems", len(fsInfo)).
|
||
Msg("Got filesystem info from guest agent")
|
||
|
||
// Aggregate disk usage from all filesystems AND preserve individual disk data
|
||
var totalBytes, usedBytes uint64
|
||
var skippedFS []string
|
||
var includedFS []string
|
||
|
||
// Track seen filesystems to dedupe btrfs/zfs subvolumes that share the same pool.
|
||
// These filesystems mount multiple subvolumes from one storage pool, each reporting
|
||
// the same TotalBytes. Without deduplication, we'd sum 11 × 77GB = 851GB instead of 77GB.
|
||
// Key: "fstype:device:totalBytes" or "fstype::totalBytes" if device unknown.
|
||
seenFilesystems := make(map[string]bool)
|
||
|
||
// Log all filesystems received for debugging
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Int("filesystem_count", len(fsInfo)).
|
||
Msg("Processing filesystems from guest agent")
|
||
|
||
for _, fs := range fsInfo {
|
||
// Skip special filesystems and mounts
|
||
shouldSkip, reasons := fsfilters.ShouldSkipFilesystem(fs.Type, fs.Mountpoint, fs.TotalBytes, fs.UsedBytes)
|
||
if shouldSkip {
|
||
// Check if any reason is read-only for detailed logging
|
||
for _, r := range reasons {
|
||
if strings.HasPrefix(r, "read-only-") {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Str("mountpoint", fs.Mountpoint).
|
||
Str("type", fs.Type).
|
||
Float64("total_gb", float64(fs.TotalBytes)/1073741824).
|
||
Float64("used_gb", float64(fs.UsedBytes)/1073741824).
|
||
Msg("Skipping read-only filesystem from disk aggregation")
|
||
break
|
||
}
|
||
}
|
||
skippedFS = append(skippedFS, fmt.Sprintf("%s(%s,%s)",
|
||
fs.Mountpoint, fs.Type, strings.Join(reasons, ",")))
|
||
continue
|
||
}
|
||
|
||
// Only count real filesystems with valid data
|
||
// Some filesystems report 0 bytes (like unformatted or system partitions)
|
||
if fs.TotalBytes > 0 {
|
||
// Deduplication for COW filesystems (btrfs, zfs) that mount multiple
|
||
// subvolumes from the same pool. Each subvolume reports identical TotalBytes
|
||
// because they share the underlying storage pool.
|
||
// Key format: "fstype:device:totalBytes" - if multiple mounts have the same
|
||
// key, they're subvolumes of the same pool and should only be counted once.
|
||
fsTypeLower := strings.ToLower(fs.Type)
|
||
needsDedupe := fsTypeLower == "btrfs" || fsTypeLower == "zfs" ||
|
||
strings.HasPrefix(fsTypeLower, "zfs")
|
||
|
||
countThisFS := true
|
||
if needsDedupe {
|
||
// Use device if available, otherwise fall back to just type+size
|
||
dedupeKey := fmt.Sprintf("%s:%s:%d", fsTypeLower, fs.Disk, fs.TotalBytes)
|
||
if seenFilesystems[dedupeKey] {
|
||
// Already counted this pool - skip adding to totals but still add to
|
||
// individual disks for display purposes
|
||
countThisFS = false
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Str("mountpoint", fs.Mountpoint).
|
||
Str("type", fs.Type).
|
||
Str("device", fs.Disk).
|
||
Uint64("total", fs.TotalBytes).
|
||
Str("dedupe_key", dedupeKey).
|
||
Msg("Skipping duplicate btrfs/zfs subvolume in total calculation")
|
||
} else {
|
||
seenFilesystems[dedupeKey] = true
|
||
}
|
||
}
|
||
|
||
if countThisFS {
|
||
totalBytes += fs.TotalBytes
|
||
usedBytes += fs.UsedBytes
|
||
}
|
||
includedFS = append(includedFS, fmt.Sprintf("%s(%s,%.1fGB)",
|
||
fs.Mountpoint, fs.Type, float64(fs.TotalBytes)/1073741824))
|
||
|
||
// Add to individual disks array (always include for display)
|
||
individualDisks = append(individualDisks, models.Disk{
|
||
Total: int64(fs.TotalBytes),
|
||
Used: int64(fs.UsedBytes),
|
||
Free: int64(fs.TotalBytes - fs.UsedBytes),
|
||
Usage: safePercentage(float64(fs.UsedBytes), float64(fs.TotalBytes)),
|
||
Mountpoint: fs.Mountpoint,
|
||
Type: fs.Type,
|
||
Device: fs.Disk,
|
||
})
|
||
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Str("mountpoint", fs.Mountpoint).
|
||
Str("type", fs.Type).
|
||
Uint64("total", fs.TotalBytes).
|
||
Uint64("used", fs.UsedBytes).
|
||
Float64("total_gb", float64(fs.TotalBytes)/1073741824).
|
||
Float64("used_gb", float64(fs.UsedBytes)/1073741824).
|
||
Bool("counted_in_total", countThisFS).
|
||
Msg("Including filesystem in disk usage calculation")
|
||
} else if fs.TotalBytes == 0 && len(fs.Mountpoint) > 0 {
|
||
skippedFS = append(skippedFS, fmt.Sprintf("%s(%s,0GB)", fs.Mountpoint, fs.Type))
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Str("mountpoint", fs.Mountpoint).
|
||
Str("type", fs.Type).
|
||
Msg("Skipping filesystem with zero total bytes")
|
||
}
|
||
}
|
||
|
||
if len(skippedFS) > 0 {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Strs("skipped", skippedFS).
|
||
Msg("Skipped special filesystems")
|
||
}
|
||
|
||
if len(includedFS) > 0 {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Strs("included", includedFS).
|
||
Msg("Filesystems included in disk calculation")
|
||
}
|
||
|
||
// If we got valid data from guest agent, use it
|
||
if totalBytes > 0 {
|
||
// Sanity check: if the reported disk is way larger than allocated disk,
|
||
// we might be getting host disk info somehow
|
||
allocatedDiskGB := float64(res.MaxDisk) / 1073741824
|
||
reportedDiskGB := float64(totalBytes) / 1073741824
|
||
|
||
// If reported disk is more than 2x the allocated disk, log a warning
|
||
// This could indicate we're getting host disk or network shares
|
||
if allocatedDiskGB > 0 && reportedDiskGB > allocatedDiskGB*2 {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Float64("allocated_gb", allocatedDiskGB).
|
||
Float64("reported_gb", reportedDiskGB).
|
||
Float64("ratio", reportedDiskGB/allocatedDiskGB).
|
||
Strs("filesystems", includedFS).
|
||
Msg("VM reports disk usage significantly larger than allocated disk - possible issue with filesystem detection")
|
||
}
|
||
|
||
diskTotal = totalBytes
|
||
diskUsed = usedBytes
|
||
diskFree = totalBytes - usedBytes
|
||
diskUsage = safePercentage(float64(usedBytes), float64(totalBytes))
|
||
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Uint64("totalBytes", totalBytes).
|
||
Uint64("usedBytes", usedBytes).
|
||
Float64("total_gb", float64(totalBytes)/1073741824).
|
||
Float64("used_gb", float64(usedBytes)/1073741824).
|
||
Float64("allocated_gb", allocatedDiskGB).
|
||
Float64("usage", diskUsage).
|
||
Uint64("old_disk", res.Disk).
|
||
Uint64("old_maxdisk", res.MaxDisk).
|
||
Msg("Using guest agent data for accurate disk usage (replacing cluster/resources data)")
|
||
} else {
|
||
// Only special filesystems found - show allocated disk size instead
|
||
if diskTotal > 0 {
|
||
diskUsage = -1 // Show as allocated size
|
||
}
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("filesystems_found", len(fsInfo)).
|
||
Msg("Guest agent provided filesystem info but no usable filesystems found (all were special mounts)")
|
||
}
|
||
}
|
||
} else {
|
||
// Agent disabled - show allocated disk size
|
||
if diskTotal > 0 {
|
||
diskUsage = -1 // Show as allocated size
|
||
}
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Int("agent", detailedStatus.Agent.Value).
|
||
Msg("VM does not have guest agent enabled in config")
|
||
}
|
||
} else {
|
||
// No vmStatus available - keep cluster/resources data
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("vm", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Msg("Could not get VM status, using cluster/resources disk data")
|
||
}
|
||
}
|
||
|
||
if res.Status != "running" {
|
||
memorySource = "powered-off"
|
||
memUsed = 0
|
||
}
|
||
|
||
memFree := uint64(0)
|
||
if memTotal >= memUsed {
|
||
memFree = memTotal - memUsed
|
||
}
|
||
|
||
sampleTime := time.Now()
|
||
currentMetrics := IOMetrics{
|
||
DiskRead: diskReadBytes,
|
||
DiskWrite: diskWriteBytes,
|
||
NetworkIn: networkInBytes,
|
||
NetworkOut: networkOutBytes,
|
||
Timestamp: sampleTime,
|
||
}
|
||
diskReadRate, diskWriteRate, netInRate, netOutRate := m.rateTracker.CalculateRates(guestID, currentMetrics)
|
||
|
||
memoryUsage := safePercentage(float64(memUsed), float64(memTotal))
|
||
memory := models.Memory{
|
||
Total: int64(memTotal),
|
||
Used: int64(memUsed),
|
||
Free: int64(memFree),
|
||
Usage: memoryUsage,
|
||
}
|
||
if memory.Free < 0 {
|
||
memory.Free = 0
|
||
}
|
||
if memory.Used > memory.Total {
|
||
memory.Used = memory.Total
|
||
}
|
||
if detailedStatus != nil && detailedStatus.Balloon > 0 {
|
||
memory.Balloon = int64(detailedStatus.Balloon)
|
||
}
|
||
|
||
vm := models.VM{
|
||
ID: guestID,
|
||
VMID: res.VMID,
|
||
Name: res.Name,
|
||
Node: res.Node,
|
||
Instance: instanceName,
|
||
Status: res.Status,
|
||
Type: "qemu",
|
||
CPU: safeFloat(res.CPU),
|
||
CPUs: res.MaxCPU,
|
||
Memory: memory,
|
||
Disk: models.Disk{
|
||
Total: int64(diskTotal),
|
||
Used: int64(diskUsed),
|
||
Free: int64(diskFree),
|
||
Usage: diskUsage,
|
||
},
|
||
Disks: individualDisks, // Individual filesystem data
|
||
IPAddresses: ipAddresses,
|
||
OSName: osName,
|
||
OSVersion: osVersion,
|
||
AgentVersion: agentVersion,
|
||
NetworkInterfaces: networkInterfaces,
|
||
NetworkIn: max(0, int64(netInRate)),
|
||
NetworkOut: max(0, int64(netOutRate)),
|
||
DiskRead: max(0, int64(diskReadRate)),
|
||
DiskWrite: max(0, int64(diskWriteRate)),
|
||
Uptime: int64(res.Uptime),
|
||
Template: res.Template == 1,
|
||
LastSeen: sampleTime,
|
||
}
|
||
|
||
// Parse tags
|
||
if res.Tags != "" {
|
||
vm.Tags = strings.Split(res.Tags, ";")
|
||
|
||
// Log if Pulse-specific tags are detected
|
||
for _, tag := range vm.Tags {
|
||
switch tag {
|
||
case "pulse-no-alerts", "pulse-monitor-only", "pulse-relaxed":
|
||
log.Info().
|
||
Str("vm", vm.Name).
|
||
Str("node", vm.Node).
|
||
Str("tag", tag).
|
||
Msg("Pulse control tag detected on VM")
|
||
}
|
||
}
|
||
}
|
||
|
||
// Trigger guest metadata migration if old format exists
|
||
if m.guestMetadataStore != nil {
|
||
m.guestMetadataStore.GetWithLegacyMigration(guestID, instanceName, res.Node, res.VMID)
|
||
}
|
||
|
||
allVMs = append(allVMs, vm)
|
||
|
||
m.recordGuestSnapshot(instanceName, vm.Type, res.Node, res.VMID, GuestMemorySnapshot{
|
||
Name: vm.Name,
|
||
Status: vm.Status,
|
||
RetrievedAt: sampleTime,
|
||
MemorySource: memorySource,
|
||
Memory: vm.Memory,
|
||
Raw: guestRaw,
|
||
})
|
||
|
||
// For non-running VMs, zero out resource usage metrics to prevent false alerts
|
||
// Proxmox may report stale or residual metrics for stopped VMs
|
||
if vm.Status != "running" {
|
||
log.Debug().
|
||
Str("vm", vm.Name).
|
||
Str("status", vm.Status).
|
||
Float64("originalCpu", vm.CPU).
|
||
Float64("originalMemUsage", vm.Memory.Usage).
|
||
Msg("Non-running VM detected - zeroing metrics")
|
||
|
||
// Zero out all usage metrics for stopped/paused/suspended VMs
|
||
vm.CPU = 0
|
||
vm.Memory.Usage = 0
|
||
vm.Disk.Usage = 0
|
||
vm.NetworkIn = 0
|
||
vm.NetworkOut = 0
|
||
vm.DiskRead = 0
|
||
vm.DiskWrite = 0
|
||
}
|
||
|
||
// Check thresholds for alerts
|
||
m.alertManager.CheckGuest(vm, instanceName)
|
||
|
||
} else if res.Type == "lxc" {
|
||
// Skip templates if configured
|
||
if res.Template == 1 {
|
||
continue
|
||
}
|
||
|
||
// Calculate I/O rates for container
|
||
sampleTime := time.Now()
|
||
currentMetrics := IOMetrics{
|
||
DiskRead: int64(res.DiskRead),
|
||
DiskWrite: int64(res.DiskWrite),
|
||
NetworkIn: int64(res.NetIn),
|
||
NetworkOut: int64(res.NetOut),
|
||
Timestamp: sampleTime,
|
||
}
|
||
diskReadRate, diskWriteRate, netInRate, netOutRate := m.rateTracker.CalculateRates(guestID, currentMetrics)
|
||
|
||
// Calculate cache-aware memory for LXC containers
|
||
// The cluster resources API returns mem from cgroup which includes cache/buffers (inflated).
|
||
// Try to get more accurate memory metrics from RRD data.
|
||
memTotal := res.MaxMem
|
||
memUsed := res.Mem
|
||
memorySource := "cluster-resources"
|
||
guestRaw := VMMemoryRaw{
|
||
ListingMem: res.Mem,
|
||
ListingMaxMem: res.MaxMem,
|
||
}
|
||
|
||
// For running containers, try to get RRD data for cache-aware memory calculation
|
||
if res.Status == "running" {
|
||
rrdCtx, rrdCancel := context.WithTimeout(ctx, 5*time.Second)
|
||
rrdPoints, err := client.GetLXCRRDData(rrdCtx, res.Node, res.VMID, "hour", "AVERAGE", []string{"memavailable", "memused", "maxmem"})
|
||
rrdCancel()
|
||
|
||
if err == nil && len(rrdPoints) > 0 {
|
||
// Use the most recent RRD point
|
||
point := rrdPoints[len(rrdPoints)-1]
|
||
|
||
if point.MaxMem != nil && *point.MaxMem > 0 {
|
||
guestRaw.StatusMaxMem = uint64(*point.MaxMem)
|
||
}
|
||
|
||
// Prefer memavailable-based calculation (excludes cache/buffers)
|
||
if point.MemAvailable != nil && *point.MemAvailable > 0 {
|
||
memAvailable := uint64(*point.MemAvailable)
|
||
if memAvailable <= memTotal {
|
||
memUsed = memTotal - memAvailable
|
||
memorySource = "rrd-memavailable"
|
||
guestRaw.MemInfoAvailable = memAvailable
|
||
log.Debug().
|
||
Str("container", res.Name).
|
||
Str("node", res.Node).
|
||
Uint64("total", memTotal).
|
||
Uint64("available", memAvailable).
|
||
Uint64("used", memUsed).
|
||
Float64("usage", safePercentage(float64(memUsed), float64(memTotal))).
|
||
Msg("LXC memory: using RRD memavailable (excludes reclaimable cache)")
|
||
}
|
||
} else if point.MemUsed != nil && *point.MemUsed > 0 {
|
||
// Fall back to memused from RRD if available
|
||
memUsed = uint64(*point.MemUsed)
|
||
if memUsed <= memTotal {
|
||
memorySource = "rrd-memused"
|
||
guestRaw.MemInfoUsed = memUsed
|
||
log.Debug().
|
||
Str("container", res.Name).
|
||
Str("node", res.Node).
|
||
Uint64("total", memTotal).
|
||
Uint64("used", memUsed).
|
||
Float64("usage", safePercentage(float64(memUsed), float64(memTotal))).
|
||
Msg("LXC memory: using RRD memused (excludes reclaimable cache)")
|
||
}
|
||
}
|
||
} else if err != nil {
|
||
log.Debug().
|
||
Err(err).
|
||
Str("instance", instanceName).
|
||
Str("container", res.Name).
|
||
Int("vmid", res.VMID).
|
||
Msg("RRD memory data unavailable for LXC, using cluster resources value")
|
||
}
|
||
}
|
||
|
||
// Clamp memory and disk values to prevent >100% usage
|
||
// (Proxmox can report used > total for LXC due to cgroup accounting,
|
||
// shared pages, or thin-provisioned disk overcommit)
|
||
clampedMemUsed := memUsed
|
||
if clampedMemUsed > memTotal && memTotal > 0 {
|
||
clampedMemUsed = memTotal
|
||
}
|
||
memFree := int64(memTotal) - int64(clampedMemUsed)
|
||
if memFree < 0 {
|
||
memFree = 0
|
||
}
|
||
diskUsed := res.Disk
|
||
if diskUsed > res.MaxDisk && res.MaxDisk > 0 {
|
||
diskUsed = res.MaxDisk
|
||
}
|
||
diskFree := int64(res.MaxDisk) - int64(diskUsed)
|
||
if diskFree < 0 {
|
||
diskFree = 0
|
||
}
|
||
|
||
container := models.Container{
|
||
ID: guestID,
|
||
VMID: res.VMID,
|
||
Name: res.Name,
|
||
Node: res.Node,
|
||
Instance: instanceName,
|
||
Status: res.Status,
|
||
Type: "lxc",
|
||
CPU: safeFloat(res.CPU),
|
||
CPUs: res.MaxCPU,
|
||
Memory: models.Memory{
|
||
Total: int64(memTotal),
|
||
Used: int64(clampedMemUsed),
|
||
Free: memFree,
|
||
Usage: safePercentage(float64(clampedMemUsed), float64(memTotal)),
|
||
},
|
||
Disk: models.Disk{
|
||
Total: int64(res.MaxDisk),
|
||
Used: int64(diskUsed),
|
||
Free: diskFree,
|
||
Usage: safePercentage(float64(diskUsed), float64(res.MaxDisk)),
|
||
},
|
||
NetworkIn: max(0, int64(netInRate)),
|
||
NetworkOut: max(0, int64(netOutRate)),
|
||
DiskRead: max(0, int64(diskReadRate)),
|
||
DiskWrite: max(0, int64(diskWriteRate)),
|
||
Uptime: int64(res.Uptime),
|
||
Template: res.Template == 1,
|
||
LastSeen: time.Now(),
|
||
}
|
||
|
||
if prevContainerIsOCI[container.VMID] {
|
||
container.IsOCI = true
|
||
container.Type = "oci"
|
||
}
|
||
|
||
// Parse tags
|
||
if res.Tags != "" {
|
||
container.Tags = strings.Split(res.Tags, ";")
|
||
|
||
// Log if Pulse-specific tags are detected
|
||
for _, tag := range container.Tags {
|
||
switch tag {
|
||
case "pulse-no-alerts", "pulse-monitor-only", "pulse-relaxed":
|
||
log.Info().
|
||
Str("container", container.Name).
|
||
Str("node", container.Node).
|
||
Str("tag", tag).
|
||
Msg("Pulse control tag detected on container")
|
||
}
|
||
}
|
||
}
|
||
|
||
m.enrichContainerMetadata(ctx, client, instanceName, res.Node, &container)
|
||
|
||
// For non-running containers, zero out resource usage metrics to prevent false alerts.
|
||
// Proxmox may report stale or residual metrics for stopped containers.
|
||
if container.Status != "running" {
|
||
log.Debug().
|
||
Str("container", container.Name).
|
||
Str("status", container.Status).
|
||
Float64("originalCpu", container.CPU).
|
||
Float64("originalMemUsage", container.Memory.Usage).
|
||
Msg("Non-running container detected - zeroing metrics")
|
||
|
||
container.CPU = 0
|
||
container.Memory.Usage = 0
|
||
container.Disk.Usage = 0
|
||
container.NetworkIn = 0
|
||
container.NetworkOut = 0
|
||
container.DiskRead = 0
|
||
container.DiskWrite = 0
|
||
}
|
||
|
||
// Trigger guest metadata migration if old format exists
|
||
if m.guestMetadataStore != nil {
|
||
m.guestMetadataStore.GetWithLegacyMigration(guestID, instanceName, res.Node, res.VMID)
|
||
}
|
||
|
||
allContainers = append(allContainers, container)
|
||
|
||
m.recordGuestSnapshot(instanceName, container.Type, res.Node, res.VMID, GuestMemorySnapshot{
|
||
Name: container.Name,
|
||
Status: container.Status,
|
||
RetrievedAt: sampleTime,
|
||
MemorySource: memorySource,
|
||
Memory: container.Memory,
|
||
Raw: guestRaw,
|
||
})
|
||
|
||
// Check thresholds for alerts
|
||
m.alertManager.CheckGuest(container, instanceName)
|
||
}
|
||
}
|
||
|
||
// Preserve VMs and containers from nodes within grace period
|
||
// The cluster/resources endpoint doesn't return VMs/containers from nodes Proxmox considers offline,
|
||
// but we want to keep showing them if the node is within grace period
|
||
// Count previous resources for this instance
|
||
prevVMCount := 0
|
||
prevContainerCount := 0
|
||
for _, vm := range prevState.VMs {
|
||
if vm.Instance == instanceName {
|
||
prevVMCount++
|
||
}
|
||
}
|
||
for _, container := range prevState.Containers {
|
||
if container.Instance == instanceName {
|
||
prevContainerCount++
|
||
}
|
||
}
|
||
|
||
// Build map of which nodes are covered by current resources
|
||
nodesWithResources := make(map[string]bool)
|
||
for _, res := range resources {
|
||
nodesWithResources[res.Node] = true
|
||
}
|
||
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Int("nodesInResources", len(nodesWithResources)).
|
||
Int("totalVMsFromResources", len(allVMs)).
|
||
Int("totalContainersFromResources", len(allContainers)).
|
||
Int("prevVMs", prevVMCount).
|
||
Int("prevContainers", prevContainerCount).
|
||
Msg("Cluster resources received, checking for grace period preservation")
|
||
|
||
// If we got ZERO resources but had resources before, and we have no node data,
|
||
// this likely means the cluster health check failed. Preserve everything.
|
||
if len(allVMs) == 0 && len(allContainers) == 0 &&
|
||
(prevVMCount > 0 || prevContainerCount > 0) &&
|
||
len(nodeEffectiveStatus) == 0 {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Int("prevVMs", prevVMCount).
|
||
Int("prevContainers", prevContainerCount).
|
||
Msg("Cluster returned zero resources but had resources before - likely cluster health issue, preserving all previous resources")
|
||
|
||
// Preserve all previous VMs and containers for this instance
|
||
for _, vm := range prevState.VMs {
|
||
if vm.Instance == instanceName {
|
||
allVMs = append(allVMs, vm)
|
||
}
|
||
}
|
||
for _, container := range prevState.Containers {
|
||
if container.Instance == instanceName {
|
||
allContainers = append(allContainers, container)
|
||
}
|
||
}
|
||
}
|
||
|
||
// Check for nodes that are within grace period but not in cluster/resources response
|
||
preservedVMCount := 0
|
||
preservedContainerCount := 0
|
||
for nodeName, effectiveStatus := range nodeEffectiveStatus {
|
||
if effectiveStatus == "online" && !nodesWithResources[nodeName] {
|
||
// This node is within grace period but Proxmox didn't return its resources
|
||
// Preserve previous VMs and containers from this node
|
||
vmsBefore := len(allVMs)
|
||
containersBefore := len(allContainers)
|
||
|
||
// Preserve VMs from this node
|
||
for _, vm := range prevState.VMs {
|
||
if vm.Instance == instanceName && vm.Node == nodeName {
|
||
allVMs = append(allVMs, vm)
|
||
}
|
||
}
|
||
|
||
// Preserve containers from this node
|
||
for _, container := range prevState.Containers {
|
||
if container.Instance == instanceName && container.Node == nodeName {
|
||
allContainers = append(allContainers, container)
|
||
}
|
||
}
|
||
|
||
vmsPreserved := len(allVMs) - vmsBefore
|
||
containersPreserved := len(allContainers) - containersBefore
|
||
preservedVMCount += vmsPreserved
|
||
preservedContainerCount += containersPreserved
|
||
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("node", nodeName).
|
||
Int("vmsPreserved", vmsPreserved).
|
||
Int("containersPreserved", containersPreserved).
|
||
Msg("Preserved VMs/containers from node in grace period")
|
||
}
|
||
}
|
||
|
||
if preservedVMCount > 0 || preservedContainerCount > 0 {
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Int("totalPreservedVMs", preservedVMCount).
|
||
Int("totalPreservedContainers", preservedContainerCount).
|
||
Msg("Grace period preservation complete")
|
||
}
|
||
|
||
// Always update state when using efficient polling path
|
||
// Even if arrays are empty, we need to update to clear out VMs from genuinely offline nodes
|
||
m.state.UpdateVMsForInstance(instanceName, allVMs)
|
||
|
||
// Check Docker presence for containers that need it (new, restarted, started)
|
||
allContainers = m.CheckContainersForDocker(ctx, allContainers)
|
||
|
||
m.state.UpdateContainersForInstance(instanceName, allContainers)
|
||
|
||
// Record guest metrics history for running guests (enables sparkline/trends view)
|
||
now := time.Now()
|
||
for _, vm := range allVMs {
|
||
if vm.Status == "running" {
|
||
m.metricsHistory.AddGuestMetric(vm.ID, "cpu", vm.CPU*100, now)
|
||
m.metricsHistory.AddGuestMetric(vm.ID, "memory", vm.Memory.Usage, now)
|
||
if vm.Disk.Usage >= 0 {
|
||
m.metricsHistory.AddGuestMetric(vm.ID, "disk", vm.Disk.Usage, now)
|
||
}
|
||
if vm.DiskRead >= 0 {
|
||
m.metricsHistory.AddGuestMetric(vm.ID, "diskread", float64(vm.DiskRead), now)
|
||
}
|
||
if vm.DiskWrite >= 0 {
|
||
m.metricsHistory.AddGuestMetric(vm.ID, "diskwrite", float64(vm.DiskWrite), now)
|
||
}
|
||
if vm.NetworkIn >= 0 {
|
||
m.metricsHistory.AddGuestMetric(vm.ID, "netin", float64(vm.NetworkIn), now)
|
||
}
|
||
if vm.NetworkOut >= 0 {
|
||
m.metricsHistory.AddGuestMetric(vm.ID, "netout", float64(vm.NetworkOut), now)
|
||
}
|
||
// Also write to persistent store
|
||
if m.metricsStore != nil {
|
||
m.metricsStore.Write("vm", vm.ID, "cpu", vm.CPU*100, now)
|
||
m.metricsStore.Write("vm", vm.ID, "memory", vm.Memory.Usage, now)
|
||
if vm.Disk.Usage >= 0 {
|
||
m.metricsStore.Write("vm", vm.ID, "disk", vm.Disk.Usage, now)
|
||
}
|
||
if vm.DiskRead >= 0 {
|
||
m.metricsStore.Write("vm", vm.ID, "diskread", float64(vm.DiskRead), now)
|
||
}
|
||
if vm.DiskWrite >= 0 {
|
||
m.metricsStore.Write("vm", vm.ID, "diskwrite", float64(vm.DiskWrite), now)
|
||
}
|
||
if vm.NetworkIn >= 0 {
|
||
m.metricsStore.Write("vm", vm.ID, "netin", float64(vm.NetworkIn), now)
|
||
}
|
||
if vm.NetworkOut >= 0 {
|
||
m.metricsStore.Write("vm", vm.ID, "netout", float64(vm.NetworkOut), now)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
for _, ct := range allContainers {
|
||
if ct.Status == "running" {
|
||
m.metricsHistory.AddGuestMetric(ct.ID, "cpu", ct.CPU*100, now)
|
||
m.metricsHistory.AddGuestMetric(ct.ID, "memory", ct.Memory.Usage, now)
|
||
if ct.Disk.Usage >= 0 {
|
||
m.metricsHistory.AddGuestMetric(ct.ID, "disk", ct.Disk.Usage, now)
|
||
}
|
||
if ct.DiskRead >= 0 {
|
||
m.metricsHistory.AddGuestMetric(ct.ID, "diskread", float64(ct.DiskRead), now)
|
||
}
|
||
if ct.DiskWrite >= 0 {
|
||
m.metricsHistory.AddGuestMetric(ct.ID, "diskwrite", float64(ct.DiskWrite), now)
|
||
}
|
||
if ct.NetworkIn >= 0 {
|
||
m.metricsHistory.AddGuestMetric(ct.ID, "netin", float64(ct.NetworkIn), now)
|
||
}
|
||
if ct.NetworkOut >= 0 {
|
||
m.metricsHistory.AddGuestMetric(ct.ID, "netout", float64(ct.NetworkOut), now)
|
||
}
|
||
// Also write to persistent store
|
||
if m.metricsStore != nil {
|
||
m.metricsStore.Write("container", ct.ID, "cpu", ct.CPU*100, now)
|
||
m.metricsStore.Write("container", ct.ID, "memory", ct.Memory.Usage, now)
|
||
if ct.Disk.Usage >= 0 {
|
||
m.metricsStore.Write("container", ct.ID, "disk", ct.Disk.Usage, now)
|
||
}
|
||
if ct.DiskRead >= 0 {
|
||
m.metricsStore.Write("container", ct.ID, "diskread", float64(ct.DiskRead), now)
|
||
}
|
||
if ct.DiskWrite >= 0 {
|
||
m.metricsStore.Write("container", ct.ID, "diskwrite", float64(ct.DiskWrite), now)
|
||
}
|
||
if ct.NetworkIn >= 0 {
|
||
m.metricsStore.Write("container", ct.ID, "netin", float64(ct.NetworkIn), now)
|
||
}
|
||
if ct.NetworkOut >= 0 {
|
||
m.metricsStore.Write("container", ct.ID, "netout", float64(ct.NetworkOut), now)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
m.pollReplicationStatus(ctx, instanceName, client, allVMs)
|
||
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Int("vms", len(allVMs)).
|
||
Int("containers", len(allContainers)).
|
||
Msg("VMs and containers polled efficiently with cluster/resources")
|
||
|
||
return true
|
||
}
|
||
|
||
// pollBackupTasks polls backup tasks from a PVE instance
|
||
func (m *Monitor) pollBackupTasks(ctx context.Context, instanceName string, client PVEClientInterface) {
|
||
log.Debug().Str("instance", instanceName).Msg("Polling backup tasks")
|
||
|
||
tasks, err := client.GetBackupTasks(ctx)
|
||
if err != nil {
|
||
monErr := errors.WrapAPIError("get_backup_tasks", instanceName, err, 0)
|
||
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to get backup tasks")
|
||
return
|
||
}
|
||
|
||
var backupTasks []models.BackupTask
|
||
for _, task := range tasks {
|
||
// Extract VMID from task ID (format: "UPID:node:pid:starttime:type:vmid:user@realm:")
|
||
vmid := 0
|
||
if task.ID != "" {
|
||
if vmidInt, err := strconv.Atoi(task.ID); err == nil {
|
||
vmid = vmidInt
|
||
}
|
||
}
|
||
|
||
taskID := fmt.Sprintf("%s-%s", instanceName, task.UPID)
|
||
|
||
backupTask := models.BackupTask{
|
||
ID: taskID,
|
||
Node: task.Node,
|
||
Instance: instanceName,
|
||
Type: task.Type,
|
||
VMID: vmid,
|
||
Status: task.Status,
|
||
StartTime: time.Unix(task.StartTime, 0),
|
||
}
|
||
|
||
if task.EndTime > 0 {
|
||
backupTask.EndTime = time.Unix(task.EndTime, 0)
|
||
}
|
||
|
||
backupTasks = append(backupTasks, backupTask)
|
||
}
|
||
|
||
// Update state with new backup tasks for this instance
|
||
m.state.UpdateBackupTasksForInstance(instanceName, backupTasks)
|
||
}
|
||
|
||
// pollReplicationStatus polls storage replication jobs for a PVE instance.
|
||
func (m *Monitor) pollReplicationStatus(ctx context.Context, instanceName string, client PVEClientInterface, vms []models.VM) {
|
||
log.Debug().Str("instance", instanceName).Msg("Polling replication status")
|
||
|
||
jobs, err := client.GetReplicationStatus(ctx)
|
||
if err != nil {
|
||
errMsg := err.Error()
|
||
lowerMsg := strings.ToLower(errMsg)
|
||
if strings.Contains(errMsg, "501") || strings.Contains(errMsg, "404") || strings.Contains(lowerMsg, "not implemented") || strings.Contains(lowerMsg, "not supported") {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Msg("Replication API not available on this Proxmox instance")
|
||
m.state.UpdateReplicationJobsForInstance(instanceName, []models.ReplicationJob{})
|
||
return
|
||
}
|
||
|
||
monErr := errors.WrapAPIError("get_replication_status", instanceName, err, 0)
|
||
log.Warn().
|
||
Err(monErr).
|
||
Str("instance", instanceName).
|
||
Msg("Failed to get replication status")
|
||
return
|
||
}
|
||
|
||
if len(jobs) == 0 {
|
||
m.state.UpdateReplicationJobsForInstance(instanceName, []models.ReplicationJob{})
|
||
return
|
||
}
|
||
|
||
vmByID := make(map[int]models.VM, len(vms))
|
||
for _, vm := range vms {
|
||
vmByID[vm.VMID] = vm
|
||
}
|
||
|
||
converted := make([]models.ReplicationJob, 0, len(jobs))
|
||
now := time.Now()
|
||
|
||
for idx, job := range jobs {
|
||
guestID := job.GuestID
|
||
if guestID == 0 {
|
||
if parsed, err := strconv.Atoi(strings.TrimSpace(job.Guest)); err == nil {
|
||
guestID = parsed
|
||
}
|
||
}
|
||
|
||
guestName := ""
|
||
guestType := ""
|
||
guestNode := ""
|
||
if guestID > 0 {
|
||
if vm, ok := vmByID[guestID]; ok {
|
||
guestName = vm.Name
|
||
guestType = vm.Type
|
||
guestNode = vm.Node
|
||
}
|
||
}
|
||
if guestNode == "" {
|
||
guestNode = strings.TrimSpace(job.Source)
|
||
}
|
||
|
||
sourceNode := strings.TrimSpace(job.Source)
|
||
if sourceNode == "" {
|
||
sourceNode = guestNode
|
||
}
|
||
|
||
targetNode := strings.TrimSpace(job.Target)
|
||
|
||
var lastSyncTime *time.Time
|
||
if job.LastSyncTime != nil && !job.LastSyncTime.IsZero() {
|
||
t := job.LastSyncTime.UTC()
|
||
lastSyncTime = &t
|
||
}
|
||
|
||
var nextSyncTime *time.Time
|
||
if job.NextSyncTime != nil && !job.NextSyncTime.IsZero() {
|
||
t := job.NextSyncTime.UTC()
|
||
nextSyncTime = &t
|
||
}
|
||
|
||
lastSyncDurationHuman := job.LastSyncDurationHuman
|
||
if lastSyncDurationHuman == "" && job.LastSyncDurationSeconds > 0 {
|
||
lastSyncDurationHuman = formatSeconds(job.LastSyncDurationSeconds)
|
||
}
|
||
durationHuman := job.DurationHuman
|
||
if durationHuman == "" && job.DurationSeconds > 0 {
|
||
durationHuman = formatSeconds(job.DurationSeconds)
|
||
}
|
||
|
||
rateLimit := copyFloatPointer(job.RateLimitMbps)
|
||
|
||
status := job.Status
|
||
if status == "" {
|
||
status = job.State
|
||
}
|
||
|
||
jobID := strings.TrimSpace(job.ID)
|
||
if jobID == "" {
|
||
if job.JobNumber > 0 && guestID > 0 {
|
||
jobID = fmt.Sprintf("%d-%d", guestID, job.JobNumber)
|
||
} else {
|
||
jobID = fmt.Sprintf("job-%s-%d", instanceName, idx)
|
||
}
|
||
}
|
||
|
||
uniqueID := fmt.Sprintf("%s-%s", instanceName, jobID)
|
||
|
||
converted = append(converted, models.ReplicationJob{
|
||
ID: uniqueID,
|
||
Instance: instanceName,
|
||
JobID: jobID,
|
||
JobNumber: job.JobNumber,
|
||
Guest: job.Guest,
|
||
GuestID: guestID,
|
||
GuestName: guestName,
|
||
GuestType: guestType,
|
||
GuestNode: guestNode,
|
||
SourceNode: sourceNode,
|
||
SourceStorage: job.SourceStorage,
|
||
TargetNode: targetNode,
|
||
TargetStorage: job.TargetStorage,
|
||
Schedule: job.Schedule,
|
||
Type: job.Type,
|
||
Enabled: job.Enabled,
|
||
State: job.State,
|
||
Status: status,
|
||
LastSyncStatus: job.LastSyncStatus,
|
||
LastSyncTime: lastSyncTime,
|
||
LastSyncUnix: job.LastSyncUnix,
|
||
LastSyncDurationSeconds: job.LastSyncDurationSeconds,
|
||
LastSyncDurationHuman: lastSyncDurationHuman,
|
||
NextSyncTime: nextSyncTime,
|
||
NextSyncUnix: job.NextSyncUnix,
|
||
DurationSeconds: job.DurationSeconds,
|
||
DurationHuman: durationHuman,
|
||
FailCount: job.FailCount,
|
||
Error: job.Error,
|
||
Comment: job.Comment,
|
||
RemoveJob: job.RemoveJob,
|
||
RateLimitMbps: rateLimit,
|
||
LastPolled: now,
|
||
})
|
||
}
|
||
|
||
m.state.UpdateReplicationJobsForInstance(instanceName, converted)
|
||
}
|
||
|
||
func formatSeconds(total int) string {
|
||
if total <= 0 {
|
||
return ""
|
||
}
|
||
hours := total / 3600
|
||
minutes := (total % 3600) / 60
|
||
seconds := total % 60
|
||
return fmt.Sprintf("%02d:%02d:%02d", hours, minutes, seconds)
|
||
}
|
||
|
||
func copyFloatPointer(src *float64) *float64 {
|
||
if src == nil {
|
||
return nil
|
||
}
|
||
val := *src
|
||
return &val
|
||
}
|
||
|
||
// matchesDatastoreExclude checks if a datastore name matches any exclusion pattern.
|
||
// Patterns can be exact names or wildcards (* for any characters).
|
||
// Examples: "exthdd*" matches "exthdd1500gb", "*backup*" matches "my-backup-store"
|
||
func matchesDatastoreExclude(datastoreName string, excludePatterns []string) bool {
|
||
if len(excludePatterns) == 0 {
|
||
return false
|
||
}
|
||
|
||
for _, pattern := range excludePatterns {
|
||
pattern = strings.TrimSpace(pattern)
|
||
if pattern == "" {
|
||
continue
|
||
}
|
||
|
||
// Contains pattern: *substring*
|
||
if strings.HasPrefix(pattern, "*") && strings.HasSuffix(pattern, "*") && len(pattern) > 2 {
|
||
substring := strings.ToLower(pattern[1 : len(pattern)-1])
|
||
if strings.Contains(strings.ToLower(datastoreName), substring) {
|
||
return true
|
||
}
|
||
continue
|
||
}
|
||
|
||
// Suffix pattern: *suffix
|
||
if strings.HasPrefix(pattern, "*") && len(pattern) > 1 {
|
||
suffix := strings.ToLower(pattern[1:])
|
||
if strings.HasSuffix(strings.ToLower(datastoreName), suffix) {
|
||
return true
|
||
}
|
||
continue
|
||
}
|
||
|
||
// Prefix pattern: prefix*
|
||
if strings.HasSuffix(pattern, "*") && len(pattern) > 1 {
|
||
prefix := strings.ToLower(pattern[:len(pattern)-1])
|
||
if strings.HasPrefix(strings.ToLower(datastoreName), prefix) {
|
||
return true
|
||
}
|
||
continue
|
||
}
|
||
|
||
// Exact match (case-insensitive)
|
||
if strings.EqualFold(pattern, datastoreName) {
|
||
return true
|
||
}
|
||
}
|
||
|
||
return false
|
||
}
|
||
|
||
// pollPBSInstance polls a single PBS instance
|
||
func (m *Monitor) pollPBSInstance(ctx context.Context, instanceName string, client *pbs.Client) {
|
||
defer recoverFromPanic(fmt.Sprintf("pollPBSInstance-%s", instanceName))
|
||
|
||
start := time.Now()
|
||
debugEnabled := logging.IsLevelEnabled(zerolog.DebugLevel)
|
||
var pollErr error
|
||
if m.pollMetrics != nil {
|
||
m.pollMetrics.IncInFlight("pbs")
|
||
defer m.pollMetrics.DecInFlight("pbs")
|
||
defer func() {
|
||
m.pollMetrics.RecordResult(PollResult{
|
||
InstanceName: instanceName,
|
||
InstanceType: "pbs",
|
||
Success: pollErr == nil,
|
||
Error: pollErr,
|
||
StartTime: start,
|
||
EndTime: time.Now(),
|
||
})
|
||
}()
|
||
}
|
||
if m.stalenessTracker != nil {
|
||
defer func() {
|
||
if pollErr == nil {
|
||
m.stalenessTracker.UpdateSuccess(InstanceTypePBS, instanceName, nil)
|
||
} else {
|
||
m.stalenessTracker.UpdateError(InstanceTypePBS, instanceName)
|
||
}
|
||
}()
|
||
}
|
||
defer m.recordTaskResult(InstanceTypePBS, instanceName, pollErr)
|
||
|
||
// Check if context is cancelled
|
||
select {
|
||
case <-ctx.Done():
|
||
pollErr = ctx.Err()
|
||
if debugEnabled {
|
||
log.Debug().Str("instance", instanceName).Msg("Polling cancelled")
|
||
}
|
||
return
|
||
default:
|
||
}
|
||
|
||
if debugEnabled {
|
||
log.Debug().Str("instance", instanceName).Msg("Polling PBS instance")
|
||
}
|
||
|
||
// Get instance config
|
||
var instanceCfg *config.PBSInstance
|
||
for _, cfg := range m.config.PBSInstances {
|
||
if cfg.Name == instanceName {
|
||
instanceCfg = &cfg
|
||
if debugEnabled {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Bool("monitorDatastores", cfg.MonitorDatastores).
|
||
Msg("Found PBS instance config")
|
||
}
|
||
break
|
||
}
|
||
}
|
||
if instanceCfg == nil {
|
||
log.Error().Str("instance", instanceName).Msg("PBS instance config not found")
|
||
return
|
||
}
|
||
|
||
// Initialize PBS instance with default values
|
||
pbsInst := models.PBSInstance{
|
||
ID: "pbs-" + instanceName,
|
||
Name: instanceName,
|
||
Host: instanceCfg.Host,
|
||
GuestURL: instanceCfg.GuestURL,
|
||
Status: "offline",
|
||
Version: "unknown",
|
||
ConnectionHealth: "unhealthy",
|
||
LastSeen: time.Now(),
|
||
}
|
||
|
||
// Try to get version first
|
||
version, versionErr := client.GetVersion(ctx)
|
||
if versionErr == nil {
|
||
pbsInst.Status = "online"
|
||
pbsInst.Version = version.Version
|
||
pbsInst.ConnectionHealth = "healthy"
|
||
m.resetAuthFailures(instanceName, "pbs")
|
||
m.state.SetConnectionHealth("pbs-"+instanceName, true)
|
||
|
||
if debugEnabled {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("version", version.Version).
|
||
Bool("monitorDatastores", instanceCfg.MonitorDatastores).
|
||
Msg("PBS version retrieved successfully")
|
||
}
|
||
} else {
|
||
if debugEnabled {
|
||
log.Debug().Err(versionErr).Str("instance", instanceName).Msg("Failed to get PBS version, trying fallback")
|
||
}
|
||
|
||
// Use parent context for proper cancellation chain
|
||
ctx2, cancel2 := context.WithTimeout(ctx, 10*time.Second)
|
||
defer cancel2()
|
||
_, datastoreErr := client.GetDatastores(ctx2)
|
||
if datastoreErr == nil {
|
||
pbsInst.Status = "online"
|
||
pbsInst.Version = "connected"
|
||
pbsInst.ConnectionHealth = "healthy"
|
||
m.resetAuthFailures(instanceName, "pbs")
|
||
m.state.SetConnectionHealth("pbs-"+instanceName, true)
|
||
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Msg("PBS connected (version unavailable but datastores accessible)")
|
||
} else {
|
||
pbsInst.Status = "offline"
|
||
pbsInst.ConnectionHealth = "error"
|
||
monErr := errors.WrapConnectionError("get_pbs_version", instanceName, versionErr)
|
||
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to connect to PBS")
|
||
m.state.SetConnectionHealth("pbs-"+instanceName, false)
|
||
|
||
if errors.IsAuthError(versionErr) || errors.IsAuthError(datastoreErr) {
|
||
m.recordAuthFailure(instanceName, "pbs")
|
||
return
|
||
}
|
||
}
|
||
}
|
||
|
||
// Get node status (CPU, memory, etc.)
|
||
nodeStatus, err := client.GetNodeStatus(ctx)
|
||
if err != nil {
|
||
if debugEnabled {
|
||
log.Debug().Err(err).Str("instance", instanceName).Msg("Could not get PBS node status (may need Sys.Audit permission)")
|
||
}
|
||
} else if nodeStatus != nil {
|
||
pbsInst.CPU = nodeStatus.CPU
|
||
if nodeStatus.Memory.Total > 0 {
|
||
pbsInst.Memory = float64(nodeStatus.Memory.Used) / float64(nodeStatus.Memory.Total) * 100
|
||
pbsInst.MemoryUsed = nodeStatus.Memory.Used
|
||
pbsInst.MemoryTotal = nodeStatus.Memory.Total
|
||
}
|
||
pbsInst.Uptime = nodeStatus.Uptime
|
||
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Float64("cpu", pbsInst.CPU).
|
||
Float64("memory", pbsInst.Memory).
|
||
Int64("uptime", pbsInst.Uptime).
|
||
Msg("PBS node status retrieved")
|
||
}
|
||
|
||
// Poll datastores if enabled
|
||
if instanceCfg.MonitorDatastores {
|
||
datastores, err := client.GetDatastores(ctx)
|
||
if err != nil {
|
||
monErr := errors.WrapAPIError("get_datastores", instanceName, err, 0)
|
||
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to get datastores")
|
||
} else {
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Int("count", len(datastores)).
|
||
Msg("Got PBS datastores")
|
||
|
||
for _, ds := range datastores {
|
||
// Skip excluded datastores (for removable/unmounted datastores)
|
||
if matchesDatastoreExclude(ds.Store, instanceCfg.ExcludeDatastores) {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("datastore", ds.Store).
|
||
Msg("Skipping excluded datastore")
|
||
continue
|
||
}
|
||
total := ds.Total
|
||
if total == 0 && ds.TotalSpace > 0 {
|
||
total = ds.TotalSpace
|
||
}
|
||
used := ds.Used
|
||
if used == 0 && ds.UsedSpace > 0 {
|
||
used = ds.UsedSpace
|
||
}
|
||
avail := ds.Avail
|
||
if avail == 0 && ds.AvailSpace > 0 {
|
||
avail = ds.AvailSpace
|
||
}
|
||
if total == 0 && used > 0 && avail > 0 {
|
||
total = used + avail
|
||
}
|
||
|
||
log.Debug().
|
||
Str("store", ds.Store).
|
||
Int64("total", total).
|
||
Int64("used", used).
|
||
Int64("avail", avail).
|
||
Int64("orig_total", ds.Total).
|
||
Int64("orig_total_space", ds.TotalSpace).
|
||
Msg("PBS datastore details")
|
||
|
||
modelDS := models.PBSDatastore{
|
||
Name: ds.Store,
|
||
Total: total,
|
||
Used: used,
|
||
Free: avail,
|
||
Usage: safePercentage(float64(used), float64(total)),
|
||
Status: "available",
|
||
DeduplicationFactor: ds.DeduplicationFactor,
|
||
}
|
||
|
||
namespaces, err := client.ListNamespaces(ctx, ds.Store, "", 0)
|
||
if err != nil {
|
||
log.Warn().Err(err).
|
||
Str("instance", instanceName).
|
||
Str("datastore", ds.Store).
|
||
Msg("Failed to list namespaces")
|
||
} else {
|
||
for _, ns := range namespaces {
|
||
nsPath := ns.NS
|
||
if nsPath == "" {
|
||
nsPath = ns.Path
|
||
}
|
||
if nsPath == "" {
|
||
nsPath = ns.Name
|
||
}
|
||
|
||
modelNS := models.PBSNamespace{
|
||
Path: nsPath,
|
||
Parent: ns.Parent,
|
||
Depth: strings.Count(nsPath, "/"),
|
||
}
|
||
modelDS.Namespaces = append(modelDS.Namespaces, modelNS)
|
||
}
|
||
|
||
hasRoot := false
|
||
for _, ns := range modelDS.Namespaces {
|
||
if ns.Path == "" {
|
||
hasRoot = true
|
||
break
|
||
}
|
||
}
|
||
if !hasRoot {
|
||
modelDS.Namespaces = append([]models.PBSNamespace{{Path: "", Depth: 0}}, modelDS.Namespaces...)
|
||
}
|
||
}
|
||
|
||
pbsInst.Datastores = append(pbsInst.Datastores, modelDS)
|
||
}
|
||
}
|
||
}
|
||
|
||
// Update state and run alerts
|
||
m.state.UpdatePBSInstance(pbsInst)
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("id", pbsInst.ID).
|
||
Int("datastores", len(pbsInst.Datastores)).
|
||
Msg("PBS instance updated in state")
|
||
|
||
// Convert PBS datastores to Storage entries for unified storage view
|
||
if len(pbsInst.Datastores) > 0 && instanceCfg.MonitorDatastores {
|
||
var pbsStorages []models.Storage
|
||
for _, ds := range pbsInst.Datastores {
|
||
// Create a storage entry for this PBS datastore
|
||
storageID := fmt.Sprintf("pbs-%s-%s", instanceName, ds.Name)
|
||
pbsStorage := models.Storage{
|
||
ID: storageID,
|
||
Name: ds.Name,
|
||
Node: instanceName, // Use PBS instance name as "node"
|
||
Instance: "pbs-" + instanceName,
|
||
Type: "pbs",
|
||
Status: ds.Status,
|
||
Total: ds.Total,
|
||
Used: ds.Used,
|
||
Free: ds.Free,
|
||
Usage: ds.Usage,
|
||
Content: "backup", // PBS datastores are for backups
|
||
Shared: true, // PBS datastores are typically shared/network storage
|
||
Enabled: true,
|
||
Active: pbsInst.Status == "online",
|
||
}
|
||
pbsStorages = append(pbsStorages, pbsStorage)
|
||
}
|
||
m.state.UpdateStorageForInstance("pbs-"+instanceName, pbsStorages)
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Int("storageEntries", len(pbsStorages)).
|
||
Msg("Added PBS datastores to unified storage view")
|
||
}
|
||
|
||
if m.alertManager != nil {
|
||
m.alertManager.CheckPBS(pbsInst)
|
||
}
|
||
|
||
// Poll backups if enabled
|
||
if instanceCfg.MonitorBackups {
|
||
if len(pbsInst.Datastores) == 0 {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Msg("No PBS datastores available for backup polling")
|
||
} else if !m.config.EnableBackupPolling {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Msg("Skipping PBS backup polling - globally disabled")
|
||
} else {
|
||
now := time.Now()
|
||
|
||
m.mu.Lock()
|
||
lastPoll := m.lastPBSBackupPoll[instanceName]
|
||
if m.pbsBackupPollers == nil {
|
||
m.pbsBackupPollers = make(map[string]bool)
|
||
}
|
||
inProgress := m.pbsBackupPollers[instanceName]
|
||
m.mu.Unlock()
|
||
|
||
shouldPoll, reason, newLast := m.shouldRunBackupPoll(lastPoll, now)
|
||
if !shouldPoll {
|
||
if reason != "" {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Str("reason", reason).
|
||
Msg("Skipping PBS backup polling this cycle")
|
||
}
|
||
} else if inProgress {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Msg("PBS backup polling already in progress")
|
||
} else {
|
||
datastoreSnapshot := make([]models.PBSDatastore, len(pbsInst.Datastores))
|
||
copy(datastoreSnapshot, pbsInst.Datastores)
|
||
|
||
// Atomically check and set poller flag
|
||
m.mu.Lock()
|
||
if m.pbsBackupPollers[instanceName] {
|
||
// Race: another goroutine started between our check and lock
|
||
m.mu.Unlock()
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Msg("PBS backup polling started by another goroutine")
|
||
} else {
|
||
m.pbsBackupPollers[instanceName] = true
|
||
m.lastPBSBackupPoll[instanceName] = newLast
|
||
m.mu.Unlock()
|
||
|
||
go func(ds []models.PBSDatastore, inst string, start time.Time, pbsClient *pbs.Client) {
|
||
defer recoverFromPanic(fmt.Sprintf("pollPBSBackups-%s", inst))
|
||
defer func() {
|
||
m.mu.Lock()
|
||
delete(m.pbsBackupPollers, inst)
|
||
m.lastPBSBackupPoll[inst] = time.Now()
|
||
m.mu.Unlock()
|
||
}()
|
||
|
||
log.Info().
|
||
Str("instance", inst).
|
||
Int("datastores", len(ds)).
|
||
Msg("Starting background PBS backup polling")
|
||
|
||
// The per-cycle ctx is canceled as soon as the main polling loop finishes,
|
||
// so derive the backup poll context from the long-lived runtime context instead.
|
||
parentCtx := m.runtimeCtx
|
||
if parentCtx == nil {
|
||
parentCtx = context.Background()
|
||
}
|
||
backupCtx, cancel := context.WithTimeout(parentCtx, 5*time.Minute)
|
||
defer cancel()
|
||
|
||
m.pollPBSBackups(backupCtx, inst, pbsClient, ds)
|
||
|
||
log.Info().
|
||
Str("instance", inst).
|
||
Dur("duration", time.Since(start)).
|
||
Msg("Completed background PBS backup polling")
|
||
}(datastoreSnapshot, instanceName, now, client)
|
||
}
|
||
}
|
||
}
|
||
} else {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Msg("PBS backup monitoring disabled")
|
||
}
|
||
}
|
||
|
||
// pollPMGInstance polls a single Proxmox Mail Gateway instance
|
||
func (m *Monitor) pollPMGInstance(ctx context.Context, instanceName string, client *pmg.Client) {
|
||
defer recoverFromPanic(fmt.Sprintf("pollPMGInstance-%s", instanceName))
|
||
|
||
start := time.Now()
|
||
debugEnabled := logging.IsLevelEnabled(zerolog.DebugLevel)
|
||
var pollErr error
|
||
if m.pollMetrics != nil {
|
||
m.pollMetrics.IncInFlight("pmg")
|
||
defer m.pollMetrics.DecInFlight("pmg")
|
||
defer func() {
|
||
m.pollMetrics.RecordResult(PollResult{
|
||
InstanceName: instanceName,
|
||
InstanceType: "pmg",
|
||
Success: pollErr == nil,
|
||
Error: pollErr,
|
||
StartTime: start,
|
||
EndTime: time.Now(),
|
||
})
|
||
}()
|
||
}
|
||
if m.stalenessTracker != nil {
|
||
defer func() {
|
||
if pollErr == nil {
|
||
m.stalenessTracker.UpdateSuccess(InstanceTypePMG, instanceName, nil)
|
||
} else {
|
||
m.stalenessTracker.UpdateError(InstanceTypePMG, instanceName)
|
||
}
|
||
}()
|
||
}
|
||
defer m.recordTaskResult(InstanceTypePMG, instanceName, pollErr)
|
||
|
||
select {
|
||
case <-ctx.Done():
|
||
pollErr = ctx.Err()
|
||
if debugEnabled {
|
||
log.Debug().Str("instance", instanceName).Msg("PMG polling cancelled by context")
|
||
}
|
||
return
|
||
default:
|
||
}
|
||
|
||
if debugEnabled {
|
||
log.Debug().Str("instance", instanceName).Msg("Polling PMG instance")
|
||
}
|
||
|
||
var instanceCfg *config.PMGInstance
|
||
for idx := range m.config.PMGInstances {
|
||
if m.config.PMGInstances[idx].Name == instanceName {
|
||
instanceCfg = &m.config.PMGInstances[idx]
|
||
break
|
||
}
|
||
}
|
||
|
||
if instanceCfg == nil {
|
||
log.Error().Str("instance", instanceName).Msg("PMG instance config not found")
|
||
pollErr = fmt.Errorf("pmg instance config not found for %s", instanceName)
|
||
return
|
||
}
|
||
|
||
now := time.Now()
|
||
pmgInst := models.PMGInstance{
|
||
ID: "pmg-" + instanceName,
|
||
Name: instanceName,
|
||
Host: instanceCfg.Host,
|
||
GuestURL: instanceCfg.GuestURL,
|
||
Status: "offline",
|
||
ConnectionHealth: "unhealthy",
|
||
LastSeen: now,
|
||
LastUpdated: now,
|
||
}
|
||
|
||
version, err := client.GetVersion(ctx)
|
||
if err != nil {
|
||
monErr := errors.WrapConnectionError("pmg_get_version", instanceName, err)
|
||
pollErr = monErr
|
||
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to connect to PMG instance")
|
||
m.state.SetConnectionHealth("pmg-"+instanceName, false)
|
||
m.state.UpdatePMGInstance(pmgInst)
|
||
|
||
// Check PMG offline status against alert thresholds
|
||
if m.alertManager != nil {
|
||
m.alertManager.CheckPMG(pmgInst)
|
||
}
|
||
|
||
if errors.IsAuthError(err) {
|
||
m.recordAuthFailure(instanceName, "pmg")
|
||
}
|
||
return
|
||
}
|
||
|
||
pmgInst.Status = "online"
|
||
pmgInst.ConnectionHealth = "healthy"
|
||
if version != nil {
|
||
pmgInst.Version = strings.TrimSpace(version.Version)
|
||
}
|
||
m.state.SetConnectionHealth("pmg-"+instanceName, true)
|
||
m.resetAuthFailures(instanceName, "pmg")
|
||
|
||
cluster, err := client.GetClusterStatus(ctx, true)
|
||
if err != nil {
|
||
if debugEnabled {
|
||
log.Debug().Err(err).Str("instance", instanceName).Msg("Failed to retrieve PMG cluster status")
|
||
}
|
||
}
|
||
|
||
backupNodes := make(map[string]struct{})
|
||
|
||
if len(cluster) > 0 {
|
||
nodes := make([]models.PMGNodeStatus, 0, len(cluster))
|
||
for _, entry := range cluster {
|
||
status := strings.ToLower(strings.TrimSpace(entry.Type))
|
||
if status == "" {
|
||
status = "online"
|
||
}
|
||
node := models.PMGNodeStatus{
|
||
Name: entry.Name,
|
||
Status: status,
|
||
Role: entry.Type,
|
||
}
|
||
|
||
backupNodes[entry.Name] = struct{}{}
|
||
|
||
// Fetch queue status for this node
|
||
if queueData, qErr := client.GetQueueStatus(ctx, entry.Name); qErr != nil {
|
||
if debugEnabled {
|
||
log.Debug().Err(qErr).
|
||
Str("instance", instanceName).
|
||
Str("node", entry.Name).
|
||
Msg("Failed to fetch PMG queue status")
|
||
}
|
||
} else if queueData != nil {
|
||
total := queueData.Active.Int64() + queueData.Deferred.Int64() + queueData.Hold.Int64() + queueData.Incoming.Int64()
|
||
node.QueueStatus = &models.PMGQueueStatus{
|
||
Active: queueData.Active.Int(),
|
||
Deferred: queueData.Deferred.Int(),
|
||
Hold: queueData.Hold.Int(),
|
||
Incoming: queueData.Incoming.Int(),
|
||
Total: int(total),
|
||
OldestAge: queueData.OldestAge.Int64(),
|
||
UpdatedAt: time.Now(),
|
||
}
|
||
}
|
||
|
||
nodes = append(nodes, node)
|
||
}
|
||
pmgInst.Nodes = nodes
|
||
}
|
||
|
||
if len(backupNodes) == 0 {
|
||
trimmed := strings.TrimSpace(instanceName)
|
||
if trimmed != "" {
|
||
backupNodes[trimmed] = struct{}{}
|
||
}
|
||
}
|
||
|
||
pmgBackups := make([]models.PMGBackup, 0)
|
||
seenBackupIDs := make(map[string]struct{})
|
||
|
||
for nodeName := range backupNodes {
|
||
if ctx.Err() != nil {
|
||
break
|
||
}
|
||
|
||
backups, backupErr := client.ListBackups(ctx, nodeName)
|
||
if backupErr != nil {
|
||
if debugEnabled {
|
||
log.Debug().Err(backupErr).
|
||
Str("instance", instanceName).
|
||
Str("node", nodeName).
|
||
Msg("Failed to list PMG configuration backups")
|
||
}
|
||
continue
|
||
}
|
||
|
||
for _, b := range backups {
|
||
timestamp := b.Timestamp.Int64()
|
||
backupTime := time.Unix(timestamp, 0)
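			// The ID built below doubles as the cross-node dedup key; it is derived
			// solely from instance, node and the backup's unix timestamp, e.g.
			// "pmg-<instance>-<node>-1717000000" (illustrative value).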
id := fmt.Sprintf("pmg-%s-%s-%d", instanceName, nodeName, timestamp)
|
||
if _, exists := seenBackupIDs[id]; exists {
|
||
continue
|
||
}
|
||
seenBackupIDs[id] = struct{}{}
|
||
pmgBackups = append(pmgBackups, models.PMGBackup{
|
||
ID: id,
|
||
Instance: instanceName,
|
||
Node: nodeName,
|
||
Filename: b.Filename,
|
||
BackupTime: backupTime,
|
||
Size: b.Size.Int64(),
|
||
})
|
||
}
|
||
}
|
||
|
||
if debugEnabled {
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Int("backupCount", len(pmgBackups)).
|
||
Msg("PMG backups polled")
|
||
}
|
||
|
||
if stats, err := client.GetMailStatistics(ctx, ""); err != nil {
|
||
log.Warn().Err(err).Str("instance", instanceName).Msg("Failed to fetch PMG mail statistics")
|
||
} else if stats != nil {
|
||
pmgInst.MailStats = &models.PMGMailStats{
|
||
Timeframe: "day",
|
||
CountTotal: stats.Count.Float64(),
|
||
CountIn: stats.CountIn.Float64(),
|
||
CountOut: stats.CountOut.Float64(),
|
||
SpamIn: stats.SpamIn.Float64(),
|
||
SpamOut: stats.SpamOut.Float64(),
|
||
VirusIn: stats.VirusIn.Float64(),
|
||
VirusOut: stats.VirusOut.Float64(),
|
||
BouncesIn: stats.BouncesIn.Float64(),
|
||
BouncesOut: stats.BouncesOut.Float64(),
|
||
BytesIn: stats.BytesIn.Float64(),
|
||
BytesOut: stats.BytesOut.Float64(),
|
||
GreylistCount: stats.GreylistCount.Float64(),
|
||
JunkIn: stats.JunkIn.Float64(),
|
||
AverageProcessTimeMs: stats.AvgProcessSec.Float64() * 1000,
|
||
RBLRejects: stats.RBLRejects.Float64(),
|
||
PregreetRejects: stats.Pregreet.Float64(),
|
||
UpdatedAt: time.Now(),
|
||
}
|
||
}
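	// The 86400 passed here is one day in seconds; each returned entry is
	// converted below into an hourly mail-count point (Timeframe "hour").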
if counts, err := client.GetMailCount(ctx, 86400); err != nil {
|
||
if debugEnabled {
|
||
log.Debug().Err(err).Str("instance", instanceName).Msg("Failed to fetch PMG mail count data")
|
||
}
|
||
} else if len(counts) > 0 {
|
||
points := make([]models.PMGMailCountPoint, 0, len(counts))
|
||
for _, entry := range counts {
|
||
ts := time.Unix(entry.Time.Int64(), 0)
|
||
points = append(points, models.PMGMailCountPoint{
|
||
Timestamp: ts,
|
||
Count: entry.Count.Float64(),
|
||
CountIn: entry.CountIn.Float64(),
|
||
CountOut: entry.CountOut.Float64(),
|
||
SpamIn: entry.SpamIn.Float64(),
|
||
SpamOut: entry.SpamOut.Float64(),
|
||
VirusIn: entry.VirusIn.Float64(),
|
||
VirusOut: entry.VirusOut.Float64(),
|
||
RBLRejects: entry.RBLRejects.Float64(),
|
||
Pregreet: entry.PregreetReject.Float64(),
|
||
BouncesIn: entry.BouncesIn.Float64(),
|
||
BouncesOut: entry.BouncesOut.Float64(),
|
||
Greylist: entry.GreylistCount.Float64(),
|
||
Index: entry.Index.Int(),
|
||
Timeframe: "hour",
|
||
WindowStart: ts,
|
||
})
|
||
}
|
||
pmgInst.MailCount = points
|
||
}
|
||
|
||
if scores, err := client.GetSpamScores(ctx); err != nil {
|
||
if debugEnabled {
|
||
log.Debug().Err(err).Str("instance", instanceName).Msg("Failed to fetch PMG spam score distribution")
|
||
}
|
||
} else if len(scores) > 0 {
|
||
buckets := make([]models.PMGSpamBucket, 0, len(scores))
|
||
for _, bucket := range scores {
|
||
buckets = append(buckets, models.PMGSpamBucket{
|
||
Score: bucket.Level,
|
||
Count: float64(bucket.Count.Int()),
|
||
})
|
||
}
|
||
pmgInst.SpamDistribution = buckets
|
||
}
|
||
|
||
quarantine := models.PMGQuarantineTotals{}
|
||
if spamStatus, err := client.GetQuarantineStatus(ctx, "spam"); err == nil && spamStatus != nil {
|
||
quarantine.Spam = int(spamStatus.Count.Int64())
|
||
}
|
||
if virusStatus, err := client.GetQuarantineStatus(ctx, "virus"); err == nil && virusStatus != nil {
|
||
quarantine.Virus = int(virusStatus.Count.Int64())
|
||
}
|
||
pmgInst.Quarantine = &quarantine
|
||
|
||
m.state.UpdatePMGBackups(instanceName, pmgBackups)
|
||
m.state.UpdatePMGInstance(pmgInst)
|
||
log.Info().
|
||
Str("instance", instanceName).
|
||
Str("status", pmgInst.Status).
|
||
Int("nodes", len(pmgInst.Nodes)).
|
||
Msg("PMG instance updated in state")
|
||
|
||
// Check PMG metrics against alert thresholds
|
||
if m.alertManager != nil {
|
||
m.alertManager.CheckPMG(pmgInst)
|
||
}
|
||
}
|
||
|
||
// GetState returns the current state
|
||
func (m *Monitor) GetState() models.StateSnapshot {
|
||
// Check if mock mode is enabled
|
||
if mock.IsMockEnabled() {
|
||
state := mock.GetMockState()
|
||
if state.ActiveAlerts == nil {
|
||
// Populate snapshot lazily if the cache hasn't been filled yet.
|
||
mock.UpdateAlertSnapshots(m.alertManager.GetActiveAlerts(), m.alertManager.GetRecentlyResolved())
|
||
state = mock.GetMockState()
|
||
}
|
||
return state
|
||
}
|
||
return m.state.GetSnapshot()
|
||
}
|
||
|
||
// SetOrgID sets the organization ID for this monitor instance.
|
||
// This is used for tenant isolation in multi-tenant deployments.
|
||
func (m *Monitor) SetOrgID(orgID string) {
|
||
m.mu.Lock()
|
||
defer m.mu.Unlock()
|
||
m.orgID = orgID
|
||
}
|
||
|
||
// GetOrgID returns the organization ID for this monitor instance.
|
||
// Returns empty string for default/legacy monitors.
|
||
func (m *Monitor) GetOrgID() string {
|
||
m.mu.RLock()
|
||
defer m.mu.RUnlock()
|
||
return m.orgID
|
||
}
|
||
|
||
// broadcastState broadcasts state to WebSocket clients.
|
||
// For tenant monitors, it broadcasts only to clients of that tenant.
|
||
// For default monitors, it broadcasts to all clients.
|
||
func (m *Monitor) broadcastState(hub *websocket.Hub, frontendState interface{}) {
|
||
if hub == nil {
|
||
return
|
||
}
|
||
|
||
orgID := m.GetOrgID()
|
||
if orgID != "" && orgID != "default" {
|
||
// Tenant-specific broadcast
|
||
hub.BroadcastStateToTenant(orgID, frontendState)
|
||
} else {
|
||
// Legacy broadcast to all clients
|
||
hub.BroadcastState(frontendState)
|
||
}
|
||
}
|
||
|
||
// SetMockMode switches between mock data and real infrastructure data at runtime.
|
||
func (m *Monitor) SetMockMode(enable bool) {
|
||
current := mock.IsMockEnabled()
|
||
if current == enable {
|
||
log.Info().Bool("mockMode", enable).Msg("Mock mode already in desired state")
|
||
return
|
||
}
|
||
|
||
if enable {
|
||
m.stopMockMetricsSampler()
|
||
mock.SetEnabled(true)
|
||
m.alertManager.ClearActiveAlerts()
|
||
m.mu.Lock()
|
||
m.resetStateLocked()
|
||
m.metricsHistory.Reset()
|
||
m.mu.Unlock()
|
||
m.StopDiscoveryService()
|
||
m.mu.RLock()
|
||
ctx := m.runtimeCtx
|
||
m.mu.RUnlock()
|
||
if ctx != nil {
|
||
m.startMockMetricsSampler(ctx)
|
||
}
|
||
log.Info().Msg("Switched monitor to mock mode")
|
||
} else {
|
||
m.stopMockMetricsSampler()
|
||
mock.SetEnabled(false)
|
||
m.alertManager.ClearActiveAlerts()
|
||
m.mu.Lock()
|
||
m.resetStateLocked()
|
||
m.metricsHistory.Reset()
|
||
m.mu.Unlock()
|
||
log.Info().Msg("Switched monitor to real data mode")
|
||
}
|
||
|
||
m.mu.RLock()
|
||
ctx := m.runtimeCtx
|
||
hub := m.wsHub
|
||
m.mu.RUnlock()
|
||
|
||
if hub != nil {
|
||
state := m.GetState()
|
||
frontendState := state.ToFrontend()
|
||
m.updateResourceStore(state)
|
||
frontendState.Resources = m.getResourcesForBroadcast()
|
||
// Use tenant-aware broadcast method
|
||
m.broadcastState(hub, frontendState)
|
||
}
|
||
|
||
if !enable && ctx != nil && hub != nil {
|
||
// Kick off an immediate poll to repopulate state with live data
|
||
go m.poll(ctx, hub)
|
||
if m.config.DiscoveryEnabled {
|
||
go m.StartDiscoveryService(ctx, hub, m.config.DiscoverySubnet)
|
||
}
|
||
}
|
||
}
|
||
|
||
func (m *Monitor) resetStateLocked() {
	m.state = models.NewState()
	m.state.Stats = models.Stats{
		StartTime: m.startTime,
		Version:   "2.0.0-go",
	}
}
|
||
|
||
// GetStartTime returns the monitor start time
|
||
func (m *Monitor) GetStartTime() time.Time {
|
||
return m.startTime
|
||
}
|
||
|
||
// GetDiscoveryService returns the discovery service
|
||
func (m *Monitor) GetDiscoveryService() *discovery.Service {
|
||
return m.discoveryService
|
||
}
|
||
|
||
// StartDiscoveryService starts the discovery service if not already running
|
||
func (m *Monitor) StartDiscoveryService(ctx context.Context, wsHub *websocket.Hub, subnet string) {
|
||
m.mu.Lock()
|
||
defer m.mu.Unlock()
|
||
|
||
if m.discoveryService != nil {
|
||
log.Debug().Msg("Discovery service already running")
|
||
return
|
||
}
|
||
|
||
if subnet == "" {
|
||
subnet = "auto"
|
||
}
|
||
|
||
cfgProvider := func() config.DiscoveryConfig {
|
||
m.mu.RLock()
|
||
defer m.mu.RUnlock()
|
||
if m.config == nil {
|
||
return config.DefaultDiscoveryConfig()
|
||
}
|
||
return config.CloneDiscoveryConfig(m.config.Discovery)
|
||
}
|
||
|
||
m.discoveryService = discovery.NewService(wsHub, 5*time.Minute, subnet, cfgProvider)
|
||
if m.discoveryService != nil {
|
||
m.discoveryService.Start(ctx)
|
||
log.Info().Str("subnet", subnet).Msg("Discovery service started")
|
||
} else {
|
||
log.Error().Msg("Failed to create discovery service")
|
||
}
|
||
}
|
||
|
||
// StopDiscoveryService stops the discovery service if running
|
||
func (m *Monitor) StopDiscoveryService() {
|
||
m.mu.Lock()
|
||
defer m.mu.Unlock()
|
||
|
||
if m.discoveryService != nil {
|
||
m.discoveryService.Stop()
|
||
m.discoveryService = nil
|
||
log.Info().Msg("Discovery service stopped")
|
||
}
|
||
}
|
||
|
||
// EnableTemperatureMonitoring enables temperature data collection
|
||
func (m *Monitor) EnableTemperatureMonitoring() {
|
||
// Temperature collection is always enabled when tempCollector is initialized
|
||
// This method exists for interface compatibility
|
||
log.Info().Msg("Temperature monitoring enabled")
|
||
}
|
||
|
||
// DisableTemperatureMonitoring disables temperature data collection
|
||
func (m *Monitor) DisableTemperatureMonitoring() {
|
||
	// Temperature collection currently stays active whenever tempCollector is
	// initialized; disabling is not supported at runtime, so this method exists
	// only for interface compatibility.
|
||
log.Info().Msg("Temperature monitoring disabled")
|
||
}
|
||
|
||
// inMemoryChartThreshold is the max duration reliably covered by the in-memory
|
||
// metrics buffer. With 1000 data-points at a 10 s polling interval the buffer
|
||
// holds ~2.8 h of data; 2 h is a safe conservative cut-off.
|
||
const inMemoryChartThreshold = 2 * time.Hour
|
||
|
||
// chartDownsampleTarget is the number of points returned per metric when
|
||
// falling back to the persistent store. 500 is more than enough for any
|
||
// sparkline or thumbnail chart.
|
||
const chartDownsampleTarget = 500
|
||
|
||
// GetGuestMetrics returns historical metrics for a guest
|
||
func (m *Monitor) GetGuestMetrics(guestID string, duration time.Duration) map[string][]MetricPoint {
|
||
return m.metricsHistory.GetAllGuestMetrics(guestID, duration)
|
||
}
|
||
|
||
// GetGuestMetricsForChart returns guest metrics optimised for chart display.
|
||
// Short ranges are served from the in-memory ring buffer; longer ranges fall
|
||
// back to the persistent SQLite store with LTTB downsampling.
|
||
//
|
||
// inMemoryKey is the key used in the in-memory buffer (e.g. "docker:abc123").
|
||
// sqlResourceType/sqlResourceID are the type/id used in the SQLite store
|
||
// (e.g. "dockerContainer"/"abc123").
func (m *Monitor) GetGuestMetricsForChart(inMemoryKey, sqlResourceType, sqlResourceID string, duration time.Duration) map[string][]MetricPoint {
|
||
if duration <= inMemoryChartThreshold || m.metricsStore == nil {
|
||
return m.metricsHistory.GetAllGuestMetrics(inMemoryKey, duration)
|
||
}
|
||
end := time.Now()
|
||
start := end.Add(-duration)
|
||
sqlResult, err := m.metricsStore.QueryAll(sqlResourceType, sqlResourceID, start, end, 0)
|
||
if err != nil || len(sqlResult) == 0 {
|
||
return m.metricsHistory.GetAllGuestMetrics(inMemoryKey, duration)
|
||
}
|
||
return convertAndDownsample(sqlResult, chartDownsampleTarget)
|
||
}
|
||
|
||
// GetNodeMetrics returns historical metrics for a node
|
||
func (m *Monitor) GetNodeMetrics(nodeID string, metricType string, duration time.Duration) []MetricPoint {
|
||
return m.metricsHistory.GetNodeMetrics(nodeID, metricType, duration)
|
||
}
|
||
|
||
// GetNodeMetricsForChart returns node metrics for a single metric type,
|
||
// falling back to SQLite + LTTB for longer ranges.
|
||
func (m *Monitor) GetNodeMetricsForChart(nodeID, metricType string, duration time.Duration) []MetricPoint {
|
||
if duration <= inMemoryChartThreshold || m.metricsStore == nil {
|
||
return m.metricsHistory.GetNodeMetrics(nodeID, metricType, duration)
|
||
}
|
||
end := time.Now()
|
||
start := end.Add(-duration)
|
||
sqlPoints, err := m.metricsStore.Query("node", nodeID, metricType, start, end, 0)
|
||
if err != nil || len(sqlPoints) == 0 {
|
||
return m.metricsHistory.GetNodeMetrics(nodeID, metricType, duration)
|
||
}
|
||
converted := make([]MetricPoint, len(sqlPoints))
|
||
for i, p := range sqlPoints {
|
||
converted[i] = MetricPoint{Value: p.Value, Timestamp: p.Timestamp}
|
||
}
|
||
return lttb(converted, chartDownsampleTarget)
|
||
}
|
||
|
||
// GetStorageMetrics returns historical metrics for storage
|
||
func (m *Monitor) GetStorageMetrics(storageID string, duration time.Duration) map[string][]MetricPoint {
|
||
return m.metricsHistory.GetAllStorageMetrics(storageID, duration)
|
||
}
|
||
|
||
// GetStorageMetricsForChart returns storage metrics, falling back to SQLite +
|
||
// LTTB for longer ranges.
|
||
func (m *Monitor) GetStorageMetricsForChart(storageID string, duration time.Duration) map[string][]MetricPoint {
|
||
if duration <= inMemoryChartThreshold || m.metricsStore == nil {
|
||
return m.metricsHistory.GetAllStorageMetrics(storageID, duration)
|
||
}
|
||
end := time.Now()
|
||
start := end.Add(-duration)
|
||
sqlResult, err := m.metricsStore.QueryAll("storage", storageID, start, end, 0)
|
||
if err != nil || len(sqlResult) == 0 {
|
||
return m.metricsHistory.GetAllStorageMetrics(storageID, duration)
|
||
}
|
||
return convertAndDownsample(sqlResult, chartDownsampleTarget)
|
||
}
|
||
|
||
// convertAndDownsample converts pkg/metrics.MetricPoint slices to
|
||
// internal/types.MetricPoint slices and applies LTTB downsampling.
|
||
func convertAndDownsample(sqlResult map[string][]metrics.MetricPoint, target int) map[string][]MetricPoint {
|
||
result := make(map[string][]MetricPoint, len(sqlResult))
|
||
for metric, points := range sqlResult {
|
||
converted := make([]MetricPoint, len(points))
|
||
for i, p := range points {
|
||
converted[i] = MetricPoint{Value: p.Value, Timestamp: p.Timestamp}
|
||
}
|
||
result[metric] = lttb(converted, target)
|
||
}
|
||
return result
|
||
}
|
||
|
||
// GetAlertManager returns the alert manager
|
||
func (m *Monitor) GetAlertManager() *alerts.Manager {
|
||
return m.alertManager
|
||
}
|
||
|
||
// GetIncidentStore returns the incident timeline store.
|
||
func (m *Monitor) GetIncidentStore() *memory.IncidentStore {
|
||
return m.incidentStore
|
||
}
|
||
|
||
// SetAlertTriggeredAICallback sets an additional callback for AI analysis when alerts fire
// This enables token-efficient, real-time AI insights on specific resources
func (m *Monitor) SetAlertTriggeredAICallback(callback func(*alerts.Alert)) {
|
||
m.mu.Lock()
|
||
defer m.mu.Unlock()
|
||
m.alertTriggeredAICallback = callback
|
||
log.Info().Msg("Alert-triggered AI callback registered")
|
||
}
|
||
|
||
// SetAlertResolvedAICallback sets an additional callback when alerts are resolved.
|
||
// This enables AI systems (like incident recording) to stop or finalize context after resolution.
|
||
func (m *Monitor) SetAlertResolvedAICallback(callback func(*alerts.Alert)) {
|
||
if m.alertManager == nil {
|
||
return
|
||
}
|
||
	m.mu.Lock()
	defer m.mu.Unlock()
	m.alertResolvedAICallback = callback
|
||
log.Info().Msg("Alert-resolved AI callback registered")
|
||
}
|
||
|
||
func (m *Monitor) handleAlertFired(alert *alerts.Alert) {
|
||
if alert == nil {
|
||
return
|
||
}
|
||
|
||
if m.wsHub != nil {
|
||
m.wsHub.BroadcastAlert(alert)
|
||
}
|
||
|
||
log.Debug().
|
||
Str("alertID", alert.ID).
|
||
Str("level", string(alert.Level)).
|
||
Msg("Alert raised, sending to notification manager")
|
||
if m.notificationMgr != nil {
|
||
go m.notificationMgr.SendAlert(alert)
|
||
}
|
||
|
||
if m.incidentStore != nil {
|
||
m.incidentStore.RecordAlertFired(alert)
|
||
}
|
||
|
||
// Trigger AI analysis if callback is configured
|
||
if m.alertTriggeredAICallback != nil {
|
||
// Run in goroutine to avoid blocking the monitor loop
|
||
go func() {
|
||
defer func() {
|
||
if r := recover(); r != nil {
|
||
log.Error().Interface("panic", r).Msg("Panic in AI alert callback")
|
||
}
|
||
}()
|
||
m.alertTriggeredAICallback(alert)
|
||
}()
|
||
}
|
||
}
|
||
|
||
func (m *Monitor) handleAlertResolved(alertID string) {
|
||
var resolvedAlert *alerts.ResolvedAlert
|
||
|
||
if m.wsHub != nil {
|
||
m.wsHub.BroadcastAlertResolved(alertID)
|
||
}
|
||
|
||
// Always record incident timeline, regardless of notification suppression.
|
||
// This ensures we have a complete history even during quiet hours.
|
||
if m.incidentStore != nil {
|
||
resolvedAlert = m.alertManager.GetResolvedAlert(alertID)
|
||
if resolvedAlert != nil && resolvedAlert.Alert != nil {
|
||
m.incidentStore.RecordAlertResolved(resolvedAlert.Alert, resolvedAlert.ResolvedTime)
|
||
}
|
||
}
|
||
|
||
// Always trigger AI callback, regardless of notification suppression.
|
||
if m.alertResolvedAICallback != nil {
|
||
if resolvedAlert == nil {
|
||
resolvedAlert = m.alertManager.GetResolvedAlert(alertID)
|
||
}
|
||
if resolvedAlert != nil && resolvedAlert.Alert != nil {
|
||
go m.alertResolvedAICallback(resolvedAlert.Alert)
|
||
}
|
||
}
|
||
|
||
// Handle notifications (may be suppressed by quiet hours)
|
||
if m.notificationMgr != nil {
|
||
m.notificationMgr.CancelAlert(alertID)
|
||
if m.notificationMgr.GetNotifyOnResolve() {
|
||
if resolvedAlert == nil {
|
||
resolvedAlert = m.alertManager.GetResolvedAlert(alertID)
|
||
}
|
||
if resolvedAlert != nil {
|
||
// Check if recovery notification should be suppressed during quiet hours
|
||
if m.alertManager.ShouldSuppressResolvedNotification(resolvedAlert.Alert) {
|
||
log.Info().
|
||
Str("alertID", alertID).
|
||
Str("type", resolvedAlert.Alert.Type).
|
||
Msg("Resolved notification skipped - suppressed by quiet hours")
|
||
return
|
||
}
|
||
go m.notificationMgr.SendResolvedAlert(resolvedAlert)
|
||
}
|
||
} else {
|
||
log.Info().
|
||
Str("alertID", alertID).
|
||
Msg("Resolved notification skipped - notifyOnResolve is disabled")
|
||
}
|
||
}
|
||
}
|
||
|
||
func (m *Monitor) handleAlertAcknowledged(alert *alerts.Alert, user string) {
|
||
if m.incidentStore == nil || alert == nil {
|
||
return
|
||
}
|
||
m.incidentStore.RecordAlertAcknowledged(alert, user)
|
||
}
|
||
|
||
func (m *Monitor) handleAlertUnacknowledged(alert *alerts.Alert, user string) {
|
||
if m.incidentStore == nil || alert == nil {
|
||
return
|
||
}
|
||
m.incidentStore.RecordAlertUnacknowledged(alert, user)
|
||
}
|
||
|
||
// broadcastStateUpdate sends an immediate state update to all WebSocket clients.
|
||
// Call this after updating state with new data that should be visible immediately.
|
||
func (m *Monitor) broadcastStateUpdate() {
|
||
m.mu.RLock()
|
||
hub := m.wsHub
|
||
m.mu.RUnlock()
|
||
|
||
if hub == nil {
|
||
return
|
||
}
|
||
|
||
state := m.GetState()
|
||
frontendState := state.ToFrontend()
|
||
m.updateResourceStore(state)
|
||
frontendState.Resources = m.getResourcesForBroadcast()
|
||
// Use tenant-aware broadcast method
|
||
m.broadcastState(hub, frontendState)
|
||
}
|
||
|
||
// SetResourceStore sets the resource store for polling optimization.
|
||
// When set, the monitor will check if it should reduce polling frequency
|
||
// for nodes that have host agents providing data.
|
||
func (m *Monitor) SetResourceStore(store ResourceStoreInterface) {
|
||
m.mu.Lock()
|
||
defer m.mu.Unlock()
|
||
m.resourceStore = store
|
||
log.Info().Msg("Resource store set for polling optimization")
|
||
}
|
||
|
||
// GetNotificationManager returns the notification manager
|
||
func (m *Monitor) GetNotificationManager() *notifications.NotificationManager {
|
||
return m.notificationMgr
|
||
}
|
||
|
||
// GetConfigPersistence returns the config persistence manager
|
||
func (m *Monitor) GetConfigPersistence() *config.ConfigPersistence {
|
||
return m.configPersist
|
||
}
|
||
|
||
// GetMetricsStore returns the persistent metrics store
|
||
func (m *Monitor) GetMetricsStore() *metrics.Store {
|
||
return m.metricsStore
|
||
}
|
||
|
||
// GetMetricsHistory returns the in-memory metrics history for trend analysis
|
||
// This is used by the AI context builder to compute trends and predictions
|
||
func (m *Monitor) GetMetricsHistory() *MetricsHistory {
|
||
return m.metricsHistory
|
||
}
|
||
|
||
// shouldSkipNodeMetrics returns true if we should skip detailed metric polling
|
||
// for the given node because a host agent is providing richer data.
|
||
// This helps reduce API load when agents are active.
|
||
func (m *Monitor) shouldSkipNodeMetrics(nodeName string) bool {
|
||
m.mu.RLock()
|
||
store := m.resourceStore
|
||
m.mu.RUnlock()
|
||
|
||
if store == nil {
|
||
return false
|
||
}
|
||
|
||
should := store.ShouldSkipAPIPolling(nodeName)
|
||
if should {
|
||
log.Debug().
|
||
Str("node", nodeName).
|
||
Msg("Skipping detailed node metrics - host agent provides data")
|
||
}
|
||
return should
|
||
}
|
||
|
||
// updateResourceStore populates the resource store with data from the current state.
|
||
// This should be called before broadcasting to ensure fresh data.
|
||
func (m *Monitor) updateResourceStore(state models.StateSnapshot) {
|
||
m.mu.RLock()
|
||
store := m.resourceStore
|
||
m.mu.RUnlock()
|
||
|
||
if store == nil {
|
||
log.Debug().Msg("[Resources] No resource store configured, skipping population")
|
||
return
|
||
}
|
||
|
||
log.Debug().
|
||
Int("nodes", len(state.Nodes)).
|
||
Int("vms", len(state.VMs)).
|
||
Int("containers", len(state.Containers)).
|
||
Int("hosts", len(state.Hosts)).
|
||
Int("dockerHosts", len(state.DockerHosts)).
|
||
Msg("[Resources] Populating resource store from state snapshot")
|
||
|
||
store.PopulateFromSnapshot(state)
|
||
}
|
||
|
||
// getResourcesForBroadcast retrieves all resources from the store and converts them to frontend format.
|
||
// Returns nil if no resource store is configured.
|
||
func (m *Monitor) getResourcesForBroadcast() []models.ResourceFrontend {
|
||
m.mu.RLock()
|
||
store := m.resourceStore
|
||
m.mu.RUnlock()
|
||
|
||
if store == nil {
|
||
log.Debug().Msg("[Resources] No store for broadcast")
|
||
return nil
|
||
}
|
||
|
||
allResources := store.GetAll()
|
||
log.Debug().Int("count", len(allResources)).Msg("[Resources] Got resources for broadcast")
|
||
if len(allResources) == 0 {
|
||
return nil
|
||
}
|
||
|
||
result := make([]models.ResourceFrontend, len(allResources))
|
||
for i, r := range allResources {
|
||
input := models.ResourceConvertInput{
|
||
ID: r.ID,
|
||
Type: string(r.Type),
|
||
Name: r.Name,
|
||
DisplayName: r.DisplayName,
|
||
PlatformID: r.PlatformID,
|
||
PlatformType: string(r.PlatformType),
|
||
SourceType: string(r.SourceType),
|
||
ParentID: r.ParentID,
|
||
ClusterID: r.ClusterID,
|
||
Status: string(r.Status),
|
||
Temperature: r.Temperature,
|
||
Uptime: r.Uptime,
|
||
Tags: r.Tags,
|
||
Labels: r.Labels,
|
||
LastSeenUnix: r.LastSeen.UnixMilli(),
|
||
}
|
||
|
||
// Convert metrics
|
||
if r.CPU != nil {
|
||
input.CPU = &models.ResourceMetricInput{
|
||
Current: r.CPU.Current,
|
||
Total: r.CPU.Total,
|
||
Used: r.CPU.Used,
|
||
Free: r.CPU.Free,
|
||
}
|
||
}
|
||
if r.Memory != nil {
|
||
input.Memory = &models.ResourceMetricInput{
|
||
Current: r.Memory.Current,
|
||
Total: r.Memory.Total,
|
||
Used: r.Memory.Used,
|
||
Free: r.Memory.Free,
|
||
}
|
||
}
|
||
if r.Disk != nil {
|
||
input.Disk = &models.ResourceMetricInput{
|
||
Current: r.Disk.Current,
|
||
Total: r.Disk.Total,
|
||
Used: r.Disk.Used,
|
||
Free: r.Disk.Free,
|
||
}
|
||
}
|
||
if r.Network != nil {
|
||
input.HasNetwork = true
|
||
input.NetworkRX = r.Network.RXBytes
|
||
input.NetworkTX = r.Network.TXBytes
|
||
}
|
||
|
||
// Convert alerts
|
||
if len(r.Alerts) > 0 {
|
||
input.Alerts = make([]models.ResourceAlertInput, len(r.Alerts))
|
||
for j, a := range r.Alerts {
|
||
input.Alerts[j] = models.ResourceAlertInput{
|
||
ID: a.ID,
|
||
Type: a.Type,
|
||
Level: a.Level,
|
||
Message: a.Message,
|
||
Value: a.Value,
|
||
Threshold: a.Threshold,
|
||
StartTimeUnix: a.StartTime.UnixMilli(),
|
||
}
|
||
}
|
||
}
|
||
|
||
// Convert identity
|
||
if r.Identity != nil {
|
||
input.Identity = &models.ResourceIdentityInput{
|
||
Hostname: r.Identity.Hostname,
|
||
MachineID: r.Identity.MachineID,
|
||
IPs: r.Identity.IPs,
|
||
}
|
||
}
|
||
|
||
// Pass platform data directly as json.RawMessage
|
||
input.PlatformData = r.PlatformData
|
||
|
||
result[i] = models.ConvertResourceToFrontend(input)
|
||
}
|
||
|
||
return result
|
||
}
|
||
|
||
// pollStorageBackupsWithNodes polls backups using a provided nodes list to avoid duplicate GetNodes calls
|
||
func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName string, client PVEClientInterface, nodes []proxmox.Node, nodeEffectiveStatus map[string]string) {
|
||
|
||
var allBackups []models.StorageBackup
|
||
seenVolids := make(map[string]bool) // Track seen volume IDs to avoid duplicates
|
||
hadSuccessfulNode := false // Track if at least one node responded successfully
|
||
storagesWithBackup := 0 // Number of storages that should contain backups
|
||
contentSuccess := 0 // Number of successful storage content fetches
|
||
contentFailures := 0 // Number of failed storage content fetches
|
||
storageQueryErrors := 0 // Number of nodes where storage list could not be queried
|
||
hadPermissionError := false // Track if any permission errors occurred this cycle
|
||
storagePreserveNeeded := map[string]struct{}{}
|
||
storageSuccess := map[string]struct{}{}
|
||
|
||
// Build guest lookup map to find actual node for each VMID
|
||
snapshot := m.state.GetSnapshot()
|
||
guestNodeMap := make(map[int]string) // VMID -> actual node name
|
||
for _, vm := range snapshot.VMs {
|
||
if vm.Instance == instanceName {
|
||
guestNodeMap[vm.VMID] = vm.Node
|
||
}
|
||
}
|
||
for _, ct := range snapshot.Containers {
|
||
if ct.Instance == instanceName {
|
||
guestNodeMap[ct.VMID] = ct.Node
|
||
}
|
||
}
|
||
|
||
// For each node, get storage and check content
|
||
for _, node := range nodes {
|
||
if nodeEffectiveStatus[node.Node] != "online" {
|
||
for _, storageName := range storageNamesForNode(instanceName, node.Node, snapshot) {
|
||
storagePreserveNeeded[storageName] = struct{}{}
|
||
}
|
||
continue
|
||
}
|
||
|
||
// Get storage for this node - retry once on timeout
|
||
var storages []proxmox.Storage
|
||
var err error
|
||
|
||
for attempt := 1; attempt <= 2; attempt++ {
|
||
storages, err = client.GetStorage(ctx, node.Node)
|
||
if err == nil {
|
||
break // Success
|
||
}
|
||
|
||
// Check if it's a timeout error
|
||
errStr := err.Error()
|
||
if strings.Contains(errStr, "timeout") || strings.Contains(errStr, "deadline exceeded") {
|
||
if attempt == 1 {
|
||
log.Warn().
|
||
Str("node", node.Node).
|
||
Str("instance", instanceName).
|
||
Msg("Storage query timed out, retrying with extended timeout...")
|
||
// Give it a bit more time on retry
|
||
time.Sleep(2 * time.Second)
|
||
continue
|
||
}
|
||
}
|
||
// Non-timeout error or second attempt failed
|
||
break
|
||
}
|
||
|
||
if err != nil {
|
||
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_storage_for_backups", instanceName, err).WithNode(node.Node)
|
||
log.Warn().Err(monErr).Str("node", node.Node).Msg("Failed to get storage for backups - skipping node")
|
||
for _, storageName := range storageNamesForNode(instanceName, node.Node, snapshot) {
|
||
storagePreserveNeeded[storageName] = struct{}{}
|
||
}
|
||
storageQueryErrors++
|
||
continue
|
||
}
|
||
|
||
hadSuccessfulNode = true
|
||
|
||
// For each storage that can contain backups or templates
|
||
for _, storage := range storages {
|
||
// Check if storage supports backup content
|
||
if !strings.Contains(storage.Content, "backup") {
|
||
continue
|
||
}
|
||
if !storageContentQueryable(storage) {
|
||
continue
|
||
}
|
||
|
||
storagesWithBackup++
|
||
|
||
// Get storage content
|
||
contents, err := client.GetStorageContent(ctx, node.Node, storage.Storage)
|
||
if err != nil {
|
||
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_storage_content", instanceName, err).WithNode(node.Node)
|
||
errStr := strings.ToLower(err.Error())
|
||
|
||
// Check if this is a permission error
|
||
if strings.Contains(errStr, "403") || strings.Contains(errStr, "401") ||
|
||
strings.Contains(errStr, "permission") || strings.Contains(errStr, "forbidden") {
|
||
hadPermissionError = true
|
||
m.mu.Lock()
|
||
m.backupPermissionWarnings[instanceName] = "Missing PVEDatastoreAdmin permission on /storage. Run: pveum aclmod /storage -user pulse-monitor@pam -role PVEDatastoreAdmin"
|
||
m.mu.Unlock()
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Str("node", node.Node).
|
||
Str("storage", storage.Storage).
|
||
Msg("Backup permission denied - PVEDatastoreAdmin role may be missing on /storage")
|
||
} else {
|
||
log.Debug().Err(monErr).
|
||
Str("node", node.Node).
|
||
Str("storage", storage.Storage).
|
||
Msg("Failed to get storage content")
|
||
}
|
||
if _, ok := storageSuccess[storage.Storage]; !ok {
|
||
storagePreserveNeeded[storage.Storage] = struct{}{}
|
||
}
|
||
contentFailures++
|
||
continue
|
||
}
|
||
|
||
contentSuccess++
|
||
storageSuccess[storage.Storage] = struct{}{}
|
||
delete(storagePreserveNeeded, storage.Storage)
|
||
|
||
// Convert to models
|
||
for _, content := range contents {
|
||
// Skip if we've already seen this item (shared storage duplicate)
|
||
if seenVolids[content.Volid] {
|
||
continue
|
||
}
|
||
seenVolids[content.Volid] = true
|
||
|
||
// Skip templates and ISOs - they're not backups
|
||
if content.Content == "vztmpl" || content.Content == "iso" {
|
||
continue
|
||
}
|
||
|
||
// Determine type from content type and VMID
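				// (vzdump volids normally embed the guest type, e.g.
				// "local:backup/vzdump-qemu-100-....vma.zst" is an illustrative path,
				// which is what the substring checks below rely on)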
backupType := "unknown"
|
||
if content.VMID == 0 {
|
||
backupType = "host"
|
||
} else if strings.Contains(content.Volid, "/vm/") || strings.Contains(content.Volid, "qemu") {
|
||
backupType = "qemu"
|
||
} else if strings.Contains(content.Volid, "/ct/") || strings.Contains(content.Volid, "lxc") {
|
||
backupType = "lxc"
|
||
} else if strings.Contains(content.Format, "pbs-ct") {
|
||
// PBS format check as fallback
|
||
backupType = "lxc"
|
||
} else if strings.Contains(content.Format, "pbs-vm") {
|
||
// PBS format check as fallback
|
||
backupType = "qemu"
|
||
}
|
||
|
||
// Determine the correct node: for guest backups (VMID > 0), use the actual guest's node
|
||
// For host backups (VMID == 0), use the node where the backup was found
|
||
backupNode := node.Node
|
||
if content.VMID > 0 {
|
||
if actualNode, found := guestNodeMap[content.VMID]; found {
|
||
backupNode = actualNode
|
||
}
|
||
// If not found in map, fall back to queried node (shouldn't happen normally)
|
||
}
|
||
isPBSStorage := strings.HasPrefix(storage.Storage, "pbs-") || storage.Type == "pbs"
|
||
|
||
// Check verification status for PBS backups
|
||
verified := false
|
||
verificationInfo := ""
|
||
if isPBSStorage {
|
||
// Check if verified flag is set
|
||
if content.Verified > 0 {
|
||
verified = true
|
||
}
|
||
// Also check verification map if available
|
||
if content.Verification != nil {
|
||
if state, ok := content.Verification["state"].(string); ok {
|
||
verified = (state == "ok")
|
||
verificationInfo = state
|
||
}
|
||
}
|
||
}
|
||
|
||
backup := models.StorageBackup{
|
||
ID: fmt.Sprintf("%s-%s", instanceName, content.Volid),
|
||
Storage: storage.Storage,
|
||
Node: backupNode,
|
||
Instance: instanceName,
|
||
Type: backupType,
|
||
VMID: content.VMID,
|
||
Time: time.Unix(content.CTime, 0),
|
||
CTime: content.CTime,
|
||
Size: int64(content.Size),
|
||
Format: content.Format,
|
||
Notes: content.Notes,
|
||
Protected: content.Protected > 0,
|
||
Volid: content.Volid,
|
||
IsPBS: isPBSStorage,
|
||
Verified: verified,
|
||
Verification: verificationInfo,
|
||
}
|
||
|
||
allBackups = append(allBackups, backup)
|
||
}
|
||
}
|
||
}
|
||
|
||
allBackups, preservedStorages := preserveFailedStorageBackups(instanceName, snapshot, storagePreserveNeeded, allBackups)
|
||
if len(preservedStorages) > 0 {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Strs("storages", preservedStorages).
|
||
Msg("Preserving previous storage backup data due to partial failures")
|
||
}
|
||
|
||
// Decide whether to keep existing backups when every query failed
|
||
if shouldPreserveBackups(len(nodes), hadSuccessfulNode, storagesWithBackup, contentSuccess) {
|
||
if len(nodes) > 0 && !hadSuccessfulNode {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Int("nodes", len(nodes)).
|
||
Int("errors", storageQueryErrors).
|
||
Msg("Failed to query storage on all nodes; keeping previous backup list")
|
||
} else if storagesWithBackup > 0 && contentSuccess == 0 {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Int("storages", storagesWithBackup).
|
||
Int("failures", contentFailures).
|
||
Msg("All storage content queries failed; keeping previous backup list")
|
||
}
|
||
return
|
||
}
|
||
|
||
// Update state with storage backups for this instance
|
||
m.state.UpdateStorageBackupsForInstance(instanceName, allBackups)
|
||
|
||
// Sync backup times to VMs/Containers for backup status indicators
|
||
m.state.SyncGuestBackupTimes()
|
||
|
||
if m.alertManager != nil {
|
||
snapshot := m.state.GetSnapshot()
|
||
guestsByKey, guestsByVMID := buildGuestLookups(snapshot, m.guestMetadataStore)
|
||
pveStorage := snapshot.Backups.PVE.StorageBackups
|
||
if len(pveStorage) == 0 && len(snapshot.PVEBackups.StorageBackups) > 0 {
|
||
pveStorage = snapshot.PVEBackups.StorageBackups
|
||
}
|
||
pbsBackups := snapshot.Backups.PBS
|
||
if len(pbsBackups) == 0 && len(snapshot.PBSBackups) > 0 {
|
||
pbsBackups = snapshot.PBSBackups
|
||
}
|
||
pmgBackups := snapshot.Backups.PMG
|
||
if len(pmgBackups) == 0 && len(snapshot.PMGBackups) > 0 {
|
||
pmgBackups = snapshot.PMGBackups
|
||
}
|
||
m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)
|
||
}
|
||
|
||
// Clear permission warning if no permission errors occurred this cycle
|
||
if !hadPermissionError {
|
||
m.mu.Lock()
|
||
delete(m.backupPermissionWarnings, instanceName)
|
||
m.mu.Unlock()
|
||
}
|
||
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Int("count", len(allBackups)).
|
||
Msg("Storage backups polled")
|
||
|
||
// Immediately broadcast the updated state so frontend sees new backups
|
||
m.broadcastStateUpdate()
|
||
}
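// shouldPreserveBackups reports whether the freshly collected backup list is
// too incomplete to trust: either every node's storage query failed, or every
// backup-capable storage refused its content listing. The caller then keeps
// the previous backup list instead of overwriting it with partial data.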
func shouldPreserveBackups(nodeCount int, hadSuccessfulNode bool, storagesWithBackup, contentSuccess int) bool {
	if nodeCount > 0 && !hadSuccessfulNode {
		return true
	}
	if storagesWithBackup > 0 && contentSuccess == 0 {
		return true
	}
	return false
}

func shouldPreservePBSBackups(datastoreCount, datastoreFetches int) bool {
	// If there are datastores but all fetches failed, preserve existing backups
	if datastoreCount > 0 && datastoreFetches == 0 {
		return true
	}
	return false
}
|
||
|
||
func storageNamesForNode(instanceName, nodeName string, snapshot models.StateSnapshot) []string {
|
||
if nodeName == "" {
|
||
return nil
|
||
}
|
||
|
||
var storages []string
|
||
for _, storage := range snapshot.Storage {
|
||
if storage.Instance != instanceName {
|
||
continue
|
||
}
|
||
if storage.Name == "" {
|
||
continue
|
||
}
|
||
if !strings.Contains(storage.Content, "backup") {
|
||
continue
|
||
}
|
||
if storage.Node == nodeName {
|
||
storages = append(storages, storage.Name)
|
||
continue
|
||
}
|
||
for _, node := range storage.Nodes {
|
||
if node == nodeName {
|
||
storages = append(storages, storage.Name)
|
||
break
|
||
}
|
||
}
|
||
}
|
||
|
||
return storages
|
||
}
|
||
|
||
func preserveFailedStorageBackups(instanceName string, snapshot models.StateSnapshot, storagesToPreserve map[string]struct{}, current []models.StorageBackup) ([]models.StorageBackup, []string) {
|
||
if len(storagesToPreserve) == 0 {
|
||
return current, nil
|
||
}
|
||
|
||
existing := make(map[string]struct{}, len(current))
|
||
for _, backup := range current {
|
||
existing[backup.ID] = struct{}{}
|
||
}
|
||
|
||
preserved := make(map[string]struct{})
|
||
for _, backup := range snapshot.PVEBackups.StorageBackups {
|
||
if backup.Instance != instanceName {
|
||
continue
|
||
}
|
||
if _, ok := storagesToPreserve[backup.Storage]; !ok {
|
||
continue
|
||
}
|
||
if _, duplicate := existing[backup.ID]; duplicate {
|
||
continue
|
||
}
|
||
current = append(current, backup)
|
||
existing[backup.ID] = struct{}{}
|
||
preserved[backup.Storage] = struct{}{}
|
||
}
|
||
|
||
if len(preserved) == 0 {
|
||
return current, nil
|
||
}
|
||
|
||
storages := make([]string, 0, len(preserved))
|
||
for storage := range preserved {
|
||
storages = append(storages, storage)
|
||
}
|
||
sort.Strings(storages)
|
||
return current, storages
|
||
}
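// buildGuestLookups indexes live VMs and containers for backup alerting in two
// ways: by full guest key (instance/node/vmid) and by bare VMID. The VMID index
// is additionally back-filled from persisted metadata so backups belonging to
// since-deleted guests can still be matched to a name.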
func buildGuestLookups(snapshot models.StateSnapshot, metadataStore *config.GuestMetadataStore) (map[string]alerts.GuestLookup, map[string][]alerts.GuestLookup) {
|
||
byKey := make(map[string]alerts.GuestLookup)
|
||
byVMID := make(map[string][]alerts.GuestLookup)
|
||
|
||
for _, vm := range snapshot.VMs {
|
||
info := alerts.GuestLookup{
|
||
ResourceID: makeGuestID(vm.Instance, vm.Node, vm.VMID),
|
||
Name: vm.Name,
|
||
Instance: vm.Instance,
|
||
Node: vm.Node,
|
||
Type: vm.Type,
|
||
VMID: vm.VMID,
|
||
}
|
||
key := alerts.BuildGuestKey(vm.Instance, vm.Node, vm.VMID)
|
||
byKey[key] = info
|
||
|
||
vmidKey := strconv.Itoa(vm.VMID)
|
||
byVMID[vmidKey] = append(byVMID[vmidKey], info)
|
||
|
||
// Persist last-known name and type for this guest
|
||
if metadataStore != nil && vm.Name != "" {
|
||
persistGuestIdentity(metadataStore, key, vm.Name, vm.Type)
|
||
}
|
||
}
|
||
|
||
for _, ct := range snapshot.Containers {
|
||
info := alerts.GuestLookup{
|
||
ResourceID: makeGuestID(ct.Instance, ct.Node, ct.VMID),
|
||
Name: ct.Name,
|
||
Instance: ct.Instance,
|
||
Node: ct.Node,
|
||
Type: ct.Type,
|
||
VMID: ct.VMID,
|
||
}
|
||
key := alerts.BuildGuestKey(ct.Instance, ct.Node, ct.VMID)
|
||
if _, exists := byKey[key]; !exists {
|
||
byKey[key] = info
|
||
}
|
||
|
||
vmidKey := strconv.Itoa(ct.VMID)
|
||
byVMID[vmidKey] = append(byVMID[vmidKey], info)
|
||
|
||
// Persist last-known name and type for this guest
|
||
if metadataStore != nil && ct.Name != "" {
|
||
persistGuestIdentity(metadataStore, key, ct.Name, ct.Type)
|
||
}
|
||
}
|
||
|
||
// Augment byVMID with persisted metadata for deleted guests
|
||
if metadataStore != nil {
|
||
enrichWithPersistedMetadata(metadataStore, byVMID)
|
||
}
|
||
|
||
return byKey, byVMID
|
||
}
|
||
|
||
// enrichWithPersistedMetadata adds entries from the metadata store for guests
|
||
// that no longer exist in the live inventory but have persisted identity data
|
||
func enrichWithPersistedMetadata(metadataStore *config.GuestMetadataStore, byVMID map[string][]alerts.GuestLookup) {
|
||
allMetadata := metadataStore.GetAll()
|
||
for guestKey, meta := range allMetadata {
|
||
if meta.LastKnownName == "" {
|
||
continue // No name persisted, skip
|
||
}
|
||
|
||
// Parse the guest key (format: instance:node:vmid)
|
||
// We need to extract instance, node, and vmid
|
||
var instance, node string
|
||
parts := strings.Split(guestKey, ":")
|
||
if len(parts) != 3 {
|
||
continue
|
||
}
|
||
instance, node = parts[0], parts[1]
|
||
vmid, err := strconv.Atoi(parts[2])
|
||
if err != nil {
|
||
continue
|
||
}
|
||
|
||
vmidKey := strconv.Itoa(vmid)
|
||
|
||
// Check if we already have a live entry for this exact guest
|
||
hasLiveEntry := false
|
||
for _, existing := range byVMID[vmidKey] {
|
||
if existing.Instance == instance && existing.Node == node && existing.VMID == vmid {
|
||
hasLiveEntry = true
|
||
break
|
||
}
|
||
}
|
||
|
||
// Only add persisted metadata if no live entry exists
|
||
if !hasLiveEntry {
|
||
byVMID[vmidKey] = append(byVMID[vmidKey], alerts.GuestLookup{
|
||
Name: meta.LastKnownName,
|
||
Instance: instance,
|
||
Node: node,
|
||
Type: meta.LastKnownType,
|
||
VMID: vmid,
|
||
})
|
||
}
|
||
}
|
||
}
|
||
|
||
// persistGuestIdentity updates the metadata store with the last-known name and type for a guest
|
||
func persistGuestIdentity(metadataStore *config.GuestMetadataStore, guestKey, name, guestType string) {
|
||
existing := metadataStore.Get(guestKey)
|
||
if existing == nil {
|
||
existing = &config.GuestMetadata{
|
||
ID: guestKey,
|
||
Tags: []string{},
|
||
}
|
||
}
|
||
|
||
guestType = strings.TrimSpace(guestType)
|
||
if guestType == "" {
|
||
return
|
||
}
|
||
|
||
// Never "downgrade" OCI containers back to LXC. OCI classification can be transiently
|
||
// unavailable if Proxmox config reads fail due to permissions or transient API errors.
|
||
if existing.LastKnownType == "oci" && guestType != "oci" {
|
||
guestType = existing.LastKnownType
|
||
}
|
||
|
||
// Only update if the name or type has changed
|
||
if existing.LastKnownName != name || existing.LastKnownType != guestType {
|
||
existing.LastKnownName = name
|
||
existing.LastKnownType = guestType
|
||
// Save asynchronously to avoid blocking the monitor
|
||
go func() {
|
||
if err := metadataStore.Set(guestKey, existing); err != nil {
|
||
log.Error().Err(err).Str("guestKey", guestKey).Msg("Failed to persist guest identity")
|
||
}
|
||
}()
|
||
}
|
||
}
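// calculateBackupOperationTimeout sizes the backup polling budget from the
// number of non-template guests on the instance: a 2 minute floor, plus 2
// seconds per guest once that exceeds the floor, capped at 5 minutes. For
// example, 90 guests works out to 180 s, giving a 3 minute budget.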
func (m *Monitor) calculateBackupOperationTimeout(instanceName string) time.Duration {
|
||
const (
|
||
minTimeout = 2 * time.Minute
|
||
maxTimeout = 5 * time.Minute
|
||
timeoutPerGuest = 2 * time.Second
|
||
)
|
||
|
||
timeout := minTimeout
|
||
snapshot := m.state.GetSnapshot()
|
||
|
||
guestCount := 0
|
||
for _, vm := range snapshot.VMs {
|
||
if vm.Instance == instanceName && !vm.Template {
|
||
guestCount++
|
||
}
|
||
}
|
||
for _, ct := range snapshot.Containers {
|
||
if ct.Instance == instanceName && !ct.Template {
|
||
guestCount++
|
||
}
|
||
}
|
||
|
||
if guestCount > 0 {
|
||
dynamic := time.Duration(guestCount) * timeoutPerGuest
|
||
if dynamic > timeout {
|
||
timeout = dynamic
|
||
}
|
||
}
|
||
|
||
if timeout > maxTimeout {
|
||
return maxTimeout
|
||
}
|
||
|
||
return timeout
|
||
}
|
||
|
||
// pollGuestSnapshots polls snapshots for all VMs and containers
|
||
func (m *Monitor) pollGuestSnapshots(ctx context.Context, instanceName string, client PVEClientInterface) {
|
||
log.Debug().Str("instance", instanceName).Msg("Polling guest snapshots")
|
||
|
||
// Get current VMs and containers from a properly-locked state snapshot.
|
||
// Using GetSnapshot() ensures we read a consistent view of VMs/containers
|
||
// with the State's internal mutex, avoiding data races.
|
||
snapshot := m.state.GetSnapshot()
|
||
var vms []models.VM
|
||
for _, vm := range snapshot.VMs {
|
||
if vm.Instance == instanceName {
|
||
vms = append(vms, vm)
|
||
}
|
||
}
|
||
var containers []models.Container
|
||
for _, ct := range snapshot.Containers {
|
||
if ct.Instance == instanceName {
|
||
containers = append(containers, ct)
|
||
}
|
||
}
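	// guestKey collapses to "node-vmid" when the instance name matches the node
	// name (standalone hosts) and to "instance-node-vmid" otherwise, e.g.
	// "pve1-101" vs "cluster1-pve1-101" (names illustrative).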
guestKey := func(instance, node string, vmid int) string {
|
||
if instance == node {
|
||
return fmt.Sprintf("%s-%d", node, vmid)
|
||
}
|
||
return fmt.Sprintf("%s-%s-%d", instance, node, vmid)
|
||
}
|
||
|
||
guestNames := make(map[string]string, len(vms)+len(containers))
|
||
for _, vm := range vms {
|
||
guestNames[guestKey(instanceName, vm.Node, vm.VMID)] = vm.Name
|
||
}
|
||
for _, ct := range containers {
|
||
guestNames[guestKey(instanceName, ct.Node, ct.VMID)] = ct.Name
|
||
}
|
||
|
||
activeGuests := 0
|
||
for _, vm := range vms {
|
||
if !vm.Template {
|
||
activeGuests++
|
||
}
|
||
}
|
||
for _, ct := range containers {
|
||
if !ct.Template {
|
||
activeGuests++
|
||
}
|
||
}
|
||
|
||
const (
|
||
minSnapshotTimeout = 60 * time.Second
|
||
maxSnapshotTimeout = 4 * time.Minute
|
||
snapshotTimeoutPerGuest = 2 * time.Second
|
||
)
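
	// Illustrative budget: 150 active guests -> 300 s, clamped to the 4 minute
	// ceiling, while a handful of guests stays at the 60 second floor. Whatever
	// time remains on the parent backup context caps the budget further below.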
timeout := minSnapshotTimeout
|
||
if activeGuests > 0 {
|
||
dynamic := time.Duration(activeGuests) * snapshotTimeoutPerGuest
|
||
if dynamic > timeout {
|
||
timeout = dynamic
|
||
}
|
||
}
|
||
if timeout > maxSnapshotTimeout {
|
||
timeout = maxSnapshotTimeout
|
||
}
|
||
|
||
if deadline, ok := ctx.Deadline(); ok {
|
||
remaining := time.Until(deadline)
|
||
if remaining <= 0 {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Msg("Skipping guest snapshot polling; backup context deadline exceeded")
|
||
return
|
||
}
|
||
if timeout > remaining {
|
||
timeout = remaining
|
||
}
|
||
}
|
||
|
||
snapshotCtx, cancel := context.WithTimeout(ctx, timeout)
|
||
defer cancel()
|
||
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Int("guestCount", activeGuests).
|
||
Dur("timeout", timeout).
|
||
Msg("Guest snapshot polling budget established")
|
||
|
||
var allSnapshots []models.GuestSnapshot
|
||
deadlineExceeded := false
|
||
|
||
// Poll VM snapshots
|
||
for _, vm := range vms {
|
||
// Skip templates
|
||
if vm.Template {
|
||
continue
|
||
}
|
||
|
||
snapshots, err := client.GetVMSnapshots(snapshotCtx, vm.Node, vm.VMID)
|
||
if err != nil {
|
||
if snapshotCtx.Err() != nil {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Str("node", vm.Node).
|
||
Int("vmid", vm.VMID).
|
||
Err(snapshotCtx.Err()).
|
||
Msg("Aborting guest snapshot polling due to context cancellation while fetching VM snapshots")
|
||
deadlineExceeded = true
|
||
break
|
||
}
|
||
// This is common for VMs without snapshots, so use debug level
|
||
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_vm_snapshots", instanceName, err).WithNode(vm.Node)
|
||
log.Debug().
|
||
Err(monErr).
|
||
Str("node", vm.Node).
|
||
Int("vmid", vm.VMID).
|
||
Msg("Failed to get VM snapshots")
|
||
continue
|
||
}
|
||
|
||
for _, snap := range snapshots {
|
||
snapshot := models.GuestSnapshot{
|
||
ID: fmt.Sprintf("%s-%s-%d-%s", instanceName, vm.Node, vm.VMID, snap.Name),
|
||
Name: snap.Name,
|
||
Node: vm.Node,
|
||
Instance: instanceName,
|
||
Type: "qemu",
|
||
VMID: vm.VMID,
|
||
Time: time.Unix(snap.SnapTime, 0),
|
||
Description: snap.Description,
|
||
Parent: snap.Parent,
|
||
VMState: true, // VM state support enabled
|
||
}
|
||
|
||
allSnapshots = append(allSnapshots, snapshot)
|
||
}
|
||
}
|
||
|
||
if deadlineExceeded {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Msg("Guest snapshot polling timed out before completing VM collection; retaining previous snapshots")
|
||
return
|
||
}
|
||
|
||
// Poll container snapshots
|
||
for _, ct := range containers {
|
||
// Skip templates
|
||
if ct.Template {
|
||
continue
|
||
}
|
||
|
||
snapshots, err := client.GetContainerSnapshots(snapshotCtx, ct.Node, ct.VMID)
|
||
if err != nil {
|
||
if snapshotCtx.Err() != nil {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Str("node", ct.Node).
|
||
Int("vmid", ct.VMID).
|
||
Err(snapshotCtx.Err()).
|
||
Msg("Aborting guest snapshot polling due to context cancellation while fetching container snapshots")
|
||
deadlineExceeded = true
|
||
break
|
||
}
|
||
// API error 596 means snapshots not supported/available - this is expected for many containers
|
||
errStr := err.Error()
|
||
if strings.Contains(errStr, "596") || strings.Contains(errStr, "not available") {
|
||
// Silently skip containers without snapshot support
|
||
continue
|
||
}
|
||
// Log other errors at debug level
|
||
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_container_snapshots", instanceName, err).WithNode(ct.Node)
|
||
log.Debug().
|
||
Err(monErr).
|
||
Str("node", ct.Node).
|
||
Int("vmid", ct.VMID).
|
||
Msg("Failed to get container snapshots")
|
||
continue
|
||
}
|
||
|
||
for _, snap := range snapshots {
|
||
snapshot := models.GuestSnapshot{
|
||
ID: fmt.Sprintf("%s-%s-%d-%s", instanceName, ct.Node, ct.VMID, snap.Name),
|
||
Name: snap.Name,
|
||
Node: ct.Node,
|
||
Instance: instanceName,
|
||
Type: "lxc",
|
||
VMID: ct.VMID,
|
||
Time: time.Unix(snap.SnapTime, 0),
|
||
Description: snap.Description,
|
||
Parent: snap.Parent,
|
||
VMState: false,
|
||
}
|
||
|
||
allSnapshots = append(allSnapshots, snapshot)
|
||
}
|
||
}
|
||
|
||
if deadlineExceeded || snapshotCtx.Err() != nil {
|
||
log.Warn().
|
||
Str("instance", instanceName).
|
||
Msg("Guest snapshot polling timed out before completion; retaining previous snapshots")
|
||
return
|
||
}
|
||
|
||
if len(allSnapshots) > 0 {
|
||
sizeMap := m.collectSnapshotSizes(snapshotCtx, instanceName, client, allSnapshots)
|
||
if len(sizeMap) > 0 {
|
||
for i := range allSnapshots {
|
||
if size, ok := sizeMap[allSnapshots[i].ID]; ok && size > 0 {
|
||
allSnapshots[i].SizeBytes = size
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Update state with guest snapshots for this instance
|
||
m.state.UpdateGuestSnapshotsForInstance(instanceName, allSnapshots)
|
||
|
||
if m.alertManager != nil {
|
||
m.alertManager.CheckSnapshotsForInstance(instanceName, allSnapshots, guestNames)
|
||
}
|
||
|
||
log.Debug().
|
||
Str("instance", instanceName).
|
||
Int("count", len(allSnapshots)).
|
||
Msg("Guest snapshots polled")
|
||
|
||
// Immediately broadcast the updated state so frontend sees new snapshots
|
||
m.broadcastStateUpdate()
|
||
}
|
||
|
||
func (m *Monitor) collectSnapshotSizes(ctx context.Context, instanceName string, client PVEClientInterface, snapshots []models.GuestSnapshot) map[string]int64 {
|
||
sizes := make(map[string]int64, len(snapshots))
|
||
if len(snapshots) == 0 {
|
||
return sizes
|
||
}
|
||
|
||
validSnapshots := make(map[string]struct{}, len(snapshots))
|
||
nodes := make(map[string]struct{})
|
||
|
||
for _, snap := range snapshots {
|
||
validSnapshots[snap.ID] = struct{}{}
|
||
if snap.Node != "" {
|
||
nodes[snap.Node] = struct{}{}
|
||
}
|
||
}
|
||
|
||
if len(nodes) == 0 {
|
||
return sizes
|
||
}
|
||
|
||
seenVolids := make(map[string]struct{})
|
||
|
||
for nodeName := range nodes {
|
||
if ctx.Err() != nil {
|
||
break
|
||
}
|
||
|
||
storages, err := client.GetStorage(ctx, nodeName)
|
||
if err != nil {
|
||
log.Debug().
|
||
Err(err).
|
||
Str("node", nodeName).
|
||
Str("instance", instanceName).
|
||
Msg("Failed to get storage list for snapshot sizing")
|
||
continue
|
||
}
|
||
|
||
for _, storage := range storages {
|
||
if ctx.Err() != nil {
|
||
break
|
||
}
|
||
|
||
contentTypes := strings.ToLower(storage.Content)
|
||
if !strings.Contains(contentTypes, "images") && !strings.Contains(contentTypes, "rootdir") {
|
||
continue
|
||
}
|
||
if !storageContentQueryable(storage) {
|
||
continue
|
||
}
|
||
|
||
contents, err := client.GetStorageContent(ctx, nodeName, storage.Storage)
|
||
if err != nil {
|
||
log.Debug().
|
||
Err(err).
|
||
Str("node", nodeName).
|
||
Str("storage", storage.Storage).
|
||
Str("instance", instanceName).
|
||
Msg("Failed to get storage content for snapshot sizing")
|
||
continue
|
||
}
|
||
|
||
for _, item := range contents {
|
||
if item.VMID <= 0 {
|
||
continue
|
||
}
|
||
|
||
if _, seen := seenVolids[item.Volid]; seen {
|
||
continue
|
||
}
|
||
|
||
snapName := extractSnapshotName(item.Volid)
|
||
if snapName == "" {
|
||
continue
|
||
}
|
||
|
||
key := fmt.Sprintf("%s-%s-%d-%s", instanceName, nodeName, item.VMID, snapName)
|
||
if _, ok := validSnapshots[key]; !ok {
|
||
continue
|
||
}
|
||
|
||
seenVolids[item.Volid] = struct{}{}
|
||
|
||
size := int64(item.Size)
|
||
if size < 0 {
|
||
size = 0
|
||
}
|
||
|
||
sizes[key] += size
|
||
}
|
||
}
|
||
}
|
||
|
||
return sizes
|
||
}
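// extractSnapshotName pulls the snapshot suffix out of a storage volid:
// "local-zfs:vm-100-disk-0@daily1" yields "daily1" (illustrative volid), while
// volids without an "@" suffix yield the empty string.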
func extractSnapshotName(volid string) string {
	if volid == "" {
		return ""
	}

	parts := strings.SplitN(volid, ":", 2)
	remainder := volid
	if len(parts) == 2 {
		remainder = parts[1]
	}

	if idx := strings.Index(remainder, "@"); idx >= 0 && idx+1 < len(remainder) {
		return strings.TrimSpace(remainder[idx+1:])
	}

	return ""
}
|
||
|
||
// Stop gracefully stops the monitor
|
||
func (m *Monitor) Stop() {
|
||
log.Info().Msg("Stopping monitor")
|
||
|
||
// Stop the alert manager to save history
|
||
if m.alertManager != nil {
|
||
m.alertManager.Stop()
|
||
}
|
||
|
||
// Stop notification manager
|
||
if m.notificationMgr != nil {
|
||
m.notificationMgr.Stop()
|
||
}
|
||
|
||
// Close persistent metrics store (flushes buffered data)
|
||
if m.metricsStore != nil {
|
||
if err := m.metricsStore.Close(); err != nil {
|
||
log.Error().Err(err).Msg("Failed to close metrics store")
|
||
} else {
|
||
log.Info().Msg("Metrics store closed successfully")
|
||
}
|
||
}
|
||
|
||
log.Info().Msg("Monitor stopped")
|
||
}
|
||
|
||
// recordAuthFailure records an authentication failure for a node
|
||
func (m *Monitor) recordAuthFailure(instanceName string, nodeType string) {
|
||
m.mu.Lock()
|
||
defer m.mu.Unlock()
|
||
|
||
nodeID := instanceName
|
||
if nodeType != "" {
|
||
nodeID = nodeType + "-" + instanceName
|
||
}
|
||
|
||
// Increment failure count
|
||
m.authFailures[nodeID]++
|
||
m.lastAuthAttempt[nodeID] = time.Now()
|
||
|
||
log.Warn().
|
||
Str("node", nodeID).
|
||
Int("failures", m.authFailures[nodeID]).
|
||
Msg("Authentication failure recorded")
|
||
|
||
// If we've exceeded the threshold, remove the node
|
||
const maxAuthFailures = 5
|
||
if m.authFailures[nodeID] >= maxAuthFailures {
|
||
log.Error().
|
||
Str("node", nodeID).
|
||
Int("failures", m.authFailures[nodeID]).
|
||
Msg("Maximum authentication failures reached, removing node from state")
|
||
|
||
// Remove from state based on type
|
||
if nodeType == "pve" {
|
||
m.removeFailedPVENode(instanceName)
|
||
} else if nodeType == "pbs" {
|
||
m.removeFailedPBSNode(instanceName)
|
||
} else if nodeType == "pmg" {
|
||
m.removeFailedPMGInstance(instanceName)
|
||
}
|
||
|
||
// Reset the counter since we've removed the node
|
||
delete(m.authFailures, nodeID)
|
||
delete(m.lastAuthAttempt, nodeID)
|
||
}
|
||
}
|
||
|
||
// resetAuthFailures resets the failure count for a node after successful auth
|
||
func (m *Monitor) resetAuthFailures(instanceName string, nodeType string) {
|
||
m.mu.Lock()
|
||
defer m.mu.Unlock()
|
||
|
||
nodeID := instanceName
|
||
if nodeType != "" {
|
||
nodeID = nodeType + "-" + instanceName
|
||
}
|
||
|
||
if count, exists := m.authFailures[nodeID]; exists && count > 0 {
|
||
log.Info().
|
||
Str("node", nodeID).
|
||
Int("previousFailures", count).
|
||
Msg("Authentication succeeded, resetting failure count")
|
||
|
||
delete(m.authFailures, nodeID)
|
||
delete(m.lastAuthAttempt, nodeID)
|
||
}
|
||
}
|
||
|
||
// removeFailedPVENode updates a PVE node to show failed authentication status.
func (m *Monitor) removeFailedPVENode(instanceName string) {
	// Get instance config to get host URL
	var hostURL string
	for _, cfg := range m.config.PVEInstances {
		if cfg.Name == instanceName {
			hostURL = cfg.Host
			break
		}
	}

	// Create a failed node entry to show in UI with error status
	failedNode := models.Node{
		ID:               instanceName + "-failed",
		Name:             instanceName,
		DisplayName:      instanceName,
		Instance:         instanceName,
		Host:             hostURL, // Include host URL even for failed nodes
		Status:           "offline",
		Type:             "node",
		ConnectionHealth: "error",
		LastSeen:         time.Now(),
		// Set other fields to zero values to indicate no data
		CPU:    0,
		Memory: models.Memory{},
		Disk:   models.Disk{},
	}

	// Update with just the failed node
	m.state.UpdateNodesForInstance(instanceName, []models.Node{failedNode})

	// Remove all other resources associated with this instance
	m.state.UpdateVMsForInstance(instanceName, []models.VM{})
	m.state.UpdateContainersForInstance(instanceName, []models.Container{})
	m.state.UpdateStorageForInstance(instanceName, []models.Storage{})
	m.state.UpdateCephClustersForInstance(instanceName, []models.CephCluster{})
	m.state.UpdateBackupTasksForInstance(instanceName, []models.BackupTask{})
	m.state.UpdateStorageBackupsForInstance(instanceName, []models.StorageBackup{})
	m.state.UpdateGuestSnapshotsForInstance(instanceName, []models.GuestSnapshot{})

	// Set connection health to false
	m.state.SetConnectionHealth(instanceName, false)
}

// removeFailedPBSNode removes a PBS node and all its resources from state.
func (m *Monitor) removeFailedPBSNode(instanceName string) {
	// Remove PBS instance by passing empty array
	currentInstances := m.state.PBSInstances
	var updatedInstances []models.PBSInstance
	for _, inst := range currentInstances {
		if inst.Name != instanceName {
			updatedInstances = append(updatedInstances, inst)
		}
	}
	m.state.UpdatePBSInstances(updatedInstances)

	// Remove PBS backups
	m.state.UpdatePBSBackups(instanceName, []models.PBSBackup{})

	// Set connection health to false
	m.state.SetConnectionHealth("pbs-"+instanceName, false)
}

// removeFailedPMGInstance removes PMG data from state when authentication fails repeatedly.
func (m *Monitor) removeFailedPMGInstance(instanceName string) {
	currentInstances := m.state.PMGInstances
	updated := make([]models.PMGInstance, 0, len(currentInstances))
	for _, inst := range currentInstances {
		if inst.Name != instanceName {
			updated = append(updated, inst)
		}
	}

	m.state.UpdatePMGInstances(updated)
	m.state.UpdatePMGBackups(instanceName, nil)
	m.state.SetConnectionHealth("pmg-"+instanceName, false)
}

// pbsBackupCacheTTL controls how long cached PBS backup snapshots are reused
// before forcing a re-fetch. This ensures verification status changes (which
// don't alter backup count or timestamp) are picked up periodically.
const pbsBackupCacheTTL = 10 * time.Minute

type pbsBackupGroupKey struct {
	datastore  string
	namespace  string
	backupType string
	backupID   string
}

type cachedPBSGroup struct {
	snapshots []models.PBSBackup
	latest    time.Time
}

type pbsBackupFetchRequest struct {
	datastore string
	namespace string
	group     pbs.BackupGroup
	cached    cachedPBSGroup
}

// pollPBSBackups fetches all backups from PBS datastores.
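// Groups whose backup count and newest snapshot are unchanged are served from
// the in-memory cache until pbsBackupCacheTTL expires; only new, changed, or
// stale groups trigger a snapshot listing against the PBS API.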
func (m *Monitor) pollPBSBackups(ctx context.Context, instanceName string, client *pbs.Client, datastores []models.PBSDatastore) {
	log.Debug().Str("instance", instanceName).Msg("Polling PBS backups")

	// Cache existing PBS backups so we can avoid redundant API calls when no changes occurred.
	existingGroups := m.buildPBSBackupCache(instanceName)

	var allBackups []models.PBSBackup
	datastoreCount := len(datastores) // Number of datastores to query
	datastoreFetches := 0             // Number of successful datastore fetches
	datastoreErrors := 0              // Number of failed datastore fetches

	// Process each datastore
	for _, ds := range datastores {
		if ctx.Err() != nil {
			log.Warn().
				Str("instance", instanceName).
				Msg("PBS backup polling cancelled before completion")
			return
		}

		namespacePaths := namespacePathsForDatastore(ds)

		log.Info().
			Str("instance", instanceName).
			Str("datastore", ds.Name).
			Int("namespaces", len(namespacePaths)).
			Strs("namespace_paths", namespacePaths).
			Msg("Processing datastore namespaces")

		datastoreHadSuccess := false
		groupsReused := 0
		groupsRequested := 0

		for _, namespace := range namespacePaths {
			if ctx.Err() != nil {
				log.Warn().
					Str("instance", instanceName).
					Msg("PBS backup polling cancelled mid-datastore")
				return
			}

			groups, err := client.ListBackupGroups(ctx, ds.Name, namespace)
			if err != nil {
				log.Error().
					Err(err).
					Str("instance", instanceName).
					Str("datastore", ds.Name).
					Str("namespace", namespace).
					Msg("Failed to list PBS backup groups")
				continue
			}

			datastoreHadSuccess = true
			requests := make([]pbsBackupFetchRequest, 0, len(groups))

			for _, group := range groups {
				key := pbsBackupGroupKey{
					datastore:  ds.Name,
					namespace:  namespace,
					backupType: group.BackupType,
					backupID:   group.BackupID,
				}
				cached := existingGroups[key]

				// Group deleted (no backups left) - ensure cached data is dropped.
				if group.BackupCount == 0 {
					continue
				}

				lastBackupTime := time.Unix(group.LastBackup, 0)
				hasCachedData := len(cached.snapshots) > 0

				// Check if the cached data is still within its TTL.
				cacheAge := time.Since(m.pbsBackupCacheTimeFor(instanceName, key))
				cacheStillFresh := cacheAge < pbsBackupCacheTTL

				// Only re-fetch when the backup count changes, the most recent backup
				// is newer, or the cache TTL has expired (to pick up verification changes).
				if hasCachedData &&
					cacheStillFresh &&
					len(cached.snapshots) == group.BackupCount &&
					!lastBackupTime.After(cached.latest) {

					allBackups = append(allBackups, cached.snapshots...)
					groupsReused++
					continue
				}

				requests = append(requests, pbsBackupFetchRequest{
					datastore: ds.Name,
					namespace: namespace,
					group:     group,
					cached:    cached,
				})
			}

			if len(requests) == 0 {
				continue
			}

			groupsRequested += len(requests)
			fetched := m.fetchPBSBackupSnapshots(ctx, client, instanceName, requests)
			if len(fetched) > 0 {
				allBackups = append(allBackups, fetched...)
			}

			// Record fetch time for each requested group so the TTL tracks freshness.
			// We record for all requested groups: on fetch failure, fetchPBSBackupSnapshots
			// falls back to cached data, so the timestamp prevents hammering a failing
			// endpoint. The TTL ensures we retry within a bounded window.
			fetchedAt := time.Now()
			for _, req := range requests {
				reqKey := pbsBackupGroupKey{
					datastore:  req.datastore,
					namespace:  req.namespace,
					backupType: req.group.BackupType,
					backupID:   req.group.BackupID,
				}
				m.setPBSBackupCacheTime(instanceName, reqKey, fetchedAt)
			}
		}

		if datastoreHadSuccess {
			datastoreFetches++
			log.Info().
				Str("instance", instanceName).
				Str("datastore", ds.Name).
				Int("namespaces", len(namespacePaths)).
				Int("groups_reused", groupsReused).
				Int("groups_refreshed", groupsRequested).
				Msg("PBS datastore processed")
		} else {
			// Preserve cached data for this datastore if we couldn't fetch anything new.
			log.Warn().
				Str("instance", instanceName).
				Str("datastore", ds.Name).
				Msg("No namespaces succeeded for PBS datastore; using cached backups")
			for key, entry := range existingGroups {
				if key.datastore != ds.Name || len(entry.snapshots) == 0 {
					continue
				}
				allBackups = append(allBackups, entry.snapshots...)
			}
			datastoreErrors++
		}
	}

	log.Info().
		Str("instance", instanceName).
		Int("count", len(allBackups)).
		Msg("PBS backups fetched")

	// Decide whether to keep existing backups when all queries failed
	if shouldPreservePBSBackups(datastoreCount, datastoreFetches) {
		log.Warn().
			Str("instance", instanceName).
			Int("datastores", datastoreCount).
			Int("errors", datastoreErrors).
			Msg("All PBS datastore queries failed; keeping previous backup list")
		return
	}

	// Update state
	m.state.UpdatePBSBackups(instanceName, allBackups)

	// Sync backup times to VMs/Containers for backup status indicators
	m.state.SyncGuestBackupTimes()

	if m.alertManager != nil {
		snapshot := m.state.GetSnapshot()
		guestsByKey, guestsByVMID := buildGuestLookups(snapshot, m.guestMetadataStore)
		pveStorage := snapshot.Backups.PVE.StorageBackups
		if len(pveStorage) == 0 && len(snapshot.PVEBackups.StorageBackups) > 0 {
			pveStorage = snapshot.PVEBackups.StorageBackups
		}
		pbsBackups := snapshot.Backups.PBS
		if len(pbsBackups) == 0 && len(snapshot.PBSBackups) > 0 {
			pbsBackups = snapshot.PBSBackups
		}
		pmgBackups := snapshot.Backups.PMG
		if len(pmgBackups) == 0 && len(snapshot.PMGBackups) > 0 {
			pmgBackups = snapshot.PMGBackups
		}
		m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)
	}

	// Immediately broadcast the updated state so frontend sees new backups
	m.broadcastStateUpdate()
}

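// buildPBSBackupCache indexes the instance's currently known PBS backups by
// (datastore, namespace, backup type, backup ID) and records the newest backup
// time per group, so pollPBSBackups can decide which groups need re-fetching.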
func (m *Monitor) buildPBSBackupCache(instanceName string) map[pbsBackupGroupKey]cachedPBSGroup {
	snapshot := m.state.GetSnapshot()
	cache := make(map[pbsBackupGroupKey]cachedPBSGroup)
	for _, backup := range snapshot.PBSBackups {
		if backup.Instance != instanceName {
			continue
		}
		key := pbsBackupGroupKey{
			datastore:  backup.Datastore,
			namespace:  normalizePBSNamespacePath(backup.Namespace),
			backupType: backup.BackupType,
			backupID:   backup.VMID,
		}
		entry := cache[key]
		entry.snapshots = append(entry.snapshots, backup)
		if backup.BackupTime.After(entry.latest) {
			entry.latest = backup.BackupTime
		}
		cache[key] = entry
	}
	return cache
}

// pbsBackupCacheTimeFor returns the last fetch time for a PBS backup group.
func (m *Monitor) pbsBackupCacheTimeFor(instanceName string, key pbsBackupGroupKey) time.Time {
	m.mu.RLock()
	defer m.mu.RUnlock()
	if perGroup, ok := m.pbsBackupCacheTime[instanceName]; ok {
		return perGroup[key]
	}
	return time.Time{}
}

// setPBSBackupCacheTime records when a PBS backup group was last fetched.
func (m *Monitor) setPBSBackupCacheTime(instanceName string, key pbsBackupGroupKey, t time.Time) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.pbsBackupCacheTime == nil {
		m.pbsBackupCacheTime = make(map[string]map[pbsBackupGroupKey]time.Time)
	}
	if m.pbsBackupCacheTime[instanceName] == nil {
		m.pbsBackupCacheTime[instanceName] = make(map[pbsBackupGroupKey]time.Time)
	}
	m.pbsBackupCacheTime[instanceName][key] = t
}

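// normalizePBSNamespacePath maps the root namespace ("/") to the empty string
// so cache keys and namespace API parameters use a single canonical form.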
func normalizePBSNamespacePath(ns string) string {
	if ns == "/" {
		return ""
	}
	return ns
}

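// namespacePathsForDatastore returns the de-duplicated namespace paths
// configured for a datastore, falling back to the root namespace ("") when
// none are listed.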
func namespacePathsForDatastore(ds models.PBSDatastore) []string {
	if len(ds.Namespaces) == 0 {
		return []string{""}
	}

	seen := make(map[string]struct{}, len(ds.Namespaces))
	var paths []string
	for _, ns := range ds.Namespaces {
		path := normalizePBSNamespacePath(ns.Path)
		if _, ok := seen[path]; ok {
			continue
		}
		seen[path] = struct{}{}
		paths = append(paths, path)
	}
	return paths
}

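// fetchPBSBackupSnapshots lists snapshots for the requested backup groups
// concurrently, bounded by a small semaphore (five in-flight listings) so a
// large PBS instance is not flooded with requests. When a listing fails, the
// group's previously cached snapshots are reused instead of being dropped.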
func (m *Monitor) fetchPBSBackupSnapshots(ctx context.Context, client *pbs.Client, instanceName string, requests []pbsBackupFetchRequest) []models.PBSBackup {
	if len(requests) == 0 {
		return nil
	}

	results := make(chan []models.PBSBackup, len(requests))
	var wg sync.WaitGroup
	sem := make(chan struct{}, 5)

	for _, req := range requests {
		req := req
		wg.Add(1)
		go func() {
			defer wg.Done()

			select {
			case sem <- struct{}{}:
			case <-ctx.Done():
				return
			}
			defer func() { <-sem }()

			log.Debug().
				Str("instance", instanceName).
				Str("datastore", req.datastore).
				Str("namespace", req.namespace).
				Str("type", req.group.BackupType).
				Str("id", req.group.BackupID).
				Msg("Refreshing PBS backup group")

			snapshots, err := client.ListBackupSnapshots(ctx, req.datastore, req.namespace, req.group.BackupType, req.group.BackupID)
			if err != nil {
				log.Error().
					Err(err).
					Str("instance", instanceName).
					Str("datastore", req.datastore).
					Str("namespace", req.namespace).
					Str("type", req.group.BackupType).
					Str("id", req.group.BackupID).
					Msg("Failed to list PBS backup snapshots")

				if len(req.cached.snapshots) > 0 {
					results <- req.cached.snapshots
				}
				return
			}

			results <- convertPBSSnapshots(instanceName, req.datastore, req.namespace, snapshots)
		}()
	}

	go func() {
		wg.Wait()
		close(results)
	}()

	var combined []models.PBSBackup
	for backups := range results {
		if len(backups) == 0 {
			continue
		}
		combined = append(combined, backups...)
	}

	return combined
}

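// convertPBSSnapshots converts raw PBS snapshot listings into models.PBSBackup
// entries, extracting file names and interpreting the verification field,
// which PBS may return either as a plain string or as an object with a
// "state" key.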
func convertPBSSnapshots(instanceName, datastore, namespace string, snapshots []pbs.BackupSnapshot) []models.PBSBackup {
	backups := make([]models.PBSBackup, 0, len(snapshots))
	for _, snapshot := range snapshots {
		backupTime := time.Unix(snapshot.BackupTime, 0)
		id := fmt.Sprintf("pbs-%s-%s-%s-%s-%s-%d",
			instanceName, datastore, namespace,
			snapshot.BackupType, snapshot.BackupID,
			snapshot.BackupTime)

		var fileNames []string
		for _, file := range snapshot.Files {
			switch f := file.(type) {
			case string:
				fileNames = append(fileNames, f)
			case map[string]interface{}:
				if filename, ok := f["filename"].(string); ok {
					fileNames = append(fileNames, filename)
				}
			}
		}

		verified := false
		if snapshot.Verification != nil {
			switch v := snapshot.Verification.(type) {
			case string:
				verified = v == "ok"
			case map[string]interface{}:
				if state, ok := v["state"].(string); ok {
					verified = state == "ok"
				}
			}

			log.Debug().
				Str("vmid", snapshot.BackupID).
				Int64("time", snapshot.BackupTime).
				Interface("verification", snapshot.Verification).
				Bool("verified", verified).
				Msg("PBS backup verification status")
		}

		backups = append(backups, models.PBSBackup{
			ID:         id,
			Instance:   instanceName,
			Datastore:  datastore,
			Namespace:  namespace,
			BackupType: snapshot.BackupType,
			VMID:       snapshot.BackupID,
			BackupTime: backupTime,
			Size:       snapshot.Size,
			Protected:  snapshot.Protected,
			Verified:   verified,
			Comment:    snapshot.Comment,
			Files:      fileNames,
			Owner:      snapshot.Owner,
		})
	}

	return backups
}

// checkMockAlerts checks alerts for mock data.
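// It mirrors the live polling path (alert cleanup, backup checks, then guest,
// node, storage, PBS, and PMG checks) but caps guest checks at
// maxGuestsPerCycle per invocation so large mock datasets cannot stall the loop.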
func (m *Monitor) checkMockAlerts() {
	defer recoverFromPanic("checkMockAlerts")

	log.Info().Bool("mockEnabled", mock.IsMockEnabled()).Msg("checkMockAlerts called")
	if !mock.IsMockEnabled() {
		log.Info().Msg("Mock mode not enabled, skipping mock alert check")
		return
	}

	// Get mock state
	state := mock.GetMockState()

	log.Info().
		Int("vms", len(state.VMs)).
		Int("containers", len(state.Containers)).
		Int("nodes", len(state.Nodes)).
		Msg("Checking alerts for mock data")

	// Clean up alerts for nodes that no longer exist
	existingNodes := make(map[string]bool)
	for _, node := range state.Nodes {
		existingNodes[node.Name] = true
		if node.Host != "" {
			existingNodes[node.Host] = true
		}
	}
	for _, pbsInst := range state.PBSInstances {
		existingNodes[pbsInst.Name] = true
		existingNodes["pbs-"+pbsInst.Name] = true
		if pbsInst.Host != "" {
			existingNodes[pbsInst.Host] = true
		}
	}
	log.Info().
		Int("trackedNodes", len(existingNodes)).
		Msg("Collecting resources for alert cleanup in mock mode")
	m.alertManager.CleanupAlertsForNodes(existingNodes)

	guestsByKey, guestsByVMID := buildGuestLookups(state, m.guestMetadataStore)
	pveStorage := state.Backups.PVE.StorageBackups
	if len(pveStorage) == 0 && len(state.PVEBackups.StorageBackups) > 0 {
		pveStorage = state.PVEBackups.StorageBackups
	}
	pbsBackups := state.Backups.PBS
	if len(pbsBackups) == 0 && len(state.PBSBackups) > 0 {
		pbsBackups = state.PBSBackups
	}
	pmgBackups := state.Backups.PMG
	if len(pmgBackups) == 0 && len(state.PMGBackups) > 0 {
		pmgBackups = state.PMGBackups
	}
	m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)

	// Limit how many guests we check per cycle to prevent blocking with large datasets
	const maxGuestsPerCycle = 50
	guestsChecked := 0

	// Check alerts for VMs (up to limit)
	for _, vm := range state.VMs {
		if guestsChecked >= maxGuestsPerCycle {
			log.Debug().
				Int("checked", guestsChecked).
				Int("total", len(state.VMs)+len(state.Containers)).
				Msg("Reached guest check limit for this cycle")
			break
		}
		m.alertManager.CheckGuest(vm, "mock")
		guestsChecked++
	}

	// Check alerts for containers (if we haven't hit the limit)
	for _, container := range state.Containers {
		if guestsChecked >= maxGuestsPerCycle {
			break
		}
		m.alertManager.CheckGuest(container, "mock")
		guestsChecked++
	}

	// Check alerts for each node
	for _, node := range state.Nodes {
		m.alertManager.CheckNode(node)
	}

	// Check alerts for storage
	log.Info().Int("storageCount", len(state.Storage)).Msg("Checking storage alerts")
	for _, storage := range state.Storage {
		log.Debug().
			Str("name", storage.Name).
			Float64("usage", storage.Usage).
			Msg("Checking storage for alerts")
		m.alertManager.CheckStorage(storage)
	}

	// Check alerts for PBS instances
	log.Info().Int("pbsCount", len(state.PBSInstances)).Msg("Checking PBS alerts")
	for _, pbsInst := range state.PBSInstances {
		m.alertManager.CheckPBS(pbsInst)
	}

	// Check alerts for PMG instances
	log.Info().Int("pmgCount", len(state.PMGInstances)).Msg("Checking PMG alerts")
	for _, pmgInst := range state.PMGInstances {
		m.alertManager.CheckPMG(pmgInst)
	}

	// Cache the latest alert snapshots directly in the mock data so the API can serve
	// mock state without needing to grab the alert manager lock again.
	mock.UpdateAlertSnapshots(m.alertManager.GetActiveAlerts(), m.alertManager.GetRecentlyResolved())
}

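// isLegacyHostAgent reports whether a host agent report came from a legacy
// standalone agent rather than the unified agent (which sets type "unified").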
func isLegacyHostAgent(agentType string) bool {
	// Unified agent reports type="unified"
	// Legacy standalone agents have empty type
	return agentType != "unified"
}

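// isLegacyDockerAgent reports whether a Docker agent report came from a legacy
// standalone agent rather than the unified agent (which sets type "unified").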
func isLegacyDockerAgent(agentType string) bool {
	// Unified agent reports type="unified"
	// Legacy standalone agents have empty type
	return agentType != "unified"
}

// convertAgentSMARTToModels converts agent report S.M.A.R.T. data to the models.HostDiskSMART format.
func convertAgentSMARTToModels(smart []agentshost.DiskSMART) []models.HostDiskSMART {
	if len(smart) == 0 {
		return nil
	}
	result := make([]models.HostDiskSMART, 0, len(smart))
	for _, disk := range smart {
		entry := models.HostDiskSMART{
			Device:      disk.Device,
			Model:       disk.Model,
			Serial:      disk.Serial,
			WWN:         disk.WWN,
			Type:        disk.Type,
			Temperature: disk.Temperature,
			Health:      disk.Health,
			Standby:     disk.Standby,
		}
		if disk.Attributes != nil {
			entry.Attributes = convertAgentSMARTAttributes(disk.Attributes)
		}
		result = append(result, entry)
	}
	return result
}

// convertAgentSMARTAttributes converts agent SMARTAttributes to models SMARTAttributes.
func convertAgentSMARTAttributes(src *agentshost.SMARTAttributes) *models.SMARTAttributes {
	if src == nil {
		return nil
	}
	return &models.SMARTAttributes{
		PowerOnHours:         src.PowerOnHours,
		PowerCycles:          src.PowerCycles,
		ReallocatedSectors:   src.ReallocatedSectors,
		PendingSectors:       src.PendingSectors,
		OfflineUncorrectable: src.OfflineUncorrectable,
		UDMACRCErrors:        src.UDMACRCErrors,
		PercentageUsed:       src.PercentageUsed,
		AvailableSpare:       src.AvailableSpare,
		MediaErrors:          src.MediaErrors,
		UnsafeShutdowns:      src.UnsafeShutdowns,
	}
}

// convertAgentCephToModels converts agent report Ceph data to the models.HostCephCluster format.
func convertAgentCephToModels(ceph *agentshost.CephCluster) *models.HostCephCluster {
	if ceph == nil {
		return nil
	}

	collectedAt, _ := time.Parse(time.RFC3339, ceph.CollectedAt)

	result := &models.HostCephCluster{
		FSID: ceph.FSID,
		Health: models.HostCephHealth{
			Status: ceph.Health.Status,
			Checks: make(map[string]models.HostCephCheck),
		},
		MonMap: models.HostCephMonitorMap{
			Epoch:   ceph.MonMap.Epoch,
			NumMons: ceph.MonMap.NumMons,
		},
		MgrMap: models.HostCephManagerMap{
			Available: ceph.MgrMap.Available,
			NumMgrs:   ceph.MgrMap.NumMgrs,
			ActiveMgr: ceph.MgrMap.ActiveMgr,
			Standbys:  ceph.MgrMap.Standbys,
		},
		OSDMap: models.HostCephOSDMap{
			Epoch:   ceph.OSDMap.Epoch,
			NumOSDs: ceph.OSDMap.NumOSDs,
			NumUp:   ceph.OSDMap.NumUp,
			NumIn:   ceph.OSDMap.NumIn,
			NumDown: ceph.OSDMap.NumDown,
			NumOut:  ceph.OSDMap.NumOut,
		},
		PGMap: models.HostCephPGMap{
			NumPGs:           ceph.PGMap.NumPGs,
			BytesTotal:       ceph.PGMap.BytesTotal,
			BytesUsed:        ceph.PGMap.BytesUsed,
			BytesAvailable:   ceph.PGMap.BytesAvailable,
			DataBytes:        ceph.PGMap.DataBytes,
			UsagePercent:     ceph.PGMap.UsagePercent,
			DegradedRatio:    ceph.PGMap.DegradedRatio,
			MisplacedRatio:   ceph.PGMap.MisplacedRatio,
			ReadBytesPerSec:  ceph.PGMap.ReadBytesPerSec,
			WriteBytesPerSec: ceph.PGMap.WriteBytesPerSec,
			ReadOpsPerSec:    ceph.PGMap.ReadOpsPerSec,
			WriteOpsPerSec:   ceph.PGMap.WriteOpsPerSec,
		},
		CollectedAt: collectedAt,
	}

	// Convert monitors
	for _, mon := range ceph.MonMap.Monitors {
		result.MonMap.Monitors = append(result.MonMap.Monitors, models.HostCephMonitor{
			Name:   mon.Name,
			Rank:   mon.Rank,
			Addr:   mon.Addr,
			Status: mon.Status,
		})
	}

	// Convert health checks
	for name, check := range ceph.Health.Checks {
		result.Health.Checks[name] = models.HostCephCheck{
			Severity: check.Severity,
			Message:  check.Message,
			Detail:   check.Detail,
		}
	}

	// Convert health summary
	for _, s := range ceph.Health.Summary {
		result.Health.Summary = append(result.Health.Summary, models.HostCephHealthSummary{
			Severity: s.Severity,
			Message:  s.Message,
		})
	}

	// Convert pools
	for _, pool := range ceph.Pools {
		result.Pools = append(result.Pools, models.HostCephPool{
			ID:             pool.ID,
			Name:           pool.Name,
			BytesUsed:      pool.BytesUsed,
			BytesAvailable: pool.BytesAvailable,
			Objects:        pool.Objects,
			PercentUsed:    pool.PercentUsed,
		})
	}

	// Convert services
	for _, svc := range ceph.Services {
		result.Services = append(result.Services, models.HostCephService{
			Type:    svc.Type,
			Running: svc.Running,
			Total:   svc.Total,
			Daemons: svc.Daemons,
		})
	}

	return result
}

// convertAgentCephToGlobalCluster converts agent Ceph data to the global CephCluster format
// used by the State.CephClusters list.
func convertAgentCephToGlobalCluster(ceph *agentshost.CephCluster, hostname, hostID string, timestamp time.Time) models.CephCluster {
	// Use FSID as the primary ID since it's unique per Ceph cluster
	id := ceph.FSID
	if id == "" {
		id = "agent-ceph-" + hostID
	}

	cluster := models.CephCluster{
		ID:             id,
		Instance:       "agent:" + hostname,
		Name:           hostname + " Ceph",
		FSID:           ceph.FSID,
		Health:         strings.TrimPrefix(ceph.Health.Status, "HEALTH_"),
		TotalBytes:     int64(ceph.PGMap.BytesTotal),
		UsedBytes:      int64(ceph.PGMap.BytesUsed),
		AvailableBytes: int64(ceph.PGMap.BytesAvailable),
		UsagePercent:   ceph.PGMap.UsagePercent,
		NumMons:        ceph.MonMap.NumMons,
		NumMgrs:        ceph.MgrMap.NumMgrs,
		NumOSDs:        ceph.OSDMap.NumOSDs,
		NumOSDsUp:      ceph.OSDMap.NumUp,
		NumOSDsIn:      ceph.OSDMap.NumIn,
		NumPGs:         ceph.PGMap.NumPGs,
		LastUpdated:    timestamp,
	}

	// Build health message from checks
	var healthMessages []string
	for _, check := range ceph.Health.Checks {
		if check.Message != "" {
			healthMessages = append(healthMessages, check.Message)
		}
	}
	if len(healthMessages) > 0 {
		cluster.HealthMessage = strings.Join(healthMessages, "; ")
	}

	// Convert pools
	for _, pool := range ceph.Pools {
		cluster.Pools = append(cluster.Pools, models.CephPool{
			ID:             pool.ID,
			Name:           pool.Name,
			StoredBytes:    int64(pool.BytesUsed),
			AvailableBytes: int64(pool.BytesAvailable),
			Objects:        int64(pool.Objects),
			PercentUsed:    pool.PercentUsed,
		})
	}

	// Convert services
	for _, svc := range ceph.Services {
		cluster.Services = append(cluster.Services, models.CephServiceStatus{
			Type:    svc.Type,
			Running: svc.Running,
			Total:   svc.Total,
		})
	}

	return cluster
}