package monitoring

import (
	"context"
	"crypto/sha1"
	"encoding/hex"
	"encoding/json"
	stderrors "errors"
	"fmt"
	"math"
	"math/rand"
	"net"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"
	"unicode"

	"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
	"github.com/rcourtman/pulse-go-rewrite/internal/config"
	"github.com/rcourtman/pulse-go-rewrite/internal/discovery"
	"github.com/rcourtman/pulse-go-rewrite/internal/errors"
	"github.com/rcourtman/pulse-go-rewrite/internal/logging"
	"github.com/rcourtman/pulse-go-rewrite/internal/mock"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/internal/notifications"
	"github.com/rcourtman/pulse-go-rewrite/internal/system"
	"github.com/rcourtman/pulse-go-rewrite/internal/tempproxy"
	"github.com/rcourtman/pulse-go-rewrite/internal/types"
	"github.com/rcourtman/pulse-go-rewrite/internal/websocket"
	agentsdocker "github.com/rcourtman/pulse-go-rewrite/pkg/agents/docker"
	agentshost "github.com/rcourtman/pulse-go-rewrite/pkg/agents/host"
	"github.com/rcourtman/pulse-go-rewrite/pkg/pbs"
	"github.com/rcourtman/pulse-go-rewrite/pkg/pmg"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

const (
	defaultTaskTimeout = 90 * time.Second
	minTaskTimeout     = 30 * time.Second
	maxTaskTimeout     = 3 * time.Minute
)

// PVEClientInterface defines the interface for PVE clients (both regular and cluster)
type PVEClientInterface interface {
	GetNodes(ctx context.Context) ([]proxmox.Node, error)
	GetNodeStatus(ctx context.Context, node string) (*proxmox.NodeStatus, error)
	GetNodeRRDData(ctx context.Context, node string, timeframe string, cf string, ds []string) ([]proxmox.NodeRRDPoint, error)
	GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe string, cf string, ds []string) ([]proxmox.GuestRRDPoint, error)
	GetVMs(ctx context.Context, node string) ([]proxmox.VM, error)
	GetContainers(ctx context.Context, node string) ([]proxmox.Container, error)
	GetStorage(ctx context.Context, node string) ([]proxmox.Storage, error)
	GetAllStorage(ctx context.Context) ([]proxmox.Storage, error)
	GetBackupTasks(ctx context.Context) ([]proxmox.Task, error)
	GetReplicationStatus(ctx context.Context) ([]proxmox.ReplicationJob, error)
	GetStorageContent(ctx context.Context, node, storage string) ([]proxmox.StorageContent, error)
	GetVMSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error)
	GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error)
	GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error)
	GetContainerStatus(ctx context.Context, node string, vmid int) (*proxmox.Container, error)
	GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error)
	GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.ContainerInterface, error)
	GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error)
	IsClusterMember(ctx context.Context) (bool, error)
	GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error)
	GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error)
	GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error)
	GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error)
	GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error)
	GetZFSPoolsWithDetails(ctx context.Context, node string) ([]proxmox.ZFSPoolInfo, error)
	GetDisks(ctx context.Context, node string) ([]proxmox.Disk, error)
	GetCephStatus(ctx context.Context) (*proxmox.CephStatus, error)
	GetCephDF(ctx context.Context) (*proxmox.CephDF, error)
}

func getNodeDisplayName(instance *config.PVEInstance, nodeName string) string {
	baseName := strings.TrimSpace(nodeName)
	if baseName == "" {
		baseName = "unknown-node"
	}

	if instance == nil {
		return baseName
	}

	friendly := strings.TrimSpace(instance.Name)

	if instance.IsCluster {
		if endpointLabel := lookupClusterEndpointLabel(instance, nodeName); endpointLabel != "" {
			return endpointLabel
		}

		if baseName != "" && baseName != "unknown-node" {
			return baseName
		}

		if friendly != "" {
			return friendly
		}

		return baseName
	}

	if friendly != "" {
		return friendly
	}

	if baseName != "" && baseName != "unknown-node" {
		return baseName
	}

	if label := normalizeEndpointHost(instance.Host); label != "" && !isLikelyIPAddress(label) {
		return label
	}

	return baseName
}
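
// Illustrative precedence (hostnames hypothetical): for a cluster member whose
// endpoint host is "pve2.example.lan" the endpoint label wins; for a standalone
// node the instance's friendly name is preferred, then the node name, then a
// non-IP host label derived from instance.Host.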

func (m *Monitor) getInstanceConfig(instanceName string) *config.PVEInstance {
	if m == nil || m.config == nil {
		return nil
	}
	for i := range m.config.PVEInstances {
		if strings.EqualFold(m.config.PVEInstances[i].Name, instanceName) {
			return &m.config.PVEInstances[i]
		}
	}
	return nil
}

func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) []models.PhysicalDisk {
	if len(disks) == 0 || len(nodes) == 0 {
		return disks
	}

	// Build temperature maps by node for both SMART and legacy NVMe data
	smartTempsByNode := make(map[string][]models.DiskTemp)
	nvmeTempsByNode := make(map[string][]models.NVMeTemp)

	for _, node := range nodes {
		if node.Temperature == nil || !node.Temperature.Available {
			continue
		}

		// Collect SMART temps (preferred source)
		if len(node.Temperature.SMART) > 0 {
			temps := make([]models.DiskTemp, len(node.Temperature.SMART))
			copy(temps, node.Temperature.SMART)
			smartTempsByNode[node.Name] = temps
		}

		// Collect legacy NVMe temps as fallback
		if len(node.Temperature.NVMe) > 0 {
			temps := make([]models.NVMeTemp, len(node.Temperature.NVMe))
			copy(temps, node.Temperature.NVMe)
			sort.Slice(temps, func(i, j int) bool {
				return temps[i].Device < temps[j].Device
			})
			nvmeTempsByNode[node.Name] = temps
		}
	}

	if len(smartTempsByNode) == 0 && len(nvmeTempsByNode) == 0 {
		return disks
	}

	updated := make([]models.PhysicalDisk, len(disks))
	copy(updated, disks)

	// Process SMART temperatures first (preferred method)
	for i := range updated {
		smartTemps, ok := smartTempsByNode[updated[i].Node]
		if !ok || len(smartTemps) == 0 {
			continue
		}

		// Try to match by WWN (most reliable)
		if updated[i].WWN != "" {
			for _, temp := range smartTemps {
				if temp.WWN != "" && strings.EqualFold(temp.WWN, updated[i].WWN) {
					if temp.Temperature > 0 && !temp.StandbySkipped {
						updated[i].Temperature = temp.Temperature
						log.Debug().
							Str("disk", updated[i].DevPath).
							Str("wwn", updated[i].WWN).
							Int("temp", temp.Temperature).
							Msg("Matched SMART temperature by WWN")
					}
					continue
				}
			}
		}

		// Fall back to serial number match (case-insensitive)
		if updated[i].Serial != "" && updated[i].Temperature == 0 {
			for _, temp := range smartTemps {
				if temp.Serial != "" && strings.EqualFold(temp.Serial, updated[i].Serial) {
					if temp.Temperature > 0 && !temp.StandbySkipped {
						updated[i].Temperature = temp.Temperature
						log.Debug().
							Str("disk", updated[i].DevPath).
							Str("serial", updated[i].Serial).
							Int("temp", temp.Temperature).
							Msg("Matched SMART temperature by serial")
					}
					continue
				}
			}
		}

		// Last resort: match by device path (normalized)
		if updated[i].Temperature == 0 {
			normalizedDevPath := strings.TrimPrefix(updated[i].DevPath, "/dev/")
			for _, temp := range smartTemps {
				normalizedTempDev := strings.TrimPrefix(temp.Device, "/dev/")
				if normalizedTempDev == normalizedDevPath {
					if temp.Temperature > 0 && !temp.StandbySkipped {
						updated[i].Temperature = temp.Temperature
						log.Debug().
							Str("disk", updated[i].DevPath).
							Int("temp", temp.Temperature).
							Msg("Matched SMART temperature by device path")
					}
					break
				}
			}
		}
	}

	// Process legacy NVMe temperatures for disks that didn't get SMART data
	disksByNode := make(map[string][]int)
	for i := range updated {
		if strings.EqualFold(updated[i].Type, "nvme") && updated[i].Temperature == 0 {
			disksByNode[updated[i].Node] = append(disksByNode[updated[i].Node], i)
		}
	}

	for nodeName, diskIndexes := range disksByNode {
		temps, ok := nvmeTempsByNode[nodeName]
		if !ok || len(temps) == 0 {
			continue
		}

		sort.Slice(diskIndexes, func(i, j int) bool {
			return updated[diskIndexes[i]].DevPath < updated[diskIndexes[j]].DevPath
		})

		for idx, diskIdx := range diskIndexes {
			if idx >= len(temps) {
				break
			}

			tempVal := temps[idx].Temp
			if tempVal <= 0 || math.IsNaN(tempVal) {
				continue
			}

			updated[diskIdx].Temperature = int(math.Round(tempVal))
			log.Debug().
				Str("disk", updated[diskIdx].DevPath).
				Int("temp", updated[diskIdx].Temperature).
				Msg("Matched legacy NVMe temperature by index")
		}
	}

	return updated
}

func lookupClusterEndpointLabel(instance *config.PVEInstance, nodeName string) string {
	if instance == nil {
		return ""
	}

	for _, endpoint := range instance.ClusterEndpoints {
		if !strings.EqualFold(endpoint.NodeName, nodeName) {
			continue
		}

		if host := strings.TrimSpace(endpoint.Host); host != "" {
			if label := normalizeEndpointHost(host); label != "" && !isLikelyIPAddress(label) {
				return label
			}
		}

		if nodeNameLabel := strings.TrimSpace(endpoint.NodeName); nodeNameLabel != "" {
			return nodeNameLabel
		}

		if ip := strings.TrimSpace(endpoint.IP); ip != "" {
			return ip
		}
	}

	return ""
}

func normalizeEndpointHost(raw string) string {
	value := strings.TrimSpace(raw)
	if value == "" {
		return ""
	}

	if parsed, err := url.Parse(value); err == nil && parsed.Host != "" {
		host := parsed.Hostname()
		if host != "" {
			return host
		}
		return parsed.Host
	}

	value = strings.TrimPrefix(value, "https://")
	value = strings.TrimPrefix(value, "http://")
	value = strings.TrimSpace(value)
	if value == "" {
		return ""
	}

	if idx := strings.Index(value, "/"); idx >= 0 {
		value = strings.TrimSpace(value[:idx])
	}

	if idx := strings.Index(value, ":"); idx >= 0 {
		value = strings.TrimSpace(value[:idx])
	}

	return value
}

func isLikelyIPAddress(value string) bool {
	if value == "" {
		return false
	}

	if ip := net.ParseIP(value); ip != nil {
		return true
	}

	// Handle IPv6 with zone identifier (fe80::1%eth0)
	if i := strings.Index(value, "%"); i > 0 {
		if ip := net.ParseIP(value[:i]); ip != nil {
			return true
		}
	}

	return false
}
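
// Illustrative results (hypothetical inputs): "192.168.1.10" and "fe80::1%eth0"
// are treated as IP addresses, while "pve1.example.lan" is not and may therefore
// be used as a display label.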

func ensureClusterEndpointURL(raw string) string {
	value := strings.TrimSpace(raw)
	if value == "" {
		return ""
	}

	lower := strings.ToLower(value)
	if strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://") {
		return value
	}

	if _, _, err := net.SplitHostPort(value); err == nil {
		return "https://" + value
	}

	return "https://" + net.JoinHostPort(value, "8006")
}
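
// Illustrative mappings (hostnames hypothetical): "pve1" becomes
// "https://pve1:8006", "pve1:443" becomes "https://pve1:443", and a value that
// already carries a scheme, such as "https://pve1:8006", is returned unchanged.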

func clusterEndpointEffectiveURL(endpoint config.ClusterEndpoint, verifySSL bool, hasFingerprint bool) string {
	// When TLS hostname verification is required (VerifySSL=true and no fingerprint),
	// prefer hostname over IP to ensure certificate CN/SAN validation works correctly.
	// When TLS is not verified (VerifySSL=false) or a fingerprint is provided (which
	// bypasses hostname checks), prefer IP to reduce DNS lookups (refs #620).
	requiresHostnameForTLS := verifySSL && !hasFingerprint

	if requiresHostnameForTLS {
		// Prefer hostname for proper TLS certificate validation
		if endpoint.Host != "" {
			return ensureClusterEndpointURL(endpoint.Host)
		}
		if endpoint.IP != "" {
			return ensureClusterEndpointURL(endpoint.IP)
		}
	} else {
		// Prefer IP address to avoid excessive DNS lookups
		if endpoint.IP != "" {
			return ensureClusterEndpointURL(endpoint.IP)
		}
		if endpoint.Host != "" {
			return ensureClusterEndpointURL(endpoint.Host)
		}
	}
	return ""
}

// PollExecutor defines the contract for executing polling tasks.
type PollExecutor interface {
	Execute(ctx context.Context, task PollTask)
}

type realExecutor struct {
	monitor *Monitor
}

func newRealExecutor(m *Monitor) PollExecutor {
	return &realExecutor{monitor: m}
}

func (r *realExecutor) Execute(ctx context.Context, task PollTask) {
	if r == nil || r.monitor == nil {
		return
	}

	switch strings.ToLower(task.InstanceType) {
	case "pve":
		if task.PVEClient == nil {
			log.Warn().
				Str("instance", task.InstanceName).
				Msg("PollExecutor received nil PVE client")
			return
		}
		r.monitor.pollPVEInstance(ctx, task.InstanceName, task.PVEClient)
	case "pbs":
		if task.PBSClient == nil {
			log.Warn().
				Str("instance", task.InstanceName).
				Msg("PollExecutor received nil PBS client")
			return
		}
		r.monitor.pollPBSInstance(ctx, task.InstanceName, task.PBSClient)
	case "pmg":
		if task.PMGClient == nil {
			log.Warn().
				Str("instance", task.InstanceName).
				Msg("PollExecutor received nil PMG client")
			return
		}
		r.monitor.pollPMGInstance(ctx, task.InstanceName, task.PMGClient)
	default:
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().
				Str("instance", task.InstanceName).
				Str("type", task.InstanceType).
				Msg("PollExecutor received unsupported task type")
		}
	}
}

type instanceInfo struct {
	Key         string
	Type        InstanceType
	DisplayName string
	Connection  string
	Metadata    map[string]string
}

type pollStatus struct {
	LastSuccess         time.Time
	LastErrorAt         time.Time
	LastErrorMessage    string
	LastErrorCategory   string
	ConsecutiveFailures int
	FirstFailureAt      time.Time
}

type dlqInsight struct {
	Reason       string
	FirstAttempt time.Time
	LastAttempt  time.Time
	RetryCount   int
	NextRetry    time.Time
}

type ErrorDetail struct {
	At       time.Time `json:"at"`
	Message  string    `json:"message"`
	Category string    `json:"category"`
}

type InstancePollStatus struct {
	LastSuccess         *time.Time   `json:"lastSuccess,omitempty"`
	LastError           *ErrorDetail `json:"lastError,omitempty"`
	ConsecutiveFailures int          `json:"consecutiveFailures"`
	FirstFailureAt      *time.Time   `json:"firstFailureAt,omitempty"`
}

type InstanceBreaker struct {
	State          string     `json:"state"`
	Since          *time.Time `json:"since,omitempty"`
	LastTransition *time.Time `json:"lastTransition,omitempty"`
	RetryAt        *time.Time `json:"retryAt,omitempty"`
	FailureCount   int        `json:"failureCount"`
}

type InstanceDLQ struct {
	Present      bool       `json:"present"`
	Reason       string     `json:"reason,omitempty"`
	FirstAttempt *time.Time `json:"firstAttempt,omitempty"`
	LastAttempt  *time.Time `json:"lastAttempt,omitempty"`
	RetryCount   int        `json:"retryCount,omitempty"`
	NextRetry    *time.Time `json:"nextRetry,omitempty"`
}

type InstanceHealth struct {
	Key         string             `json:"key"`
	Type        string             `json:"type"`
	DisplayName string             `json:"displayName"`
	Instance    string             `json:"instance"`
	Connection  string             `json:"connection"`
	PollStatus  InstancePollStatus `json:"pollStatus"`
	Breaker     InstanceBreaker    `json:"breaker"`
	DeadLetter  InstanceDLQ        `json:"deadLetter"`
}

func schedulerKey(instanceType InstanceType, name string) string {
	return string(instanceType) + "::" + name
}

func timePtr(t time.Time) *time.Time {
	if t.IsZero() {
		return nil
	}
	copy := t
	return &copy
}

// Monitor handles all monitoring operations
type Monitor struct {
	config                     *config.Config
	state                      *models.State
	pveClients                 map[string]PVEClientInterface
	pbsClients                 map[string]*pbs.Client
	pmgClients                 map[string]*pmg.Client
	pollMetrics                *PollMetrics
	scheduler                  *AdaptiveScheduler
	stalenessTracker           *StalenessTracker
	taskQueue                  *TaskQueue
	pollTimeout                time.Duration
	circuitBreakers            map[string]*circuitBreaker
	deadLetterQueue            *TaskQueue
	failureCounts              map[string]int
	lastOutcome                map[string]taskOutcome
	backoffCfg                 backoffConfig
	rng                        *rand.Rand
	maxRetryAttempts           int
	tempCollector              *TemperatureCollector // SSH-based temperature collector
	guestMetadataStore         *config.GuestMetadataStore
	dockerMetadataStore        *config.DockerMetadataStore
	mu                         sync.RWMutex
	startTime                  time.Time
	rateTracker                *RateTracker
	metricsHistory             *MetricsHistory
	alertManager               *alerts.Manager
	notificationMgr            *notifications.NotificationManager
	configPersist              *config.ConfigPersistence
	discoveryService           *discovery.Service // Background discovery service
	activePollCount            int32              // Number of active polling operations
	pollCounter                int64              // Counter for polling cycles
	authFailures               map[string]int        // Track consecutive auth failures per node
	lastAuthAttempt            map[string]time.Time  // Track last auth attempt time
	lastClusterCheck           map[string]time.Time  // Track last cluster check for standalone nodes
	lastPhysicalDiskPoll       map[string]time.Time  // Track last physical disk poll time per instance
	lastPVEBackupPoll          map[string]time.Time  // Track last PVE backup poll per instance
	lastPBSBackupPoll          map[string]time.Time  // Track last PBS backup poll per instance
	persistence                *config.ConfigPersistence // Add persistence for saving updated configs
	pbsBackupPollers           map[string]bool           // Track PBS backup polling goroutines per instance
	runtimeCtx                 context.Context           // Context used while monitor is running
	wsHub                      *websocket.Hub            // Hub used for broadcasting state
	diagMu                     sync.RWMutex              // Protects diagnostic snapshot maps
	nodeSnapshots              map[string]NodeMemorySnapshot
	guestSnapshots             map[string]GuestMemorySnapshot
	rrdCacheMu                 sync.RWMutex // Protects RRD memavailable cache
	nodeRRDMemCache            map[string]rrdMemCacheEntry
	removedDockerHosts         map[string]time.Time // Track deliberately removed Docker hosts (ID -> removal time)
	dockerTokenBindings        map[string]string    // Track token ID -> agent ID bindings to enforce uniqueness
	hostTokenBindings          map[string]string    // Track token ID -> agent ID bindings to enforce uniqueness
	dockerCommands             map[string]*dockerHostCommand
	dockerCommandIndex         map[string]string
	guestMetadataMu            sync.RWMutex
	guestMetadataCache         map[string]guestMetadataCacheEntry
	guestMetadataLimiterMu     sync.Mutex
	guestMetadataLimiter       map[string]time.Time
	guestMetadataSlots         chan struct{}
	guestMetadataMinRefresh    time.Duration
	guestMetadataRefreshJitter time.Duration
	guestMetadataRetryBackoff  time.Duration
	guestMetadataHoldDuration  time.Duration
	// Configurable guest agent timeouts (refs #592)
	guestAgentFSInfoTimeout  time.Duration
	guestAgentNetworkTimeout time.Duration
	guestAgentOSInfoTimeout  time.Duration
	guestAgentVersionTimeout time.Duration
	guestAgentRetries        int
	executor                 PollExecutor
	breakerBaseRetry         time.Duration
	breakerMaxDelay          time.Duration
	breakerHalfOpenWindow    time.Duration
	instanceInfoCache        map[string]*instanceInfo
	pollStatusMap            map[string]*pollStatus
	dlqInsightMap            map[string]*dlqInsight
	nodeLastOnline           map[string]time.Time // Track last time each node was seen online (for grace period)
}

type rrdMemCacheEntry struct {
	available uint64
	used      uint64
	total     uint64
	fetchedAt time.Time
}

// safePercentage calculates percentage safely, returning 0 if divisor is 0
func safePercentage(used, total float64) float64 {
	if total == 0 {
		return 0
	}
	result := used / total * 100
	if math.IsNaN(result) || math.IsInf(result, 0) {
		return 0
	}
	return result
}

// maxInt64 returns the maximum of two int64 values
func maxInt64(a, b int64) int64 {
	if a > b {
		return a
	}
	return b
}

// safeFloat ensures a float value is not NaN or Inf
func safeFloat(val float64) float64 {
	if math.IsNaN(val) || math.IsInf(val, 0) {
		return 0
	}
	return val
}

// parseDurationEnv parses a duration from an environment variable, returning defaultVal if not set or invalid
func parseDurationEnv(key string, defaultVal time.Duration) time.Duration {
	val := os.Getenv(key)
	if val == "" {
		return defaultVal
	}
	parsed, err := time.ParseDuration(val)
	if err != nil {
		log.Warn().
			Str("key", key).
			Str("value", val).
			Err(err).
			Dur("default", defaultVal).
			Msg("Failed to parse duration from environment variable, using default")
		return defaultVal
	}
	return parsed
}

// parseIntEnv parses an integer from an environment variable, returning defaultVal if not set or invalid
func parseIntEnv(key string, defaultVal int) int {
	val := os.Getenv(key)
	if val == "" {
		return defaultVal
	}
	parsed, err := strconv.Atoi(val)
	if err != nil {
		log.Warn().
			Str("key", key).
			Str("value", val).
			Err(err).
			Int("default", defaultVal).
			Msg("Failed to parse integer from environment variable, using default")
		return defaultVal
	}
	return parsed
}
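
// Illustrative overrides (hypothetical values), presumably consumed through
// these helpers: GUEST_AGENT_FSINFO_TIMEOUT=30s yields 30*time.Second via
// parseDurationEnv, and GUEST_AGENT_RETRIES=2 yields 2 via parseIntEnv; unset
// or unparsable values fall back to the defaults and log a warning.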

func clampUint64ToInt64(val uint64) int64 {
	if val > math.MaxInt64 {
		return math.MaxInt64
	}
	return int64(val)
}

func cloneStringFloatMap(src map[string]float64) map[string]float64 {
	if len(src) == 0 {
		return nil
	}
	out := make(map[string]float64, len(src))
	for k, v := range src {
		out[k] = v
	}
	return out
}

func cloneStringMap(src map[string]string) map[string]string {
	if len(src) == 0 {
		return nil
	}
	out := make(map[string]string, len(src))
	for k, v := range src {
		out[k] = v
	}
	return out
}

func convertDockerServices(services []agentsdocker.Service) []models.DockerService {
	if len(services) == 0 {
		return nil
	}

	result := make([]models.DockerService, 0, len(services))
	for _, svc := range services {
		service := models.DockerService{
			ID:             svc.ID,
			Name:           svc.Name,
			Stack:          svc.Stack,
			Image:          svc.Image,
			Mode:           svc.Mode,
			DesiredTasks:   svc.DesiredTasks,
			RunningTasks:   svc.RunningTasks,
			CompletedTasks: svc.CompletedTasks,
		}

		if len(svc.Labels) > 0 {
			service.Labels = cloneStringMap(svc.Labels)
		}

		if len(svc.EndpointPorts) > 0 {
			ports := make([]models.DockerServicePort, len(svc.EndpointPorts))
			for i, port := range svc.EndpointPorts {
				ports[i] = models.DockerServicePort{
					Name:          port.Name,
					Protocol:      port.Protocol,
					TargetPort:    port.TargetPort,
					PublishedPort: port.PublishedPort,
					PublishMode:   port.PublishMode,
				}
			}
			service.EndpointPorts = ports
		}

		if svc.UpdateStatus != nil {
			update := &models.DockerServiceUpdate{
				State:   svc.UpdateStatus.State,
				Message: svc.UpdateStatus.Message,
			}
			if svc.UpdateStatus.CompletedAt != nil && !svc.UpdateStatus.CompletedAt.IsZero() {
				completed := *svc.UpdateStatus.CompletedAt
				update.CompletedAt = &completed
			}
			service.UpdateStatus = update
		}

		if svc.CreatedAt != nil && !svc.CreatedAt.IsZero() {
			created := *svc.CreatedAt
			service.CreatedAt = &created
		}
		if svc.UpdatedAt != nil && !svc.UpdatedAt.IsZero() {
			updated := *svc.UpdatedAt
			service.UpdatedAt = &updated
		}

		result = append(result, service)
	}

	return result
}

func convertDockerTasks(tasks []agentsdocker.Task) []models.DockerTask {
	if len(tasks) == 0 {
		return nil
	}

	result := make([]models.DockerTask, 0, len(tasks))
	for _, task := range tasks {
		modelTask := models.DockerTask{
			ID:            task.ID,
			ServiceID:     task.ServiceID,
			ServiceName:   task.ServiceName,
			Slot:          task.Slot,
			NodeID:        task.NodeID,
			NodeName:      task.NodeName,
			DesiredState:  task.DesiredState,
			CurrentState:  task.CurrentState,
			Error:         task.Error,
			Message:       task.Message,
			ContainerID:   task.ContainerID,
			ContainerName: task.ContainerName,
			CreatedAt:     task.CreatedAt,
		}

		if task.UpdatedAt != nil && !task.UpdatedAt.IsZero() {
			updated := *task.UpdatedAt
			modelTask.UpdatedAt = &updated
		}
		if task.StartedAt != nil && !task.StartedAt.IsZero() {
			started := *task.StartedAt
			modelTask.StartedAt = &started
		}
		if task.CompletedAt != nil && !task.CompletedAt.IsZero() {
			completed := *task.CompletedAt
			modelTask.CompletedAt = &completed
		}

		result = append(result, modelTask)
	}

	return result
}

func normalizeAgentVersion(version string) string {
	version = strings.TrimSpace(version)
	if version == "" {
		return ""
	}
	version = strings.TrimLeft(version, "vV")
	if version == "" {
		return ""
	}
	return "v" + version
}
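
// Illustrative normalization: "1.2.3", "v1.2.3", and "V1.2.3" all become
// "v1.2.3"; an empty string, or one consisting only of v/V prefixes, yields "".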

func convertDockerSwarmInfo(info *agentsdocker.SwarmInfo) *models.DockerSwarmInfo {
	if info == nil {
		return nil
	}

	return &models.DockerSwarmInfo{
		NodeID:           info.NodeID,
		NodeRole:         info.NodeRole,
		LocalState:       info.LocalState,
		ControlAvailable: info.ControlAvailable,
		ClusterID:        info.ClusterID,
		ClusterName:      info.ClusterName,
		Scope:            info.Scope,
		Error:            info.Error,
	}
}

// shouldRunBackupPoll determines whether a backup polling cycle should execute.
// Returns whether polling should run, a human-readable skip reason, and the timestamp to record.
func (m *Monitor) shouldRunBackupPoll(last time.Time, now time.Time) (bool, string, time.Time) {
	if m == nil || m.config == nil {
		return false, "configuration unavailable", last
	}

	if !m.config.EnableBackupPolling {
		return false, "backup polling globally disabled", last
	}

	interval := m.config.BackupPollingInterval
	if interval > 0 {
		if !last.IsZero() && now.Sub(last) < interval {
			next := last.Add(interval)
			return false, fmt.Sprintf("next run scheduled for %s", next.Format(time.RFC3339)), last
		}
		return true, "", now
	}

	backupCycles := m.config.BackupPollingCycles
	if backupCycles <= 0 {
		backupCycles = 10
	}

	if m.pollCounter%int64(backupCycles) == 0 || m.pollCounter == 1 {
		return true, "", now
	}

	remaining := int64(backupCycles) - (m.pollCounter % int64(backupCycles))
	if remaining <= 0 {
		remaining = int64(backupCycles)
	}
	return false, fmt.Sprintf("next run in %d polling cycles", remaining), last
}
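
// Illustrative schedule (hypothetical config): with BackupPollingInterval unset
// and BackupPollingCycles=10, backups are polled on cycle 1 and then whenever
// pollCounter%10 == 0; with BackupPollingInterval=15m, a run is skipped until
// 15 minutes have elapsed since the recorded timestamp.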

const (
	dockerConnectionPrefix       = "docker-"
	hostConnectionPrefix         = "host-"
	dockerOfflineGraceMultiplier = 4
	dockerMinimumHealthWindow    = 30 * time.Second
	dockerMaximumHealthWindow    = 10 * time.Minute
	hostOfflineGraceMultiplier   = 4
	hostMinimumHealthWindow      = 30 * time.Second
	hostMaximumHealthWindow      = 10 * time.Minute
	nodeOfflineGracePeriod       = 60 * time.Second // Grace period before marking Proxmox nodes offline
	nodeRRDCacheTTL              = 30 * time.Second
	nodeRRDRequestTimeout        = 2 * time.Second
	guestMetadataCacheTTL        = 5 * time.Minute
	defaultGuestMetadataHold     = 15 * time.Second

	// Guest agent timeout defaults (configurable via environment variables)
	// Increased from 3-5s to 10-15s to handle high-load environments better (refs #592)
	defaultGuestAgentFSInfoTimeout  = 15 * time.Second // GUEST_AGENT_FSINFO_TIMEOUT
	defaultGuestAgentNetworkTimeout = 10 * time.Second // GUEST_AGENT_NETWORK_TIMEOUT
	defaultGuestAgentOSInfoTimeout  = 10 * time.Second // GUEST_AGENT_OSINFO_TIMEOUT
	defaultGuestAgentVersionTimeout = 10 * time.Second // GUEST_AGENT_VERSION_TIMEOUT
	defaultGuestAgentRetries        = 1                // GUEST_AGENT_RETRIES (0 = no retry, 1 = one retry)
	defaultGuestAgentRetryDelay     = 500 * time.Millisecond

	// Skip OS info calls after this many consecutive failures to avoid triggering buggy guest agents (refs #692)
	guestAgentOSInfoFailureThreshold = 3
)

type guestMetadataCacheEntry struct {
	ipAddresses        []string
	networkInterfaces  []models.GuestNetworkInterface
	osName             string
	osVersion          string
	agentVersion       string
	fetchedAt          time.Time
	osInfoFailureCount int  // Track consecutive OS info failures
	osInfoSkip         bool // Skip OS info calls after repeated failures (refs #692)
}

type taskOutcome struct {
	success    bool
	transient  bool
	err        error
	recordedAt time.Time
}

func (m *Monitor) getNodeRRDMetrics(ctx context.Context, client PVEClientInterface, nodeName string) (rrdMemCacheEntry, error) {
	if client == nil || nodeName == "" {
		return rrdMemCacheEntry{}, fmt.Errorf("invalid arguments for RRD lookup")
	}

	now := time.Now()

	m.rrdCacheMu.RLock()
	if entry, ok := m.nodeRRDMemCache[nodeName]; ok && now.Sub(entry.fetchedAt) < nodeRRDCacheTTL {
		m.rrdCacheMu.RUnlock()
		return entry, nil
	}
	m.rrdCacheMu.RUnlock()

	requestCtx, cancel := context.WithTimeout(ctx, nodeRRDRequestTimeout)
	defer cancel()

	points, err := client.GetNodeRRDData(requestCtx, nodeName, "hour", "AVERAGE", []string{"memavailable", "memused", "memtotal"})
	if err != nil {
		return rrdMemCacheEntry{}, err
	}

	var memAvailable uint64
	var memUsed uint64
	var memTotal uint64

	for i := len(points) - 1; i >= 0; i-- {
		point := points[i]

		if memTotal == 0 && point.MemTotal != nil && !math.IsNaN(*point.MemTotal) && *point.MemTotal > 0 {
			memTotal = uint64(math.Round(*point.MemTotal))
		}

		if memAvailable == 0 && point.MemAvailable != nil && !math.IsNaN(*point.MemAvailable) && *point.MemAvailable > 0 {
			memAvailable = uint64(math.Round(*point.MemAvailable))
		}

		if memUsed == 0 && point.MemUsed != nil && !math.IsNaN(*point.MemUsed) && *point.MemUsed > 0 {
			memUsed = uint64(math.Round(*point.MemUsed))
		}

		if memTotal > 0 && (memAvailable > 0 || memUsed > 0) {
			break
		}
	}

	if memTotal > 0 {
		if memAvailable > memTotal {
			memAvailable = memTotal
		}
		if memUsed > memTotal {
			memUsed = memTotal
		}
	}

	if memAvailable == 0 && memUsed == 0 {
		return rrdMemCacheEntry{}, fmt.Errorf("rrd mem metrics not present")
	}

	entry := rrdMemCacheEntry{
		available: memAvailable,
		used:      memUsed,
		total:     memTotal,
		fetchedAt: now,
	}

	m.rrdCacheMu.Lock()
	m.nodeRRDMemCache[nodeName] = entry
	m.rrdCacheMu.Unlock()

	return entry, nil
}

// RemoveDockerHost removes a docker host from the shared state and clears related alerts.
func (m *Monitor) RemoveDockerHost(hostID string) (models.DockerHost, error) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.DockerHost{}, fmt.Errorf("docker host id is required")
	}

	host, removed := m.state.RemoveDockerHost(hostID)
	if !removed {
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().Str("dockerHostID", hostID).Msg("Docker host not present in state during removal; proceeding to clear alerts")
		}
		host = models.DockerHost{
			ID:          hostID,
			Hostname:    hostID,
			DisplayName: hostID,
		}
	}

	// Revoke the API token associated with this Docker host
	if host.TokenID != "" {
		tokenRemoved := m.config.RemoveAPIToken(host.TokenID)
		if tokenRemoved {
			m.config.SortAPITokens()
			m.config.APITokenEnabled = m.config.HasAPITokens()

			if m.persistence != nil {
				if err := m.persistence.SaveAPITokens(m.config.APITokens); err != nil {
					log.Warn().Err(err).Str("tokenID", host.TokenID).Msg("Failed to persist API token revocation after Docker host removal")
				} else {
					log.Info().Str("tokenID", host.TokenID).Str("tokenName", host.TokenName).Msg("API token revoked for removed Docker host")
				}
			}
		}
	}

	// Track removal to prevent resurrection from cached reports
	removedAt := time.Now()

	m.mu.Lock()
	m.removedDockerHosts[hostID] = removedAt
	// Unbind the token so it can be reused with a different agent if needed
	if host.TokenID != "" {
		delete(m.dockerTokenBindings, host.TokenID)
		log.Debug().
			Str("tokenID", host.TokenID).
			Str("dockerHostID", hostID).
			Msg("Unbound Docker agent token from removed host")
	}
	if cmd, ok := m.dockerCommands[hostID]; ok {
		delete(m.dockerCommandIndex, cmd.status.ID)
	}
	delete(m.dockerCommands, hostID)
	m.mu.Unlock()

	m.state.AddRemovedDockerHost(models.RemovedDockerHost{
		ID:          hostID,
		Hostname:    host.Hostname,
		DisplayName: host.DisplayName,
		RemovedAt:   removedAt,
	})

	m.state.RemoveConnectionHealth(dockerConnectionPrefix + hostID)
	if m.alertManager != nil {
		m.alertManager.HandleDockerHostRemoved(host)
		m.SyncAlertState()
	}

	log.Info().
		Str("dockerHost", host.Hostname).
		Str("dockerHostID", hostID).
		Bool("removed", removed).
		Msg("Docker host removed and alerts cleared")

	return host, nil
}

// RemoveHostAgent removes a host agent from monitoring state and clears related data.
func (m *Monitor) RemoveHostAgent(hostID string) (models.Host, error) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.Host{}, fmt.Errorf("host id is required")
	}

	host, removed := m.state.RemoveHost(hostID)
	if !removed {
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().Str("hostID", hostID).Msg("Host not present in state during removal")
		}
		host = models.Host{
			ID:       hostID,
			Hostname: hostID,
		}
	}

	// Revoke the API token associated with this host agent
	if host.TokenID != "" {
		tokenRemoved := m.config.RemoveAPIToken(host.TokenID)
		if tokenRemoved {
			m.config.SortAPITokens()
			m.config.APITokenEnabled = m.config.HasAPITokens()

			if m.persistence != nil {
				if err := m.persistence.SaveAPITokens(m.config.APITokens); err != nil {
					log.Warn().Err(err).Str("tokenID", host.TokenID).Msg("Failed to persist API token revocation after host agent removal")
				} else {
					log.Info().Str("tokenID", host.TokenID).Str("tokenName", host.TokenName).Msg("API token revoked for removed host agent")
				}
			}
		}
	}

	if host.TokenID != "" {
		m.mu.Lock()
		if _, exists := m.hostTokenBindings[host.TokenID]; exists {
			delete(m.hostTokenBindings, host.TokenID)
			log.Debug().
				Str("tokenID", host.TokenID).
				Str("hostID", hostID).
				Msg("Unbound host agent token from removed host")
		}
		m.mu.Unlock()
	}

	m.state.RemoveConnectionHealth(hostConnectionPrefix + hostID)

	log.Info().
		Str("host", host.Hostname).
		Str("hostID", hostID).
		Bool("removed", removed).
		Msg("Host agent removed from monitoring")

	if m.alertManager != nil {
		m.alertManager.HandleHostRemoved(host)
	}

	return host, nil
}

// HideDockerHost marks a docker host as hidden without removing it from state.
// Hidden hosts will not be shown in the frontend but will continue to accept updates.
func (m *Monitor) HideDockerHost(hostID string) (models.DockerHost, error) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.DockerHost{}, fmt.Errorf("docker host id is required")
	}

	host, ok := m.state.SetDockerHostHidden(hostID, true)
	if !ok {
		return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
	}

	log.Info().
		Str("dockerHost", host.Hostname).
		Str("dockerHostID", hostID).
		Msg("Docker host hidden from view")

	return host, nil
}

// UnhideDockerHost marks a docker host as visible again.
func (m *Monitor) UnhideDockerHost(hostID string) (models.DockerHost, error) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.DockerHost{}, fmt.Errorf("docker host id is required")
	}

	host, ok := m.state.SetDockerHostHidden(hostID, false)
	if !ok {
		return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
	}

	// Clear removal tracking if it was marked as removed
	m.mu.Lock()
	delete(m.removedDockerHosts, hostID)
	m.mu.Unlock()

	log.Info().
		Str("dockerHost", host.Hostname).
		Str("dockerHostID", hostID).
		Msg("Docker host unhidden")

	return host, nil
}

// MarkDockerHostPendingUninstall marks a docker host as pending uninstall.
// This is used when the user has run the uninstall command and is waiting for the host to go offline.
func (m *Monitor) MarkDockerHostPendingUninstall(hostID string) (models.DockerHost, error) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.DockerHost{}, fmt.Errorf("docker host id is required")
	}

	host, ok := m.state.SetDockerHostPendingUninstall(hostID, true)
	if !ok {
		return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
	}

	log.Info().
		Str("dockerHost", host.Hostname).
		Str("dockerHostID", hostID).
		Msg("Docker host marked as pending uninstall")

	return host, nil
}

// SetDockerHostCustomDisplayName updates the custom display name for a docker host.
func (m *Monitor) SetDockerHostCustomDisplayName(hostID string, customName string) (models.DockerHost, error) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.DockerHost{}, fmt.Errorf("docker host id is required")
	}

	customName = strings.TrimSpace(customName)

	// Persist to Docker metadata store first
	var hostMeta *config.DockerHostMetadata
	if customName != "" {
		hostMeta = &config.DockerHostMetadata{
			CustomDisplayName: customName,
		}
	}
	if err := m.dockerMetadataStore.SetHostMetadata(hostID, hostMeta); err != nil {
		log.Error().Err(err).Str("hostID", hostID).Msg("Failed to persist Docker host metadata")
		return models.DockerHost{}, fmt.Errorf("failed to persist custom display name: %w", err)
	}

	// Update in-memory state
	host, ok := m.state.SetDockerHostCustomDisplayName(hostID, customName)
	if !ok {
		return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
	}

	log.Info().
		Str("dockerHost", host.Hostname).
		Str("dockerHostID", hostID).
		Str("customDisplayName", customName).
		Msg("Docker host custom display name updated")

	return host, nil
}

// AllowDockerHostReenroll removes a host ID from the removal blocklist so it can report again.
func (m *Monitor) AllowDockerHostReenroll(hostID string) error {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return fmt.Errorf("docker host id is required")
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	if _, exists := m.removedDockerHosts[hostID]; !exists {
		host, found := m.GetDockerHost(hostID)
		event := log.Info().
			Str("dockerHostID", hostID)
		if found {
			event = event.Str("dockerHost", host.Hostname)
		}
		event.Msg("Allow re-enroll requested but host was not blocked; ignoring")
		return nil
	}

	delete(m.removedDockerHosts, hostID)
	if cmd, exists := m.dockerCommands[hostID]; exists {
		delete(m.dockerCommandIndex, cmd.status.ID)
		delete(m.dockerCommands, hostID)
	}
	m.state.SetDockerHostCommand(hostID, nil)
	m.state.RemoveRemovedDockerHost(hostID)

	log.Info().
		Str("dockerHostID", hostID).
		Msg("Docker host removal block cleared; host may report again")

	return nil
}

// GetDockerHost retrieves a docker host by identifier if present in state.
func (m *Monitor) GetDockerHost(hostID string) (models.DockerHost, bool) {
	hostID = strings.TrimSpace(hostID)
	if hostID == "" {
		return models.DockerHost{}, false
	}

	hosts := m.state.GetDockerHosts()
	for _, host := range hosts {
		if host.ID == hostID {
			return host, true
		}
	}
	return models.DockerHost{}, false
}

// GetDockerHosts returns a point-in-time snapshot of all Docker hosts Pulse knows about.
func (m *Monitor) GetDockerHosts() []models.DockerHost {
	if m == nil || m.state == nil {
		return nil
	}
	return m.state.GetDockerHosts()
}

// QueueDockerHostStop queues a stop command for the specified docker host.
func (m *Monitor) QueueDockerHostStop(hostID string) (models.DockerHostCommandStatus, error) {
	return m.queueDockerStopCommand(hostID)
}

// FetchDockerCommandForHost retrieves the next command payload (if any) for the host.
func (m *Monitor) FetchDockerCommandForHost(hostID string) (map[string]any, *models.DockerHostCommandStatus) {
	return m.getDockerCommandPayload(hostID)
}

// AcknowledgeDockerHostCommand updates the lifecycle status for a docker host command.
func (m *Monitor) AcknowledgeDockerHostCommand(commandID, hostID, status, message string) (models.DockerHostCommandStatus, string, bool, error) {
	return m.acknowledgeDockerCommand(commandID, hostID, status, message)
}

func tokenHintFromRecord(record *config.APITokenRecord) string {
	if record == nil {
		return ""
	}
	switch {
	case record.Prefix != "" && record.Suffix != "":
		return fmt.Sprintf("%s…%s", record.Prefix, record.Suffix)
	case record.Prefix != "":
		return record.Prefix + "…"
	case record.Suffix != "":
		return "…" + record.Suffix
	default:
		return ""
	}
}

func resolveDockerHostIdentifier(report agentsdocker.Report, tokenRecord *config.APITokenRecord, hosts []models.DockerHost) (string, []string, models.DockerHost, bool) {
	base := strings.TrimSpace(report.AgentKey())
	fallbacks := uniqueNonEmptyStrings(
		base,
		strings.TrimSpace(report.Agent.ID),
		strings.TrimSpace(report.Host.MachineID),
		strings.TrimSpace(report.Host.Hostname),
	)

	if existing, ok := findMatchingDockerHost(hosts, report, tokenRecord); ok {
		return existing.ID, fallbacks, existing, true
	}

	identifier := base
	if identifier == "" {
		identifier = strings.TrimSpace(report.Host.MachineID)
	}
	if identifier == "" {
		identifier = strings.TrimSpace(report.Host.Hostname)
	}
	if identifier == "" {
		identifier = strings.TrimSpace(report.Agent.ID)
	}
	if identifier == "" {
		identifier = fallbackDockerHostID(report, tokenRecord)
	}
	if identifier == "" {
		identifier = "docker-host"
	}

	if dockerHostIDExists(identifier, hosts) {
		identifier = generateDockerHostIdentifier(identifier, report, tokenRecord, hosts)
	}

	return identifier, fallbacks, models.DockerHost{}, false
}

func findMatchingDockerHost(hosts []models.DockerHost, report agentsdocker.Report, tokenRecord *config.APITokenRecord) (models.DockerHost, bool) {
	agentID := strings.TrimSpace(report.Agent.ID)
	tokenID := ""
	if tokenRecord != nil {
		tokenID = strings.TrimSpace(tokenRecord.ID)
	}
	machineID := strings.TrimSpace(report.Host.MachineID)
	hostname := strings.TrimSpace(report.Host.Hostname)

	if agentID != "" {
		for _, host := range hosts {
			if strings.TrimSpace(host.AgentID) != agentID {
				continue
			}

			existingToken := strings.TrimSpace(host.TokenID)
			if tokenID == "" || existingToken == tokenID {
				return host, true
			}
		}
	}

	if machineID != "" && hostname != "" {
		for _, host := range hosts {
			if strings.TrimSpace(host.MachineID) == machineID && strings.TrimSpace(host.Hostname) == hostname {
				if tokenID == "" || strings.TrimSpace(host.TokenID) == tokenID {
					return host, true
				}
			}
		}
	}

	if machineID != "" && tokenID == "" {
		for _, host := range hosts {
			if strings.TrimSpace(host.MachineID) == machineID && strings.TrimSpace(host.TokenID) == "" {
				return host, true
			}
		}
	}

	if hostname != "" && tokenID == "" {
		for _, host := range hosts {
			if strings.TrimSpace(host.Hostname) == hostname && strings.TrimSpace(host.TokenID) == "" {
				return host, true
			}
		}
	}

	return models.DockerHost{}, false
}

func dockerHostIDExists(id string, hosts []models.DockerHost) bool {
	if strings.TrimSpace(id) == "" {
		return false
	}
	for _, host := range hosts {
		if host.ID == id {
			return true
		}
	}
	return false
}

func generateDockerHostIdentifier(base string, report agentsdocker.Report, tokenRecord *config.APITokenRecord, hosts []models.DockerHost) string {
	if strings.TrimSpace(base) == "" {
		base = fallbackDockerHostID(report, tokenRecord)
	}
	if strings.TrimSpace(base) == "" {
		base = "docker-host"
	}

	used := make(map[string]struct{}, len(hosts))
	for _, host := range hosts {
		used[host.ID] = struct{}{}
	}

	suffixes := dockerHostSuffixCandidates(report, tokenRecord)
	for _, suffix := range suffixes {
		candidate := fmt.Sprintf("%s::%s", base, suffix)
		if _, exists := used[candidate]; !exists {
			return candidate
		}
	}

	seed := strings.Join(suffixes, "|")
	if strings.TrimSpace(seed) == "" {
		seed = base
	}
	sum := sha1.Sum([]byte(seed))
	hashSuffix := fmt.Sprintf("hash-%s", hex.EncodeToString(sum[:6]))
	candidate := fmt.Sprintf("%s::%s", base, hashSuffix)
	if _, exists := used[candidate]; !exists {
		return candidate
	}

	for idx := 2; ; idx++ {
		candidate = fmt.Sprintf("%s::%d", base, idx)
		if _, exists := used[candidate]; !exists {
			return candidate
		}
	}
}

func dockerHostSuffixCandidates(report agentsdocker.Report, tokenRecord *config.APITokenRecord) []string {
	candidates := make([]string, 0, 5)

	if tokenRecord != nil {
		if sanitized := sanitizeDockerHostSuffix(tokenRecord.ID); sanitized != "" {
			candidates = append(candidates, "token-"+sanitized)
		}
	}

	if agentID := sanitizeDockerHostSuffix(report.Agent.ID); agentID != "" {
		candidates = append(candidates, "agent-"+agentID)
	}

	if machineID := sanitizeDockerHostSuffix(report.Host.MachineID); machineID != "" {
		candidates = append(candidates, "machine-"+machineID)
	}

	hostNameSanitized := sanitizeDockerHostSuffix(report.Host.Hostname)
	if hostNameSanitized != "" {
		candidates = append(candidates, "host-"+hostNameSanitized)
	}

	hostDisplay := sanitizeDockerHostSuffix(report.Host.Name)
	if hostDisplay != "" && hostDisplay != hostNameSanitized {
		candidates = append(candidates, "name-"+hostDisplay)
	}

	return uniqueNonEmptyStrings(candidates...)
}

func sanitizeDockerHostSuffix(value string) string {
	value = strings.ToLower(strings.TrimSpace(value))
	if value == "" {
		return ""
	}

	var builder strings.Builder
	builder.Grow(len(value))
	lastHyphen := false
	runeCount := 0

	for _, r := range value {
		if runeCount >= 48 {
			break
		}

		switch {
		case unicode.IsLetter(r) || unicode.IsDigit(r):
			builder.WriteRune(r)
			lastHyphen = false
			runeCount++
		default:
			if !lastHyphen {
				builder.WriteRune('-')
				lastHyphen = true
				runeCount++
			}
		}
	}

	result := strings.Trim(builder.String(), "-")
	if result == "" {
		return ""
	}
	return result
}
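
// Illustrative sanitization (hypothetical input): "My Host #1" becomes
// "my-host-1"; runs of non-alphanumeric characters collapse to a single hyphen,
// leading/trailing hyphens are trimmed, and the result is capped at 48 runes.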

func fallbackDockerHostID(report agentsdocker.Report, tokenRecord *config.APITokenRecord) string {
	seedParts := dockerHostSuffixCandidates(report, tokenRecord)
	if len(seedParts) == 0 {
		seedParts = uniqueNonEmptyStrings(
			report.Host.Hostname,
			report.Host.MachineID,
			report.Agent.ID,
		)
	}
	if len(seedParts) == 0 {
		return ""
	}
	seed := strings.Join(seedParts, "|")
	sum := sha1.Sum([]byte(seed))
	return fmt.Sprintf("docker-host-%s", hex.EncodeToString(sum[:6]))
}
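
// Illustrative fallback ID: the joined seed parts are hashed with SHA-1 and the
// first six bytes are hex-encoded, producing an identifier of the form
// "docker-host-3f2a9c0d1b4e" (hash value hypothetical).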

func uniqueNonEmptyStrings(values ...string) []string {
	seen := make(map[string]struct{}, len(values))
	result := make([]string, 0, len(values))
	for _, value := range values {
		value = strings.TrimSpace(value)
		if value == "" {
			continue
		}
		if _, ok := seen[value]; ok {
			continue
		}
		seen[value] = struct{}{}
		result = append(result, value)
	}
	return result
}

// ApplyDockerReport ingests a docker agent report into the shared state.
func (m *Monitor) ApplyDockerReport(report agentsdocker.Report, tokenRecord *config.APITokenRecord) (models.DockerHost, error) {
	hostsSnapshot := m.state.GetDockerHosts()
	identifier, legacyIDs, previous, hasPrevious := resolveDockerHostIdentifier(report, tokenRecord, hostsSnapshot)
	if strings.TrimSpace(identifier) == "" {
		return models.DockerHost{}, fmt.Errorf("docker report missing agent identifier")
	}

	// Check if this host was deliberately removed - reject report to prevent resurrection
	m.mu.RLock()
	removedAt, wasRemoved := m.removedDockerHosts[identifier]
	if !wasRemoved {
		for _, legacyID := range legacyIDs {
			if legacyID == "" || legacyID == identifier {
				continue
			}
			if ts, ok := m.removedDockerHosts[legacyID]; ok {
				removedAt = ts
				wasRemoved = true
				break
			}
		}
	}
	m.mu.RUnlock()

	if wasRemoved {
		log.Info().
			Str("dockerHostID", identifier).
			Time("removedAt", removedAt).
			Msg("Rejecting report from deliberately removed Docker host")
		return models.DockerHost{}, fmt.Errorf("docker host %q was removed at %v and cannot report again. Use Allow re-enroll in Settings -> Docker -> Removed hosts or rerun the installer with a docker:manage token to clear this block", identifier, removedAt.Format(time.RFC3339))
	}

	// Enforce token uniqueness: each token can only be bound to one agent
	if tokenRecord != nil && tokenRecord.ID != "" {
		tokenID := strings.TrimSpace(tokenRecord.ID)
		agentID := strings.TrimSpace(report.Agent.ID)
		if agentID == "" {
			agentID = identifier
		}

		m.mu.Lock()
		if boundAgentID, exists := m.dockerTokenBindings[tokenID]; exists {
			if boundAgentID != agentID {
				m.mu.Unlock()
				// Find the conflicting host to provide helpful error message
				conflictingHostname := "unknown"
				for _, host := range hostsSnapshot {
					if host.AgentID == boundAgentID || host.ID == boundAgentID {
						conflictingHostname = host.Hostname
						if host.CustomDisplayName != "" {
							conflictingHostname = host.CustomDisplayName
						} else if host.DisplayName != "" {
							conflictingHostname = host.DisplayName
						}
						break
					}
				}
				tokenHint := tokenHintFromRecord(tokenRecord)
				if tokenHint != "" {
					tokenHint = " (" + tokenHint + ")"
				}
				log.Warn().
					Str("tokenID", tokenID).
					Str("tokenHint", tokenHint).
					Str("reportingAgentID", agentID).
					Str("boundAgentID", boundAgentID).
					Str("conflictingHost", conflictingHostname).
					Msg("Rejecting Docker report: token already bound to different agent")
				return models.DockerHost{}, fmt.Errorf("API token%s is already in use by agent %q (host: %s). Each Docker agent must use a unique API token. Generate a new token for this agent", tokenHint, boundAgentID, conflictingHostname)
			}
		} else {
			// First time seeing this token - bind it to this agent
			m.dockerTokenBindings[tokenID] = agentID
			log.Debug().
				Str("tokenID", tokenID).
				Str("agentID", agentID).
				Str("hostname", report.Host.Hostname).
				Msg("Bound Docker agent token to agent identity")
		}
		m.mu.Unlock()
	}

	hostname := strings.TrimSpace(report.Host.Hostname)
	if hostname == "" {
		return models.DockerHost{}, fmt.Errorf("docker report missing hostname")
	}

	timestamp := report.Timestamp
	if timestamp.IsZero() {
		timestamp = time.Now()
	}

	agentID := strings.TrimSpace(report.Agent.ID)
	if agentID == "" {
		agentID = identifier
	}

	displayName := strings.TrimSpace(report.Host.Name)
	if displayName == "" {
		displayName = hostname
	}

	runtime := strings.ToLower(strings.TrimSpace(report.Host.Runtime))
	switch runtime {
	case "", "auto", "default":
		runtime = "docker"
	case "docker", "podman":
		// supported runtimes
	default:
		runtime = "docker"
	}

	runtimeVersion := strings.TrimSpace(report.Host.RuntimeVersion)
	dockerVersion := strings.TrimSpace(report.Host.DockerVersion)
	if runtimeVersion == "" {
		runtimeVersion = dockerVersion
	}
	if dockerVersion == "" {
		dockerVersion = runtimeVersion
	}

	containers := make([]models.DockerContainer, 0, len(report.Containers))
	for _, payload := range report.Containers {
		container := models.DockerContainer{
			ID:            payload.ID,
			Name:          payload.Name,
			Image:         payload.Image,
			State:         payload.State,
			Status:        payload.Status,
			Health:        payload.Health,
			CPUPercent:    safeFloat(payload.CPUPercent),
			MemoryUsage:   payload.MemoryUsageBytes,
			MemoryLimit:   payload.MemoryLimitBytes,
			MemoryPercent: safeFloat(payload.MemoryPercent),
			UptimeSeconds: payload.UptimeSeconds,
			RestartCount:  payload.RestartCount,
			ExitCode:      payload.ExitCode,
			CreatedAt:     payload.CreatedAt,
			StartedAt:     payload.StartedAt,
			FinishedAt:    payload.FinishedAt,
		}

		if len(payload.Ports) > 0 {
			ports := make([]models.DockerContainerPort, len(payload.Ports))
			for i, port := range payload.Ports {
				ports[i] = models.DockerContainerPort{
					PrivatePort: port.PrivatePort,
					PublicPort:  port.PublicPort,
					Protocol:    port.Protocol,
					IP:          port.IP,
				}
			}
			container.Ports = ports
		}

		if len(payload.Labels) > 0 {
			labels := make(map[string]string, len(payload.Labels))
			for k, v := range payload.Labels {
				labels[k] = v
			}
			container.Labels = labels
		}

		if len(payload.Networks) > 0 {
			networks := make([]models.DockerContainerNetworkLink, len(payload.Networks))
			for i, net := range payload.Networks {
				networks[i] = models.DockerContainerNetworkLink{
					Name: net.Name,
					IPv4: net.IPv4,
					IPv6: net.IPv6,
				}
			}
			container.Networks = networks
		}

		container.WritableLayerBytes = payload.WritableLayerBytes
		container.RootFilesystemBytes = payload.RootFilesystemBytes

		if payload.BlockIO != nil {
			container.BlockIO = &models.DockerContainerBlockIO{
				ReadBytes:  payload.BlockIO.ReadBytes,
				WriteBytes: payload.BlockIO.WriteBytes,
			}

			containerIdentifier := payload.ID
			if strings.TrimSpace(containerIdentifier) == "" {
				containerIdentifier = payload.Name
			}
			if strings.TrimSpace(containerIdentifier) != "" {
				metrics := types.IOMetrics{
					DiskRead:  clampUint64ToInt64(payload.BlockIO.ReadBytes),
					DiskWrite: clampUint64ToInt64(payload.BlockIO.WriteBytes),
					Timestamp: timestamp,
				}
				readRate, writeRate, _, _ := m.rateTracker.CalculateRates(fmt.Sprintf("docker:%s:%s", identifier, containerIdentifier), metrics)
				if readRate >= 0 {
					value := readRate
					container.BlockIO.ReadRateBytesPerSecond = &value
				}
				if writeRate >= 0 {
					value := writeRate
					container.BlockIO.WriteRateBytesPerSecond = &value
				}
			}
		}

		if len(payload.Mounts) > 0 {
			mounts := make([]models.DockerContainerMount, len(payload.Mounts))
			for i, mount := range payload.Mounts {
				mounts[i] = models.DockerContainerMount{
					Type:        mount.Type,
					Source:      mount.Source,
					Destination: mount.Destination,
					Mode:        mount.Mode,
					RW:          mount.RW,
					Propagation: mount.Propagation,
					Name:        mount.Name,
					Driver:      mount.Driver,
				}
			}
			container.Mounts = mounts
		}

		containers = append(containers, container)
	}

	services := convertDockerServices(report.Services)
	tasks := convertDockerTasks(report.Tasks)
	swarmInfo := convertDockerSwarmInfo(report.Host.Swarm)

	loadAverage := make([]float64, 0, len(report.Host.LoadAverage))
	if len(report.Host.LoadAverage) > 0 {
		loadAverage = append(loadAverage, report.Host.LoadAverage...)
	}

	var memory models.Memory
	if report.Host.Memory.TotalBytes > 0 || report.Host.Memory.UsedBytes > 0 {
		memory = models.Memory{
			Total:     report.Host.Memory.TotalBytes,
			Used:      report.Host.Memory.UsedBytes,
			Free:      report.Host.Memory.FreeBytes,
			Usage:     safeFloat(report.Host.Memory.Usage),
			SwapTotal: report.Host.Memory.SwapTotal,
			SwapUsed:  report.Host.Memory.SwapUsed,
		}
	}

	disks := make([]models.Disk, 0, len(report.Host.Disks))
	for _, disk := range report.Host.Disks {
		disks = append(disks, models.Disk{
			Total:      disk.TotalBytes,
			Used:       disk.UsedBytes,
			Free:       disk.FreeBytes,
			Usage:      safeFloat(disk.Usage),
			Mountpoint: disk.Mountpoint,
			Type:       disk.Type,
			Device:     disk.Device,
		})
	}

	networkIfaces := make([]models.HostNetworkInterface, 0, len(report.Host.Network))
	for _, iface := range report.Host.Network {
		addresses := append([]string(nil), iface.Addresses...)
		networkIfaces = append(networkIfaces, models.HostNetworkInterface{
			Name:      iface.Name,
			MAC:       iface.MAC,
			Addresses: addresses,
			RXBytes:   iface.RXBytes,
			TXBytes:   iface.TXBytes,
			SpeedMbps: iface.SpeedMbps,
		})
	}

	agentVersion := normalizeAgentVersion(report.Agent.Version)
	if agentVersion == "" && hasPrevious {
		agentVersion = normalizeAgentVersion(previous.AgentVersion)
	}

	host := models.DockerHost{
		ID:                identifier,
		AgentID:           agentID,
		Hostname:          hostname,
		DisplayName:       displayName,
		MachineID:         strings.TrimSpace(report.Host.MachineID),
		OS:                report.Host.OS,
		KernelVersion:     report.Host.KernelVersion,
		Architecture:      report.Host.Architecture,
		Runtime:           runtime,
		RuntimeVersion:    runtimeVersion,
		DockerVersion:     dockerVersion,
		CPUs:              report.Host.TotalCPU,
		TotalMemoryBytes:  report.Host.TotalMemoryBytes,
		UptimeSeconds:     report.Host.UptimeSeconds,
		CPUUsage:          safeFloat(report.Host.CPUUsagePercent),
		LoadAverage:       loadAverage,
		Memory:            memory,
		Disks:             disks,
		NetworkInterfaces: networkIfaces,
		Status:            "online",
		LastSeen:          timestamp,
		IntervalSeconds:   report.Agent.IntervalSeconds,
		AgentVersion:      agentVersion,
		Containers:        containers,
		Services:          services,
		Tasks:             tasks,
		Swarm:             swarmInfo,
		IsLegacy:          isLegacyDockerAgent(report.Agent.Type),
	}

	if tokenRecord != nil {
		host.TokenID = tokenRecord.ID
		host.TokenName = tokenRecord.Name
		host.TokenHint = tokenHintFromRecord(tokenRecord)
		if tokenRecord.LastUsedAt != nil {
			t := tokenRecord.LastUsedAt.UTC()
			host.TokenLastUsedAt = &t
		} else {
			t := time.Now().UTC()
			host.TokenLastUsedAt = &t
		}
	} else if hasPrevious {
		host.TokenID = previous.TokenID
|
|
host.TokenName = previous.TokenName
|
|
host.TokenHint = previous.TokenHint
|
|
host.TokenLastUsedAt = previous.TokenLastUsedAt
|
|
}
|
|
|
|
// Load custom display name from metadata store if not already set
|
|
if host.CustomDisplayName == "" {
|
|
if hostMeta := m.dockerMetadataStore.GetHostMetadata(identifier); hostMeta != nil {
|
|
host.CustomDisplayName = hostMeta.CustomDisplayName
|
|
}
|
|
}
|
|
|
|
m.state.UpsertDockerHost(host)
|
|
m.state.SetConnectionHealth(dockerConnectionPrefix+host.ID, true)
|
|
|
|
// Check if the host was previously hidden and is now visible again
|
|
if hasPrevious && previous.Hidden && !host.Hidden {
|
|
log.Info().
|
|
Str("dockerHost", host.Hostname).
|
|
Str("dockerHostID", host.ID).
|
|
Msg("Docker host auto-unhidden after receiving report")
|
|
}
|
|
|
|
// Check if the host was pending uninstall - if so, log a warning that uninstall failed and clear the flag
|
|
if hasPrevious && previous.PendingUninstall {
|
|
log.Warn().
|
|
Str("dockerHost", host.Hostname).
|
|
Str("dockerHostID", host.ID).
|
|
Msg("Docker host reporting again after pending uninstall - uninstall may have failed")
|
|
|
|
// Clear the pending uninstall flag since the host is clearly still active
|
|
m.state.SetDockerHostPendingUninstall(host.ID, false)
|
|
}
|
|
|
|
if m.alertManager != nil {
|
|
m.alertManager.CheckDockerHost(host)
|
|
}
|
|
|
|
log.Debug().
|
|
Str("dockerHost", host.Hostname).
|
|
Int("containers", len(containers)).
|
|
Msg("Docker host report processed")
|
|
|
|
return host, nil
|
|
}
|
|
|
|
// ApplyHostReport ingests a host agent report into the shared state.
|
|
func (m *Monitor) ApplyHostReport(report agentshost.Report, tokenRecord *config.APITokenRecord) (models.Host, error) {
|
|
hostname := strings.TrimSpace(report.Host.Hostname)
|
|
if hostname == "" {
|
|
return models.Host{}, fmt.Errorf("host report missing hostname")
|
|
}
|
|
|
|
identifier := strings.TrimSpace(report.Host.ID)
|
|
if identifier != "" {
|
|
identifier = sanitizeDockerHostSuffix(identifier)
|
|
}
|
|
if identifier == "" {
|
|
if machine := sanitizeDockerHostSuffix(report.Host.MachineID); machine != "" {
|
|
identifier = machine
|
|
}
|
|
}
|
|
if identifier == "" {
|
|
if agentID := sanitizeDockerHostSuffix(report.Agent.ID); agentID != "" {
|
|
identifier = agentID
|
|
}
|
|
}
|
|
if identifier == "" {
|
|
if hostName := sanitizeDockerHostSuffix(hostname); hostName != "" {
|
|
identifier = hostName
|
|
}
|
|
}
|
|
if identifier == "" {
|
|
seedParts := uniqueNonEmptyStrings(
|
|
report.Host.MachineID,
|
|
report.Agent.ID,
|
|
report.Host.Hostname,
|
|
)
|
|
if len(seedParts) == 0 {
|
|
seedParts = []string{hostname}
|
|
}
|
|
seed := strings.Join(seedParts, "|")
|
|
sum := sha1.Sum([]byte(seed))
|
|
identifier = fmt.Sprintf("host-%s", hex.EncodeToString(sum[:6]))
|
|
}
|
|
|
|
existingHosts := m.state.GetHosts()
|
|
|
|
agentID := strings.TrimSpace(report.Agent.ID)
|
|
if agentID == "" {
|
|
agentID = identifier
|
|
}
|
|
|
|
if tokenRecord != nil && tokenRecord.ID != "" {
|
|
tokenID := strings.TrimSpace(tokenRecord.ID)
|
|
bindingID := agentID
|
|
if bindingID == "" {
|
|
bindingID = identifier
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if m.hostTokenBindings == nil {
|
|
m.hostTokenBindings = make(map[string]string)
|
|
}
|
|
if boundID, exists := m.hostTokenBindings[tokenID]; exists && boundID != bindingID {
|
|
m.mu.Unlock()
|
|
|
|
conflictingHost := "unknown"
|
|
for _, candidate := range existingHosts {
|
|
if candidate.TokenID == tokenID || candidate.ID == boundID {
|
|
conflictingHost = candidate.Hostname
|
|
if candidate.DisplayName != "" {
|
|
conflictingHost = candidate.DisplayName
|
|
}
|
|
break
|
|
}
|
|
}
|
|
|
|
tokenHint := tokenHintFromRecord(tokenRecord)
|
|
if tokenHint != "" {
|
|
tokenHint = " (" + tokenHint + ")"
|
|
}
|
|
|
|
log.Warn().
|
|
Str("tokenID", tokenID).
|
|
Str("tokenHint", tokenHint).
|
|
Str("reportingAgentID", bindingID).
|
|
Str("boundAgentID", boundID).
|
|
Str("conflictingHost", conflictingHost).
|
|
Msg("Rejecting host report: token already bound to different agent")
|
|
|
|
return models.Host{}, fmt.Errorf("API token%s is already in use by host %q (agent: %s). Generate a new token or set --agent-id before reusing it", tokenHint, conflictingHost, boundID)
|
|
}
|
|
|
|
if _, exists := m.hostTokenBindings[tokenID]; !exists {
|
|
m.hostTokenBindings[tokenID] = bindingID
|
|
log.Debug().
|
|
Str("tokenID", tokenID).
|
|
Str("agentID", bindingID).
|
|
Str("hostname", hostname).
|
|
Msg("Bound host agent token to agent identity")
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
var previous models.Host
|
|
var hasPrevious bool
|
|
for _, candidate := range existingHosts {
|
|
if candidate.ID == identifier {
|
|
previous = candidate
|
|
hasPrevious = true
|
|
break
|
|
}
|
|
}
|
|
|
|
displayName := strings.TrimSpace(report.Host.DisplayName)
|
|
if displayName == "" {
|
|
displayName = hostname
|
|
}
|
|
|
|
timestamp := report.Timestamp
|
|
if timestamp.IsZero() {
|
|
timestamp = time.Now().UTC()
|
|
}
|
|
|
|
memory := models.Memory{
|
|
Total: report.Metrics.Memory.TotalBytes,
|
|
Used: report.Metrics.Memory.UsedBytes,
|
|
Free: report.Metrics.Memory.FreeBytes,
|
|
Usage: safeFloat(report.Metrics.Memory.Usage),
|
|
SwapTotal: report.Metrics.Memory.SwapTotal,
|
|
SwapUsed: report.Metrics.Memory.SwapUsed,
|
|
}
|
|
if memory.Usage <= 0 && memory.Total > 0 {
|
|
memory.Usage = safePercentage(float64(memory.Used), float64(memory.Total))
|
|
}
|
|
|
|
disks := make([]models.Disk, 0, len(report.Disks))
|
|
for _, disk := range report.Disks {
|
|
// Filter read-only filesystems for backward compatibility with older host agents
|
|
// that don't have the filter built in. Prevents false alerts for snap mounts,
|
|
// immutable OS images, etc. (issues #505, #690).
|
|
if shouldIgnoreReadOnlyFilesystem(disk.Type, uint64(disk.TotalBytes), uint64(disk.UsedBytes)) {
|
|
continue
|
|
}
|
|
|
|
usage := safeFloat(disk.Usage)
|
|
if usage <= 0 && disk.TotalBytes > 0 {
|
|
usage = safePercentage(float64(disk.UsedBytes), float64(disk.TotalBytes))
|
|
}
|
|
disks = append(disks, models.Disk{
|
|
Total: disk.TotalBytes,
|
|
Used: disk.UsedBytes,
|
|
Free: disk.FreeBytes,
|
|
Usage: usage,
|
|
Mountpoint: disk.Mountpoint,
|
|
Type: disk.Type,
|
|
Device: disk.Device,
|
|
})
|
|
}
|
|
|
|
network := make([]models.HostNetworkInterface, 0, len(report.Network))
|
|
for _, nic := range report.Network {
|
|
network = append(network, models.HostNetworkInterface{
|
|
Name: nic.Name,
|
|
MAC: nic.MAC,
|
|
Addresses: append([]string(nil), nic.Addresses...),
|
|
RXBytes: nic.RXBytes,
|
|
TXBytes: nic.TXBytes,
|
|
SpeedMbps: nic.SpeedMbps,
|
|
})
|
|
}
|
|
|
|
raid := make([]models.HostRAIDArray, 0, len(report.RAID))
|
|
for _, array := range report.RAID {
|
|
devices := make([]models.HostRAIDDevice, 0, len(array.Devices))
|
|
for _, dev := range array.Devices {
|
|
devices = append(devices, models.HostRAIDDevice{
|
|
Device: dev.Device,
|
|
State: dev.State,
|
|
Slot: dev.Slot,
|
|
})
|
|
}
|
|
raid = append(raid, models.HostRAIDArray{
|
|
Device: array.Device,
|
|
Name: array.Name,
|
|
Level: array.Level,
|
|
State: array.State,
|
|
TotalDevices: array.TotalDevices,
|
|
ActiveDevices: array.ActiveDevices,
|
|
WorkingDevices: array.WorkingDevices,
|
|
FailedDevices: array.FailedDevices,
|
|
SpareDevices: array.SpareDevices,
|
|
UUID: array.UUID,
|
|
Devices: devices,
|
|
RebuildPercent: array.RebuildPercent,
|
|
RebuildSpeed: array.RebuildSpeed,
|
|
})
|
|
}
|
|
|
|
host := models.Host{
|
|
ID: identifier,
|
|
Hostname: hostname,
|
|
DisplayName: displayName,
|
|
Platform: strings.TrimSpace(strings.ToLower(report.Host.Platform)),
|
|
OSName: strings.TrimSpace(report.Host.OSName),
|
|
OSVersion: strings.TrimSpace(report.Host.OSVersion),
|
|
KernelVersion: strings.TrimSpace(report.Host.KernelVersion),
|
|
Architecture: strings.TrimSpace(report.Host.Architecture),
|
|
CPUCount: report.Host.CPUCount,
|
|
CPUUsage: safeFloat(report.Metrics.CPUUsagePercent),
|
|
LoadAverage: append([]float64(nil), report.Host.LoadAverage...),
|
|
Memory: memory,
|
|
Disks: disks,
|
|
NetworkInterfaces: network,
|
|
Sensors: models.HostSensorSummary{
|
|
TemperatureCelsius: cloneStringFloatMap(report.Sensors.TemperatureCelsius),
|
|
FanRPM: cloneStringFloatMap(report.Sensors.FanRPM),
|
|
Additional: cloneStringFloatMap(report.Sensors.Additional),
|
|
},
|
|
RAID: raid,
|
|
Status: "online",
|
|
UptimeSeconds: report.Host.UptimeSeconds,
|
|
IntervalSeconds: report.Agent.IntervalSeconds,
|
|
LastSeen: timestamp,
|
|
AgentVersion: strings.TrimSpace(report.Agent.Version),
|
|
Tags: append([]string(nil), report.Tags...),
|
|
IsLegacy: isLegacyHostAgent(report.Agent.Type),
|
|
}
|
|
|
|
if len(host.LoadAverage) == 0 {
|
|
host.LoadAverage = nil
|
|
}
|
|
if len(host.Disks) == 0 {
|
|
host.Disks = nil
|
|
}
|
|
if len(host.NetworkInterfaces) == 0 {
|
|
host.NetworkInterfaces = nil
|
|
}
|
|
if len(host.RAID) == 0 {
|
|
host.RAID = nil
|
|
}
|
|
|
|
if tokenRecord != nil {
|
|
host.TokenID = tokenRecord.ID
|
|
host.TokenName = tokenRecord.Name
|
|
host.TokenHint = tokenHintFromRecord(tokenRecord)
|
|
if tokenRecord.LastUsedAt != nil {
|
|
t := tokenRecord.LastUsedAt.UTC()
|
|
host.TokenLastUsedAt = &t
|
|
} else {
|
|
now := time.Now().UTC()
|
|
host.TokenLastUsedAt = &now
|
|
}
|
|
} else if hasPrevious {
|
|
host.TokenID = previous.TokenID
|
|
host.TokenName = previous.TokenName
|
|
host.TokenHint = previous.TokenHint
|
|
host.TokenLastUsedAt = previous.TokenLastUsedAt
|
|
}
|
|
|
|
m.state.UpsertHost(host)
|
|
m.state.SetConnectionHealth(hostConnectionPrefix+host.ID, true)
|
|
|
|
if m.alertManager != nil {
|
|
m.alertManager.CheckHost(host)
|
|
}
|
|
|
|
return host, nil
|
|
}
|
|
|
|
const (
|
|
removedDockerHostsTTL = 24 * time.Hour // Clean up removed hosts tracking after 24 hours
|
|
)
|
|
|
|
// recoverFromPanic recovers from panics in monitoring goroutines and logs them.
|
|
// This prevents a panic in one component from crashing the entire monitoring system.
|
|
func recoverFromPanic(goroutineName string) {
|
|
if r := recover(); r != nil {
|
|
log.Error().
|
|
Str("goroutine", goroutineName).
|
|
Interface("panic", r).
|
|
Stack().
|
|
Msg("Recovered from panic in monitoring goroutine")
|
|
}
|
|
}
|
|
|
|
// cleanupRemovedDockerHosts removes entries from the removed hosts map that are older than 24 hours.
|
|
func (m *Monitor) cleanupRemovedDockerHosts(now time.Time) {
|
|
// Collect IDs to remove first to avoid holding lock during state update
|
|
var toRemove []string
|
|
|
|
m.mu.Lock()
|
|
for hostID, removedAt := range m.removedDockerHosts {
|
|
if now.Sub(removedAt) > removedDockerHostsTTL {
|
|
toRemove = append(toRemove, hostID)
|
|
}
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
// Remove from state and map without holding both locks
|
|
for _, hostID := range toRemove {
|
|
m.state.RemoveRemovedDockerHost(hostID)
|
|
|
|
m.mu.Lock()
|
|
removedAt := m.removedDockerHosts[hostID]
|
|
delete(m.removedDockerHosts, hostID)
|
|
m.mu.Unlock()
|
|
|
|
log.Debug().
|
|
Str("dockerHostID", hostID).
|
|
Time("removedAt", removedAt).
|
|
Msg("Cleaned up old removed Docker host entry")
|
|
}
|
|
}
|
|
|
|
// cleanupGuestMetadataCache removes stale guest metadata entries.
|
|
// Entries older than 2x the cache TTL (10 minutes) are removed to prevent unbounded growth
|
|
// when VMs are deleted or moved.
|
|
func (m *Monitor) cleanupGuestMetadataCache(now time.Time) {
|
|
const maxAge = 2 * guestMetadataCacheTTL // 10 minutes
|
|
|
|
m.guestMetadataMu.Lock()
|
|
defer m.guestMetadataMu.Unlock()
|
|
|
|
for key, entry := range m.guestMetadataCache {
|
|
if now.Sub(entry.fetchedAt) > maxAge {
|
|
delete(m.guestMetadataCache, key)
|
|
log.Debug().
|
|
Str("key", key).
|
|
Time("fetchedAt", entry.fetchedAt).
|
|
Msg("Cleaned up stale guest metadata cache entry")
|
|
}
|
|
}
|
|
}
|
|
|
|
// cleanupDiagnosticSnapshots removes stale diagnostic snapshots.
|
|
// Snapshots older than 1 hour are removed to prevent unbounded growth
|
|
// when nodes/VMs are deleted or reconfigured.
|
|
func (m *Monitor) cleanupDiagnosticSnapshots(now time.Time) {
|
|
const maxAge = 1 * time.Hour
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
for key, snapshot := range m.nodeSnapshots {
|
|
if now.Sub(snapshot.RetrievedAt) > maxAge {
|
|
delete(m.nodeSnapshots, key)
|
|
log.Debug().
|
|
Str("key", key).
|
|
Time("retrievedAt", snapshot.RetrievedAt).
|
|
Msg("Cleaned up stale node snapshot")
|
|
}
|
|
}
|
|
|
|
for key, snapshot := range m.guestSnapshots {
|
|
if now.Sub(snapshot.RetrievedAt) > maxAge {
|
|
delete(m.guestSnapshots, key)
|
|
log.Debug().
|
|
Str("key", key).
|
|
Time("retrievedAt", snapshot.RetrievedAt).
|
|
Msg("Cleaned up stale guest snapshot")
|
|
}
|
|
}
|
|
}
|
|
|
|
// cleanupRRDCache removes stale RRD memory cache entries.
|
|
// Entries older than 2x the cache TTL (1 minute) are removed to prevent unbounded growth
|
|
// when nodes are removed from the cluster.
|
|
func (m *Monitor) cleanupRRDCache(now time.Time) {
|
|
const maxAge = 2 * nodeRRDCacheTTL // 1 minute
|
|
|
|
m.rrdCacheMu.Lock()
|
|
defer m.rrdCacheMu.Unlock()
|
|
|
|
for key, entry := range m.nodeRRDMemCache {
|
|
if now.Sub(entry.fetchedAt) > maxAge {
|
|
delete(m.nodeRRDMemCache, key)
|
|
log.Debug().
|
|
Str("node", key).
|
|
Time("fetchedAt", entry.fetchedAt).
|
|
Msg("Cleaned up stale RRD cache entry")
|
|
}
|
|
}
|
|
}
|
|
|
|
// evaluateDockerAgents updates health for Docker hosts based on last report time.
|
|
func (m *Monitor) evaluateDockerAgents(now time.Time) {
|
|
hosts := m.state.GetDockerHosts()
|
|
for _, host := range hosts {
|
|
interval := host.IntervalSeconds
|
|
if interval <= 0 {
|
|
interval = int(dockerMinimumHealthWindow / time.Second)
|
|
}
|
|
|
|
window := time.Duration(interval) * time.Second * dockerOfflineGraceMultiplier
|
|
if window < dockerMinimumHealthWindow {
|
|
window = dockerMinimumHealthWindow
|
|
} else if window > dockerMaximumHealthWindow {
|
|
window = dockerMaximumHealthWindow
|
|
}
|
|
|
|
healthy := !host.LastSeen.IsZero() && now.Sub(host.LastSeen) <= window
|
|
key := dockerConnectionPrefix + host.ID
|
|
m.state.SetConnectionHealth(key, healthy)
|
|
hostCopy := host
|
|
if healthy {
|
|
hostCopy.Status = "online"
|
|
m.state.SetDockerHostStatus(host.ID, "online")
|
|
if m.alertManager != nil {
|
|
m.alertManager.HandleDockerHostOnline(hostCopy)
|
|
}
|
|
} else {
|
|
hostCopy.Status = "offline"
|
|
m.state.SetDockerHostStatus(host.ID, "offline")
|
|
if m.alertManager != nil {
|
|
m.alertManager.HandleDockerHostOffline(hostCopy)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// evaluateHostAgents updates health for host agents based on last report time.
|
|
func (m *Monitor) evaluateHostAgents(now time.Time) {
|
|
hosts := m.state.GetHosts()
|
|
for _, host := range hosts {
|
|
interval := host.IntervalSeconds
|
|
if interval <= 0 {
|
|
interval = int(hostMinimumHealthWindow / time.Second)
|
|
}
|
|
|
|
window := time.Duration(interval) * time.Second * hostOfflineGraceMultiplier
|
|
if window < hostMinimumHealthWindow {
|
|
window = hostMinimumHealthWindow
|
|
} else if window > hostMaximumHealthWindow {
|
|
window = hostMaximumHealthWindow
|
|
}
|
|
|
|
healthy := !host.LastSeen.IsZero() && now.Sub(host.LastSeen) <= window
|
|
key := hostConnectionPrefix + host.ID
|
|
m.state.SetConnectionHealth(key, healthy)
|
|
|
|
hostCopy := host
|
|
if healthy {
|
|
hostCopy.Status = "online"
|
|
m.state.SetHostStatus(host.ID, "online")
|
|
if m.alertManager != nil {
|
|
m.alertManager.HandleHostOnline(hostCopy)
|
|
}
|
|
} else {
|
|
hostCopy.Status = "offline"
|
|
m.state.SetHostStatus(host.ID, "offline")
|
|
if m.alertManager != nil {
|
|
m.alertManager.HandleHostOffline(hostCopy)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// sortContent sorts comma-separated content values for consistent display
|
|
func sortContent(content string) string {
|
|
if content == "" {
|
|
return ""
|
|
}
|
|
parts := strings.Split(content, ",")
|
|
sort.Strings(parts)
|
|
return strings.Join(parts, ",")
|
|
}
|
|
|
|
func (m *Monitor) tryReserveGuestMetadataFetch(key string, now time.Time) bool {
|
|
if m == nil {
|
|
return false
|
|
}
|
|
m.guestMetadataLimiterMu.Lock()
|
|
defer m.guestMetadataLimiterMu.Unlock()
|
|
|
|
if next, ok := m.guestMetadataLimiter[key]; ok && now.Before(next) {
|
|
return false
|
|
}
|
|
hold := m.guestMetadataHoldDuration
|
|
if hold <= 0 {
|
|
hold = defaultGuestMetadataHold
|
|
}
|
|
m.guestMetadataLimiter[key] = now.Add(hold)
|
|
return true
|
|
}
|
|
|
|
func (m *Monitor) scheduleNextGuestMetadataFetch(key string, now time.Time) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
interval := m.guestMetadataMinRefresh
|
|
if interval <= 0 {
|
|
interval = config.DefaultGuestMetadataMinRefresh
|
|
}
|
|
jitter := m.guestMetadataRefreshJitter
|
|
if jitter > 0 && m.rng != nil {
|
|
interval += time.Duration(m.rng.Int63n(int64(jitter)))
|
|
}
|
|
m.guestMetadataLimiterMu.Lock()
|
|
m.guestMetadataLimiter[key] = now.Add(interval)
|
|
m.guestMetadataLimiterMu.Unlock()
|
|
}
|
|
|
|
func (m *Monitor) deferGuestMetadataRetry(key string, now time.Time) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
backoff := m.guestMetadataRetryBackoff
|
|
if backoff <= 0 {
|
|
backoff = config.DefaultGuestMetadataRetryBackoff
|
|
}
|
|
m.guestMetadataLimiterMu.Lock()
|
|
m.guestMetadataLimiter[key] = now.Add(backoff)
|
|
m.guestMetadataLimiterMu.Unlock()
|
|
}
|
|
|
|
func (m *Monitor) acquireGuestMetadataSlot(ctx context.Context) bool {
|
|
if m == nil || m.guestMetadataSlots == nil {
|
|
return true
|
|
}
|
|
select {
|
|
case m.guestMetadataSlots <- struct{}{}:
|
|
return true
|
|
case <-ctx.Done():
|
|
return false
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) releaseGuestMetadataSlot() {
|
|
if m == nil || m.guestMetadataSlots == nil {
|
|
return
|
|
}
|
|
select {
|
|
case <-m.guestMetadataSlots:
|
|
default:
|
|
}
|
|
}
|
|
|
|
// retryGuestAgentCall executes a guest agent API call with timeout and retry logic (refs #592)
|
|
func (m *Monitor) retryGuestAgentCall(ctx context.Context, timeout time.Duration, maxRetries int, fn func(context.Context) (interface{}, error)) (interface{}, error) {
|
|
var lastErr error
|
|
for attempt := 0; attempt <= maxRetries; attempt++ {
|
|
callCtx, cancel := context.WithTimeout(ctx, timeout)
|
|
result, err := fn(callCtx)
|
|
cancel()
|
|
|
|
if err == nil {
|
|
return result, nil
|
|
}
|
|
|
|
lastErr = err
|
|
|
|
// Don't retry non-timeout errors or if this was the last attempt
|
|
if attempt >= maxRetries || !strings.Contains(err.Error(), "timeout") {
|
|
break
|
|
}
|
|
|
|
// Brief delay before retry to avoid hammering the API
|
|
select {
|
|
case <-time.After(defaultGuestAgentRetryDelay):
|
|
case <-ctx.Done():
|
|
return nil, ctx.Err()
|
|
}
|
|
}
|
|
|
|
return nil, lastErr
|
|
}
|
|
|
|
func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientInterface, instanceName, nodeName, vmName string, vmid int, vmStatus *proxmox.VMStatus) ([]string, []models.GuestNetworkInterface, string, string, string) {
|
|
if vmStatus == nil || client == nil {
|
|
m.clearGuestMetadataCache(instanceName, nodeName, vmid)
|
|
return nil, nil, "", "", ""
|
|
}
|
|
|
|
if vmStatus.Agent.Value <= 0 {
|
|
m.clearGuestMetadataCache(instanceName, nodeName, vmid)
|
|
return nil, nil, "", "", ""
|
|
}
|
|
|
|
key := guestMetadataCacheKey(instanceName, nodeName, vmid)
|
|
now := time.Now()
|
|
|
|
m.guestMetadataMu.RLock()
|
|
cached, ok := m.guestMetadataCache[key]
|
|
m.guestMetadataMu.RUnlock()
|
|
|
|
if ok && now.Sub(cached.fetchedAt) < guestMetadataCacheTTL {
|
|
return cloneStringSlice(cached.ipAddresses), cloneGuestNetworkInterfaces(cached.networkInterfaces), cached.osName, cached.osVersion, cached.agentVersion
|
|
}
|
|
|
|
needsFetch := !ok || now.Sub(cached.fetchedAt) >= guestMetadataCacheTTL
|
|
if !needsFetch {
|
|
return cloneStringSlice(cached.ipAddresses), cloneGuestNetworkInterfaces(cached.networkInterfaces), cached.osName, cached.osVersion, cached.agentVersion
|
|
}
|
|
|
|
reserved := m.tryReserveGuestMetadataFetch(key, now)
|
|
if !reserved && ok {
|
|
return cloneStringSlice(cached.ipAddresses), cloneGuestNetworkInterfaces(cached.networkInterfaces), cached.osName, cached.osVersion, cached.agentVersion
|
|
}
|
|
if !reserved && !ok {
|
|
reserved = true
|
|
}
|
|
|
|
// Start with cached values as fallback in case new calls fail
|
|
ipAddresses := cloneStringSlice(cached.ipAddresses)
|
|
networkIfaces := cloneGuestNetworkInterfaces(cached.networkInterfaces)
|
|
osName := cached.osName
|
|
osVersion := cached.osVersion
|
|
agentVersion := cached.agentVersion
|
|
|
|
if reserved {
|
|
if !m.acquireGuestMetadataSlot(ctx) {
|
|
m.deferGuestMetadataRetry(key, time.Now())
|
|
return ipAddresses, networkIfaces, osName, osVersion, agentVersion
|
|
}
|
|
defer m.releaseGuestMetadataSlot()
|
|
defer func() {
|
|
m.scheduleNextGuestMetadataFetch(key, time.Now())
|
|
}()
|
|
}
|
|
|
|
// Network interfaces with configurable timeout and retry (refs #592)
|
|
interfaces, err := m.retryGuestAgentCall(ctx, m.guestAgentNetworkTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
|
|
return client.GetVMNetworkInterfaces(ctx, nodeName, vmid)
|
|
})
|
|
if err != nil {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", vmName).
|
|
Int("vmid", vmid).
|
|
Err(err).
|
|
Msg("Guest agent network interfaces unavailable")
|
|
} else if ifaces, ok := interfaces.([]proxmox.VMNetworkInterface); ok && len(ifaces) > 0 {
|
|
ipAddresses, networkIfaces = processGuestNetworkInterfaces(ifaces)
|
|
} else {
|
|
ipAddresses = nil
|
|
networkIfaces = nil
|
|
}
|
|
|
|
// OS info with configurable timeout and retry (refs #592)
|
|
// Skip OS info calls if we've seen repeated failures (refs #692 - OpenBSD qemu-ga issue)
|
|
osInfoFailureCount := cached.osInfoFailureCount
|
|
osInfoSkip := cached.osInfoSkip
|
|
|
|
if !osInfoSkip {
|
|
agentInfoRaw, err := m.retryGuestAgentCall(ctx, m.guestAgentOSInfoTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
|
|
return client.GetVMAgentInfo(ctx, nodeName, vmid)
|
|
})
|
|
if err != nil {
|
|
if isGuestAgentOSInfoUnsupportedError(err) {
|
|
osInfoSkip = true
|
|
osInfoFailureCount = guestAgentOSInfoFailureThreshold
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Str("vm", vmName).
|
|
Int("vmid", vmid).
|
|
Err(err).
|
|
Msg("Guest agent OS info unsupported (missing os-release). Skipping future calls to avoid qemu-ga issues (refs #692)")
|
|
} else {
|
|
osInfoFailureCount++
|
|
if osInfoFailureCount >= guestAgentOSInfoFailureThreshold {
|
|
osInfoSkip = true
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", vmName).
|
|
Int("vmid", vmid).
|
|
Int("failureCount", osInfoFailureCount).
|
|
Msg("Guest agent OS info consistently fails, skipping future calls to avoid triggering buggy guest agents")
|
|
} else {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", vmName).
|
|
Int("vmid", vmid).
|
|
Int("failureCount", osInfoFailureCount).
|
|
Err(err).
|
|
Msg("Guest agent OS info unavailable")
|
|
}
|
|
}
|
|
} else if agentInfo, ok := agentInfoRaw.(map[string]interface{}); ok && len(agentInfo) > 0 {
|
|
osName, osVersion = extractGuestOSInfo(agentInfo)
|
|
osInfoFailureCount = 0 // Reset on success
|
|
osInfoSkip = false
|
|
} else {
|
|
osName = ""
|
|
osVersion = ""
|
|
}
|
|
} else {
|
|
// Skipping OS info call due to repeated failures
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", vmName).
|
|
Int("vmid", vmid).
|
|
Msg("Skipping guest agent OS info call (disabled after repeated failures)")
|
|
}
|
|
|
|
// Agent version with configurable timeout and retry (refs #592)
|
|
versionRaw, err := m.retryGuestAgentCall(ctx, m.guestAgentVersionTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
|
|
return client.GetVMAgentVersion(ctx, nodeName, vmid)
|
|
})
|
|
if err != nil {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", vmName).
|
|
Int("vmid", vmid).
|
|
Err(err).
|
|
Msg("Guest agent version unavailable")
|
|
} else if version, ok := versionRaw.(string); ok && version != "" {
|
|
agentVersion = version
|
|
} else {
|
|
agentVersion = ""
|
|
}
|
|
|
|
entry := guestMetadataCacheEntry{
|
|
ipAddresses: cloneStringSlice(ipAddresses),
|
|
networkInterfaces: cloneGuestNetworkInterfaces(networkIfaces),
|
|
osName: osName,
|
|
osVersion: osVersion,
|
|
agentVersion: agentVersion,
|
|
fetchedAt: time.Now(),
|
|
osInfoFailureCount: osInfoFailureCount,
|
|
osInfoSkip: osInfoSkip,
|
|
}
|
|
|
|
m.guestMetadataMu.Lock()
|
|
if m.guestMetadataCache == nil {
|
|
m.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
|
|
}
|
|
m.guestMetadataCache[key] = entry
|
|
m.guestMetadataMu.Unlock()
|
|
|
|
return ipAddresses, networkIfaces, osName, osVersion, agentVersion
|
|
}
|
|
|
|
func guestMetadataCacheKey(instanceName, nodeName string, vmid int) string {
|
|
return fmt.Sprintf("%s|%s|%d", instanceName, nodeName, vmid)
|
|
}
|
|
|
|
func (m *Monitor) clearGuestMetadataCache(instanceName, nodeName string, vmid int) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
|
|
key := guestMetadataCacheKey(instanceName, nodeName, vmid)
|
|
m.guestMetadataMu.Lock()
|
|
if m.guestMetadataCache != nil {
|
|
delete(m.guestMetadataCache, key)
|
|
}
|
|
m.guestMetadataMu.Unlock()
|
|
}
|
|
|
|
func cloneStringSlice(src []string) []string {
|
|
if len(src) == 0 {
|
|
return nil
|
|
}
|
|
dst := make([]string, len(src))
|
|
copy(dst, src)
|
|
return dst
|
|
}
|
|
|
|
func cloneGuestNetworkInterfaces(src []models.GuestNetworkInterface) []models.GuestNetworkInterface {
|
|
if len(src) == 0 {
|
|
return nil
|
|
}
|
|
dst := make([]models.GuestNetworkInterface, len(src))
|
|
for i, iface := range src {
|
|
dst[i] = iface
|
|
if len(iface.Addresses) > 0 {
|
|
dst[i].Addresses = cloneStringSlice(iface.Addresses)
|
|
}
|
|
}
|
|
return dst
|
|
}
|
|
|
|
func processGuestNetworkInterfaces(raw []proxmox.VMNetworkInterface) ([]string, []models.GuestNetworkInterface) {
|
|
ipSet := make(map[string]struct{})
|
|
ipAddresses := make([]string, 0)
|
|
guestIfaces := make([]models.GuestNetworkInterface, 0, len(raw))
|
|
|
|
for _, iface := range raw {
|
|
ifaceName := strings.TrimSpace(iface.Name)
|
|
mac := strings.TrimSpace(iface.HardwareAddr)
|
|
|
|
addrSet := make(map[string]struct{})
|
|
addresses := make([]string, 0, len(iface.IPAddresses))
|
|
|
|
for _, addr := range iface.IPAddresses {
|
|
ip := strings.TrimSpace(addr.Address)
|
|
if ip == "" {
|
|
continue
|
|
}
|
|
lower := strings.ToLower(ip)
|
|
if strings.HasPrefix(ip, "127.") || strings.HasPrefix(lower, "fe80") || ip == "::1" {
|
|
continue
|
|
}
|
|
|
|
if _, exists := addrSet[ip]; !exists {
|
|
addrSet[ip] = struct{}{}
|
|
addresses = append(addresses, ip)
|
|
}
|
|
|
|
if _, exists := ipSet[ip]; !exists {
|
|
ipSet[ip] = struct{}{}
|
|
ipAddresses = append(ipAddresses, ip)
|
|
}
|
|
}
|
|
|
|
if len(addresses) > 1 {
|
|
sort.Strings(addresses)
|
|
}
|
|
|
|
rxBytes := parseInterfaceStat(iface.Statistics, "rx-bytes")
|
|
txBytes := parseInterfaceStat(iface.Statistics, "tx-bytes")
|
|
|
|
if len(addresses) == 0 && rxBytes == 0 && txBytes == 0 {
|
|
continue
|
|
}
|
|
|
|
guestIfaces = append(guestIfaces, models.GuestNetworkInterface{
|
|
Name: ifaceName,
|
|
MAC: mac,
|
|
Addresses: addresses,
|
|
RXBytes: rxBytes,
|
|
TXBytes: txBytes,
|
|
})
|
|
}
|
|
|
|
if len(ipAddresses) > 1 {
|
|
sort.Strings(ipAddresses)
|
|
}
|
|
|
|
if len(guestIfaces) > 1 {
|
|
sort.SliceStable(guestIfaces, func(i, j int) bool {
|
|
return guestIfaces[i].Name < guestIfaces[j].Name
|
|
})
|
|
}
|
|
|
|
return ipAddresses, guestIfaces
|
|
}
|
|
|
|
func parseInterfaceStat(stats interface{}, key string) int64 {
|
|
if stats == nil {
|
|
return 0
|
|
}
|
|
statsMap, ok := stats.(map[string]interface{})
|
|
if !ok {
|
|
return 0
|
|
}
|
|
val, ok := statsMap[key]
|
|
if !ok {
|
|
return 0
|
|
}
|
|
return anyToInt64(val)
|
|
}
|
|
|
|
func extractGuestOSInfo(data map[string]interface{}) (string, string) {
|
|
if data == nil {
|
|
return "", ""
|
|
}
|
|
|
|
if result, ok := data["result"]; ok {
|
|
if resultMap, ok := result.(map[string]interface{}); ok {
|
|
data = resultMap
|
|
}
|
|
}
|
|
|
|
name := stringValue(data["name"])
|
|
prettyName := stringValue(data["pretty-name"])
|
|
version := stringValue(data["version"])
|
|
versionID := stringValue(data["version-id"])
|
|
|
|
osName := name
|
|
if osName == "" {
|
|
osName = prettyName
|
|
}
|
|
if osName == "" {
|
|
osName = stringValue(data["id"])
|
|
}
|
|
|
|
osVersion := version
|
|
if osVersion == "" && versionID != "" {
|
|
osVersion = versionID
|
|
}
|
|
if osVersion == "" && prettyName != "" && prettyName != osName {
|
|
osVersion = prettyName
|
|
}
|
|
if osVersion == "" {
|
|
osVersion = stringValue(data["kernel-release"])
|
|
}
|
|
if osVersion == osName {
|
|
osVersion = ""
|
|
}
|
|
|
|
return osName, osVersion
|
|
}
|
|
|
|
func isGuestAgentOSInfoUnsupportedError(err error) bool {
|
|
if err == nil {
|
|
return false
|
|
}
|
|
|
|
msg := strings.ToLower(err.Error())
|
|
|
|
// OpenBSD qemu-ga emits "Failed to open file '/etc/os-release'" (refs #692)
|
|
if strings.Contains(msg, "os-release") &&
|
|
(strings.Contains(msg, "failed to open file") || strings.Contains(msg, "no such file or directory")) {
|
|
return true
|
|
}
|
|
|
|
// Some Proxmox builds bubble up "unsupported command: guest-get-osinfo"
|
|
if strings.Contains(msg, "guest-get-osinfo") && strings.Contains(msg, "unsupported") {
|
|
return true
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
func stringValue(val interface{}) string {
|
|
switch v := val.(type) {
|
|
case string:
|
|
return strings.TrimSpace(v)
|
|
case json.Number:
|
|
return strings.TrimSpace(v.String())
|
|
case fmt.Stringer:
|
|
return strings.TrimSpace(v.String())
|
|
case float64:
|
|
return strings.TrimSpace(strconv.FormatFloat(v, 'f', -1, 64))
|
|
case float32:
|
|
return strings.TrimSpace(strconv.FormatFloat(float64(v), 'f', -1, 32))
|
|
case int:
|
|
return strconv.Itoa(v)
|
|
case int32:
|
|
return strconv.FormatInt(int64(v), 10)
|
|
case int64:
|
|
return strconv.FormatInt(v, 10)
|
|
case uint32:
|
|
return strconv.FormatUint(uint64(v), 10)
|
|
case uint64:
|
|
return strconv.FormatUint(v, 10)
|
|
default:
|
|
return ""
|
|
}
|
|
}
|
|
|
|
func anyToInt64(val interface{}) int64 {
|
|
switch v := val.(type) {
|
|
case int:
|
|
return int64(v)
|
|
case int32:
|
|
return int64(v)
|
|
case int64:
|
|
return v
|
|
case uint32:
|
|
return int64(v)
|
|
case uint64:
|
|
if v > math.MaxInt64 {
|
|
return math.MaxInt64
|
|
}
|
|
return int64(v)
|
|
case float32:
|
|
return int64(v)
|
|
case float64:
|
|
return int64(v)
|
|
case string:
|
|
if v == "" {
|
|
return 0
|
|
}
|
|
if parsed, err := strconv.ParseInt(v, 10, 64); err == nil {
|
|
return parsed
|
|
}
|
|
if parsedFloat, err := strconv.ParseFloat(v, 64); err == nil {
|
|
return int64(parsedFloat)
|
|
}
|
|
case json.Number:
|
|
if parsed, err := v.Int64(); err == nil {
|
|
return parsed
|
|
}
|
|
if parsedFloat, err := v.Float64(); err == nil {
|
|
return int64(parsedFloat)
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (m *Monitor) enrichContainerMetadata(ctx context.Context, client PVEClientInterface, instanceName, nodeName string, container *models.Container) {
|
|
if container == nil {
|
|
return
|
|
}
|
|
|
|
ensureContainerRootDiskEntry(container)
|
|
|
|
if client == nil || container.Status != "running" {
|
|
return
|
|
}
|
|
|
|
statusCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
|
status, err := client.GetContainerStatus(statusCtx, nodeName, container.VMID)
|
|
cancel()
|
|
if err != nil {
|
|
log.Debug().
|
|
Err(err).
|
|
Str("instance", instanceName).
|
|
Str("node", nodeName).
|
|
Str("container", container.Name).
|
|
Int("vmid", container.VMID).
|
|
Msg("Container status metadata unavailable")
|
|
return
|
|
}
|
|
if status == nil {
|
|
return
|
|
}
|
|
|
|
rootDeviceHint := ""
|
|
var mountMetadata map[string]containerMountMetadata
|
|
addressSet := make(map[string]struct{})
|
|
addressOrder := make([]string, 0, 4)
|
|
|
|
addAddress := func(addr string) {
|
|
addr = strings.TrimSpace(addr)
|
|
if addr == "" {
|
|
return
|
|
}
|
|
if _, exists := addressSet[addr]; exists {
|
|
return
|
|
}
|
|
addressSet[addr] = struct{}{}
|
|
addressOrder = append(addressOrder, addr)
|
|
}
|
|
|
|
for _, addr := range sanitizeGuestAddressStrings(status.IP) {
|
|
addAddress(addr)
|
|
}
|
|
for _, addr := range sanitizeGuestAddressStrings(status.IP6) {
|
|
addAddress(addr)
|
|
}
|
|
for _, addr := range parseContainerRawIPs(status.IPv4) {
|
|
addAddress(addr)
|
|
}
|
|
for _, addr := range parseContainerRawIPs(status.IPv6) {
|
|
addAddress(addr)
|
|
}
|
|
|
|
networkIfaces := make([]models.GuestNetworkInterface, 0, len(status.Network))
|
|
for rawName, cfg := range status.Network {
|
|
if cfg == (proxmox.ContainerNetworkConfig{}) {
|
|
continue
|
|
}
|
|
|
|
iface := models.GuestNetworkInterface{}
|
|
name := strings.TrimSpace(cfg.Name)
|
|
if name == "" {
|
|
name = strings.TrimSpace(rawName)
|
|
}
|
|
if name != "" {
|
|
iface.Name = name
|
|
}
|
|
if mac := strings.TrimSpace(cfg.HWAddr); mac != "" {
|
|
iface.MAC = mac
|
|
}
|
|
|
|
addrCandidates := make([]string, 0, 4)
|
|
addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IP)...)
|
|
addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IP6)...)
|
|
addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IPv4)...)
|
|
addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IPv6)...)
|
|
|
|
if len(addrCandidates) > 0 {
|
|
deduped := dedupeStringsPreserveOrder(addrCandidates)
|
|
if len(deduped) > 0 {
|
|
iface.Addresses = deduped
|
|
for _, addr := range deduped {
|
|
addAddress(addr)
|
|
}
|
|
}
|
|
}
|
|
|
|
if iface.Name != "" || iface.MAC != "" || len(iface.Addresses) > 0 {
|
|
networkIfaces = append(networkIfaces, iface)
|
|
}
|
|
}
|
|
|
|
configCtx, cancelConfig := context.WithTimeout(ctx, 5*time.Second)
|
|
configData, configErr := client.GetContainerConfig(configCtx, nodeName, container.VMID)
|
|
cancelConfig()
|
|
if configErr != nil {
|
|
log.Debug().
|
|
Err(configErr).
|
|
Str("instance", instanceName).
|
|
Str("node", nodeName).
|
|
Str("container", container.Name).
|
|
Int("vmid", container.VMID).
|
|
Msg("Container config metadata unavailable")
|
|
} else if len(configData) > 0 {
|
|
mountMetadata = parseContainerMountMetadata(configData)
|
|
if rootDeviceHint == "" {
|
|
if meta, ok := mountMetadata["rootfs"]; ok && meta.Source != "" {
|
|
rootDeviceHint = meta.Source
|
|
}
|
|
}
|
|
if rootDeviceHint == "" {
|
|
if hint := extractContainerRootDeviceFromConfig(configData); hint != "" {
|
|
rootDeviceHint = hint
|
|
}
|
|
}
|
|
for _, detail := range parseContainerConfigNetworks(configData) {
|
|
if len(detail.Addresses) > 0 {
|
|
for _, addr := range detail.Addresses {
|
|
addAddress(addr)
|
|
}
|
|
}
|
|
mergeContainerNetworkInterface(&networkIfaces, detail)
|
|
}
|
|
}
|
|
|
|
if len(addressOrder) == 0 {
|
|
interfacesCtx, cancelInterfaces := context.WithTimeout(ctx, 5*time.Second)
|
|
ifaceDetails, ifaceErr := client.GetContainerInterfaces(interfacesCtx, nodeName, container.VMID)
|
|
cancelInterfaces()
|
|
if ifaceErr != nil {
|
|
log.Debug().
|
|
Err(ifaceErr).
|
|
Str("instance", instanceName).
|
|
Str("node", nodeName).
|
|
Str("container", container.Name).
|
|
Int("vmid", container.VMID).
|
|
Msg("Container interface metadata unavailable")
|
|
} else if len(ifaceDetails) > 0 {
|
|
for _, detail := range ifaceDetails {
|
|
parsed := containerNetworkDetails{}
|
|
parsed.Name = strings.TrimSpace(detail.Name)
|
|
parsed.MAC = strings.ToUpper(strings.TrimSpace(detail.HWAddr))
|
|
|
|
for _, addr := range detail.IPAddresses {
|
|
stripped := strings.TrimSpace(addr.Address)
|
|
if stripped == "" {
|
|
continue
|
|
}
|
|
if slash := strings.Index(stripped, "/"); slash > 0 {
|
|
stripped = stripped[:slash]
|
|
}
|
|
parsed.Addresses = append(parsed.Addresses, sanitizeGuestAddressStrings(stripped)...)
|
|
}
|
|
|
|
if len(parsed.Addresses) == 0 && strings.TrimSpace(detail.Inet) != "" {
|
|
parts := strings.Fields(detail.Inet)
|
|
for _, part := range parts {
|
|
stripped := strings.TrimSpace(part)
|
|
if stripped == "" {
|
|
continue
|
|
}
|
|
if slash := strings.Index(stripped, "/"); slash > 0 {
|
|
stripped = stripped[:slash]
|
|
}
|
|
parsed.Addresses = append(parsed.Addresses, sanitizeGuestAddressStrings(stripped)...)
|
|
}
|
|
}
|
|
|
|
parsed.Addresses = dedupeStringsPreserveOrder(parsed.Addresses)
|
|
|
|
if len(parsed.Addresses) > 0 {
|
|
for _, addr := range parsed.Addresses {
|
|
addAddress(addr)
|
|
}
|
|
}
|
|
|
|
if parsed.Name != "" || parsed.MAC != "" || len(parsed.Addresses) > 0 {
|
|
mergeContainerNetworkInterface(&networkIfaces, parsed)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(networkIfaces) > 1 {
|
|
sort.SliceStable(networkIfaces, func(i, j int) bool {
|
|
left := strings.TrimSpace(networkIfaces[i].Name)
|
|
right := strings.TrimSpace(networkIfaces[j].Name)
|
|
return left < right
|
|
})
|
|
}
|
|
|
|
if len(addressOrder) > 1 {
|
|
sort.Strings(addressOrder)
|
|
}
|
|
|
|
if len(addressOrder) > 0 {
|
|
container.IPAddresses = addressOrder
|
|
}
|
|
|
|
if len(networkIfaces) > 0 {
|
|
container.NetworkInterfaces = networkIfaces
|
|
}
|
|
|
|
if disks := convertContainerDiskInfo(status, mountMetadata); len(disks) > 0 {
|
|
container.Disks = disks
|
|
}
|
|
|
|
ensureContainerRootDiskEntry(container)
|
|
|
|
if rootDeviceHint != "" && len(container.Disks) > 0 {
|
|
for i := range container.Disks {
|
|
if container.Disks[i].Mountpoint == "/" && container.Disks[i].Device == "" {
|
|
container.Disks[i].Device = rootDeviceHint
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func ensureContainerRootDiskEntry(container *models.Container) {
|
|
if container == nil || len(container.Disks) > 0 {
|
|
return
|
|
}
|
|
|
|
total := container.Disk.Total
|
|
used := container.Disk.Used
|
|
if total > 0 && used > total {
|
|
used = total
|
|
}
|
|
|
|
free := total - used
|
|
if free < 0 {
|
|
free = 0
|
|
}
|
|
|
|
usage := container.Disk.Usage
|
|
if total > 0 && usage <= 0 {
|
|
usage = safePercentage(float64(used), float64(total))
|
|
}
|
|
|
|
container.Disks = []models.Disk{
|
|
{
|
|
Total: total,
|
|
Used: used,
|
|
Free: free,
|
|
Usage: usage,
|
|
Mountpoint: "/",
|
|
Type: "rootfs",
|
|
},
|
|
}
|
|
}
|
|
|
|
func convertContainerDiskInfo(status *proxmox.Container, metadata map[string]containerMountMetadata) []models.Disk {
|
|
if status == nil || len(status.DiskInfo) == 0 {
|
|
return nil
|
|
}
|
|
|
|
disks := make([]models.Disk, 0, len(status.DiskInfo))
|
|
for name, info := range status.DiskInfo {
|
|
total := clampToInt64(info.Total)
|
|
used := clampToInt64(info.Used)
|
|
if total > 0 && used > total {
|
|
used = total
|
|
}
|
|
free := total - used
|
|
if free < 0 {
|
|
free = 0
|
|
}
|
|
|
|
disk := models.Disk{
|
|
Total: total,
|
|
Used: used,
|
|
Free: free,
|
|
}
|
|
|
|
if total > 0 {
|
|
disk.Usage = safePercentage(float64(used), float64(total))
|
|
}
|
|
|
|
label := strings.TrimSpace(name)
|
|
lowerLabel := strings.ToLower(label)
|
|
mountpoint := ""
|
|
device := ""
|
|
|
|
if metadata != nil {
|
|
if meta, ok := metadata[lowerLabel]; ok {
|
|
mountpoint = strings.TrimSpace(meta.Mountpoint)
|
|
device = strings.TrimSpace(meta.Source)
|
|
}
|
|
}
|
|
|
|
if strings.EqualFold(label, "rootfs") || label == "" {
|
|
if mountpoint == "" {
|
|
mountpoint = "/"
|
|
}
|
|
disk.Type = "rootfs"
|
|
if device == "" {
|
|
device = sanitizeRootFSDevice(status.RootFS)
|
|
}
|
|
} else {
|
|
if mountpoint == "" {
|
|
mountpoint = label
|
|
}
|
|
if lowerLabel != "" {
|
|
disk.Type = lowerLabel
|
|
} else {
|
|
disk.Type = "disk"
|
|
}
|
|
}
|
|
|
|
disk.Mountpoint = mountpoint
|
|
if disk.Device == "" && device != "" {
|
|
disk.Device = device
|
|
}
|
|
|
|
disks = append(disks, disk)
|
|
}
|
|
|
|
if len(disks) > 1 {
|
|
sort.SliceStable(disks, func(i, j int) bool {
|
|
return disks[i].Mountpoint < disks[j].Mountpoint
|
|
})
|
|
}
|
|
|
|
return disks
|
|
}
|
|
|
|
func sanitizeRootFSDevice(root string) string {
|
|
root = strings.TrimSpace(root)
|
|
if root == "" {
|
|
return ""
|
|
}
|
|
if idx := strings.Index(root, ","); idx != -1 {
|
|
root = root[:idx]
|
|
}
|
|
return root
|
|
}
|
|
|
|
func parseContainerRawIPs(raw json.RawMessage) []string {
|
|
if len(raw) == 0 {
|
|
return nil
|
|
}
|
|
var data interface{}
|
|
if err := json.Unmarshal(raw, &data); err != nil {
|
|
return nil
|
|
}
|
|
return collectIPsFromInterface(data)
|
|
}
|
|
|
|
func collectIPsFromInterface(value interface{}) []string {
|
|
switch v := value.(type) {
|
|
case nil:
|
|
return nil
|
|
case string:
|
|
return sanitizeGuestAddressStrings(v)
|
|
case []interface{}:
|
|
results := make([]string, 0, len(v))
|
|
for _, item := range v {
|
|
results = append(results, collectIPsFromInterface(item)...)
|
|
}
|
|
return results
|
|
case []string:
|
|
results := make([]string, 0, len(v))
|
|
for _, item := range v {
|
|
results = append(results, sanitizeGuestAddressStrings(item)...)
|
|
}
|
|
return results
|
|
case map[string]interface{}:
|
|
results := make([]string, 0)
|
|
for _, key := range []string{"ip", "ip6", "ipv4", "ipv6", "address", "value"} {
|
|
if val, ok := v[key]; ok {
|
|
results = append(results, collectIPsFromInterface(val)...)
|
|
}
|
|
}
|
|
return results
|
|
case json.Number:
|
|
return sanitizeGuestAddressStrings(v.String())
|
|
default:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
func sanitizeGuestAddressStrings(value string) []string {
|
|
value = strings.TrimSpace(value)
|
|
if value == "" {
|
|
return nil
|
|
}
|
|
|
|
lower := strings.ToLower(value)
|
|
switch lower {
|
|
case "dhcp", "manual", "static", "auto", "none", "n/a", "unknown", "0.0.0.0", "::", "::1":
|
|
return nil
|
|
}
|
|
|
|
parts := strings.FieldsFunc(value, func(r rune) bool {
|
|
return unicode.IsSpace(r) || r == ',' || r == ';'
|
|
})
|
|
|
|
if len(parts) > 1 {
|
|
results := make([]string, 0, len(parts))
|
|
for _, part := range parts {
|
|
results = append(results, sanitizeGuestAddressStrings(part)...)
|
|
}
|
|
return results
|
|
}
|
|
|
|
if idx := strings.Index(value, "/"); idx > 0 {
|
|
value = strings.TrimSpace(value[:idx])
|
|
}
|
|
|
|
lower = strings.ToLower(value)
|
|
|
|
if idx := strings.Index(value, "%"); idx > 0 {
|
|
value = strings.TrimSpace(value[:idx])
|
|
lower = strings.ToLower(value)
|
|
}
|
|
|
|
if strings.HasPrefix(value, "127.") || strings.HasPrefix(lower, "0.0.0.0") {
|
|
return nil
|
|
}
|
|
|
|
if strings.HasPrefix(lower, "fe80") {
|
|
return nil
|
|
}
|
|
|
|
if strings.HasPrefix(lower, "::1") {
|
|
return nil
|
|
}
|
|
|
|
return []string{value}
|
|
}
|
|
|
|
func dedupeStringsPreserveOrder(values []string) []string {
|
|
if len(values) == 0 {
|
|
return nil
|
|
}
|
|
seen := make(map[string]struct{}, len(values))
|
|
result := make([]string, 0, len(values))
|
|
for _, v := range values {
|
|
v = strings.TrimSpace(v)
|
|
if v == "" {
|
|
continue
|
|
}
|
|
if _, ok := seen[v]; ok {
|
|
continue
|
|
}
|
|
seen[v] = struct{}{}
|
|
result = append(result, v)
|
|
}
|
|
if len(result) == 0 {
|
|
return nil
|
|
}
|
|
return result
|
|
}
|
|
|
|
type containerNetworkDetails struct {
|
|
Name string
|
|
MAC string
|
|
Addresses []string
|
|
}
|
|
|
|
type containerMountMetadata struct {
|
|
Key string
|
|
Mountpoint string
|
|
Source string
|
|
}
|
|
|
|
func parseContainerConfigNetworks(config map[string]interface{}) []containerNetworkDetails {
|
|
if len(config) == 0 {
|
|
return nil
|
|
}
|
|
|
|
keys := make([]string, 0, len(config))
|
|
for key := range config {
|
|
if strings.HasPrefix(strings.ToLower(strings.TrimSpace(key)), "net") {
|
|
keys = append(keys, key)
|
|
}
|
|
}
|
|
if len(keys) == 0 {
|
|
return nil
|
|
}
|
|
sort.Strings(keys)
|
|
|
|
results := make([]containerNetworkDetails, 0, len(keys))
|
|
for _, key := range keys {
|
|
raw := fmt.Sprint(config[key])
|
|
raw = strings.TrimSpace(raw)
|
|
if raw == "" {
|
|
continue
|
|
}
|
|
|
|
detail := containerNetworkDetails{}
|
|
parts := strings.Split(raw, ",")
|
|
for _, part := range parts {
|
|
kv := strings.SplitN(strings.TrimSpace(part), "=", 2)
|
|
if len(kv) != 2 {
|
|
continue
|
|
}
|
|
k := strings.ToLower(strings.TrimSpace(kv[0]))
|
|
value := strings.TrimSpace(kv[1])
|
|
switch k {
|
|
case "name":
|
|
detail.Name = value
|
|
case "hwaddr", "mac", "macaddr":
|
|
detail.MAC = strings.ToUpper(value)
|
|
case "ip", "ip6", "ips", "ip6addr", "ip6prefix":
|
|
detail.Addresses = append(detail.Addresses, sanitizeGuestAddressStrings(value)...)
|
|
}
|
|
}
|
|
|
|
if detail.Name == "" {
|
|
detail.Name = strings.TrimSpace(key)
|
|
}
|
|
if len(detail.Addresses) > 0 {
|
|
detail.Addresses = dedupeStringsPreserveOrder(detail.Addresses)
|
|
}
|
|
|
|
if detail.Name != "" || detail.MAC != "" || len(detail.Addresses) > 0 {
|
|
results = append(results, detail)
|
|
}
|
|
}
|
|
|
|
if len(results) == 0 {
|
|
return nil
|
|
}
|
|
|
|
return results
|
|
}
|
|
|
|
func parseContainerMountMetadata(config map[string]interface{}) map[string]containerMountMetadata {
|
|
if len(config) == 0 {
|
|
return nil
|
|
}
|
|
|
|
results := make(map[string]containerMountMetadata)
|
|
for rawKey, rawValue := range config {
|
|
key := strings.ToLower(strings.TrimSpace(rawKey))
|
|
if key != "rootfs" && !strings.HasPrefix(key, "mp") {
|
|
continue
|
|
}
|
|
|
|
value := strings.TrimSpace(fmt.Sprint(rawValue))
|
|
if value == "" {
|
|
continue
|
|
}
|
|
|
|
meta := containerMountMetadata{
|
|
Key: key,
|
|
}
|
|
|
|
parts := strings.Split(value, ",")
|
|
if len(parts) > 0 {
|
|
meta.Source = strings.TrimSpace(parts[0])
|
|
}
|
|
|
|
for _, part := range parts[1:] {
|
|
kv := strings.SplitN(strings.TrimSpace(part), "=", 2)
|
|
if len(kv) != 2 {
|
|
continue
|
|
}
|
|
k := strings.ToLower(strings.TrimSpace(kv[0]))
|
|
v := strings.TrimSpace(kv[1])
|
|
switch k {
|
|
case "mp", "mountpoint":
|
|
meta.Mountpoint = v
|
|
}
|
|
}
|
|
|
|
if meta.Mountpoint == "" && key == "rootfs" {
|
|
meta.Mountpoint = "/"
|
|
}
|
|
|
|
results[key] = meta
|
|
}
|
|
|
|
if len(results) == 0 {
|
|
return nil
|
|
}
|
|
|
|
return results
|
|
}
|
|
|
|
func mergeContainerNetworkInterface(target *[]models.GuestNetworkInterface, detail containerNetworkDetails) {
|
|
if target == nil {
|
|
return
|
|
}
|
|
if len(detail.Addresses) > 0 {
|
|
detail.Addresses = dedupeStringsPreserveOrder(detail.Addresses)
|
|
}
|
|
|
|
findMatch := func() int {
|
|
for i := range *target {
|
|
if detail.Name != "" && (*target)[i].Name != "" && strings.EqualFold((*target)[i].Name, detail.Name) {
|
|
return i
|
|
}
|
|
if detail.MAC != "" && (*target)[i].MAC != "" && strings.EqualFold((*target)[i].MAC, detail.MAC) {
|
|
return i
|
|
}
|
|
}
|
|
return -1
|
|
}
|
|
|
|
if idx := findMatch(); idx >= 0 {
|
|
if detail.Name != "" && (*target)[idx].Name == "" {
|
|
(*target)[idx].Name = detail.Name
|
|
}
|
|
if detail.MAC != "" && (*target)[idx].MAC == "" {
|
|
(*target)[idx].MAC = detail.MAC
|
|
}
|
|
if len(detail.Addresses) > 0 {
|
|
combined := append((*target)[idx].Addresses, detail.Addresses...)
|
|
(*target)[idx].Addresses = dedupeStringsPreserveOrder(combined)
|
|
}
|
|
return
|
|
}
|
|
|
|
newIface := models.GuestNetworkInterface{
|
|
Name: detail.Name,
|
|
MAC: detail.MAC,
|
|
}
|
|
if len(detail.Addresses) > 0 {
|
|
newIface.Addresses = dedupeStringsPreserveOrder(detail.Addresses)
|
|
}
|
|
*target = append(*target, newIface)
|
|
}
|
|
|
|
func extractContainerRootDeviceFromConfig(config map[string]interface{}) string {
|
|
if len(config) == 0 {
|
|
return ""
|
|
}
|
|
raw, ok := config["rootfs"]
|
|
if !ok {
|
|
return ""
|
|
}
|
|
|
|
value := strings.TrimSpace(fmt.Sprint(raw))
|
|
if value == "" {
|
|
return ""
|
|
}
|
|
|
|
parts := strings.Split(value, ",")
|
|
device := strings.TrimSpace(parts[0])
|
|
return device
|
|
}
|
|
|
|
// GetConnectionStatuses returns the current connection status for all nodes
|
|
func (m *Monitor) GetConnectionStatuses() map[string]bool {
|
|
if mock.IsMockEnabled() {
|
|
statuses := make(map[string]bool)
|
|
state := mock.GetMockState()
|
|
for _, node := range state.Nodes {
|
|
key := "pve-" + node.Name
|
|
statuses[key] = strings.ToLower(node.Status) == "online"
|
|
if node.Host != "" {
|
|
statuses[node.Host] = strings.ToLower(node.Status) == "online"
|
|
}
|
|
}
|
|
for _, pbsInst := range state.PBSInstances {
|
|
key := "pbs-" + pbsInst.Name
|
|
statuses[key] = strings.ToLower(pbsInst.Status) != "offline"
|
|
if pbsInst.Host != "" {
|
|
statuses[pbsInst.Host] = strings.ToLower(pbsInst.Status) != "offline"
|
|
}
|
|
}
|
|
|
|
for _, dockerHost := range state.DockerHosts {
|
|
key := dockerConnectionPrefix + dockerHost.ID
|
|
statuses[key] = strings.ToLower(dockerHost.Status) == "online"
|
|
}
|
|
return statuses
|
|
}
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
statuses := make(map[string]bool)
|
|
|
|
// Check all configured PVE nodes (not just ones with clients)
|
|
for _, pve := range m.config.PVEInstances {
|
|
key := "pve-" + pve.Name
|
|
// Check if we have a client for this node
|
|
if client, exists := m.pveClients[pve.Name]; exists && client != nil {
|
|
// We have a client, check actual connection health from state
|
|
if m.state != nil && m.state.ConnectionHealth != nil {
|
|
statuses[key] = m.state.ConnectionHealth[pve.Name]
|
|
} else {
|
|
statuses[key] = true // Assume connected if we have a client
|
|
}
|
|
} else {
|
|
// No client means disconnected
|
|
statuses[key] = false
|
|
}
|
|
}
|
|
|
|
// Check all configured PBS nodes (not just ones with clients)
|
|
for _, pbs := range m.config.PBSInstances {
|
|
key := "pbs-" + pbs.Name
|
|
// Check if we have a client for this node
|
|
if client, exists := m.pbsClients[pbs.Name]; exists && client != nil {
|
|
// We have a client, check actual connection health from state
|
|
if m.state != nil && m.state.ConnectionHealth != nil {
|
|
statuses[key] = m.state.ConnectionHealth["pbs-"+pbs.Name]
|
|
} else {
|
|
statuses[key] = true // Assume connected if we have a client
|
|
}
|
|
} else {
|
|
// No client means disconnected
|
|
statuses[key] = false
|
|
}
|
|
}
|
|
|
|
return statuses
|
|
}
|
|
|
|
// HasSocketTemperatureProxy reports whether the local unix socket proxy is available.
|
|
func (m *Monitor) HasSocketTemperatureProxy() bool {
|
|
// Always check the real socket path first so we reflect the actual runtime state
|
|
// even if the temperature collector hasn't latched onto the proxy yet.
|
|
if tempproxy.NewClient().IsAvailable() {
|
|
return true
|
|
}
|
|
|
|
if m == nil {
|
|
return false
|
|
}
|
|
|
|
m.mu.RLock()
|
|
collector := m.tempCollector
|
|
m.mu.RUnlock()
|
|
|
|
if collector == nil {
|
|
return false
|
|
}
|
|
return collector.SocketProxyDetected()
|
|
}
|
|
|
|
// SocketProxyHostDiagnostics exposes per-host proxy cooldown state for diagnostics.
|
|
func (m *Monitor) SocketProxyHostDiagnostics() []ProxyHostDiagnostics {
|
|
m.mu.RLock()
|
|
collector := m.tempCollector
|
|
m.mu.RUnlock()
|
|
|
|
if collector == nil {
|
|
return nil
|
|
}
|
|
|
|
return collector.ProxyHostDiagnostics()
|
|
}
|
|
|
|
// checkContainerizedTempMonitoring logs a security warning if Pulse is running
|
|
// in a container with SSH-based temperature monitoring enabled
|
|
func checkContainerizedTempMonitoring() {
|
|
// Check if running in container
|
|
isContainer := os.Getenv("PULSE_DOCKER") == "true" || system.InContainer()
|
|
if !isContainer {
|
|
return
|
|
}
|
|
|
|
// Check if SSH keys exist (indicates temperature monitoring is configured)
|
|
homeDir := os.Getenv("HOME")
|
|
if homeDir == "" {
|
|
homeDir = "/home/pulse"
|
|
}
|
|
sshKeyPath := homeDir + "/.ssh/id_ed25519"
|
|
if _, err := os.Stat(sshKeyPath); err != nil {
|
|
// No SSH key found, temperature monitoring not configured
|
|
return
|
|
}
|
|
|
|
// Log warning
|
|
log.Warn().
|
|
Msg("🔐 SECURITY NOTICE: Pulse is running in a container with SSH-based temperature monitoring enabled. " +
|
|
"SSH private keys are stored inside the container, which could be a security risk if the container is compromised. " +
|
|
"Future versions will use agent-based architecture for better security. " +
|
|
"See documentation for hardening recommendations.")
|
|
}
|
|
|
|
// New creates a new Monitor instance
|
|
func New(cfg *config.Config) (*Monitor, error) {
|
|
// Initialize temperature collector with sensors SSH key
|
|
// Will use root user for now - can be made configurable later
|
|
homeDir := os.Getenv("HOME")
|
|
if homeDir == "" {
|
|
homeDir = "/home/pulse"
|
|
}
|
|
sshKeyPath := filepath.Join(homeDir, ".ssh/id_ed25519_sensors")
|
|
tempCollector := NewTemperatureCollectorWithPort("root", sshKeyPath, cfg.SSHPort)
|
|
|
|
// Security warning if running in container with SSH temperature monitoring
|
|
checkContainerizedTempMonitoring()
|
|
|
|
if cfg != nil && cfg.TemperatureMonitoringEnabled {
|
|
isContainer := os.Getenv("PULSE_DOCKER") == "true" || system.InContainer()
|
|
if isContainer && tempCollector != nil && !tempCollector.SocketProxyAvailable() {
|
|
log.Warn().Msg("Temperature monitoring is enabled but the container does not have access to pulse-sensor-proxy. Install the proxy on the host or disable temperatures until it is available.")
|
|
}
|
|
}
|
|
|
|
stalenessTracker := NewStalenessTracker(getPollMetrics())
|
|
stalenessTracker.SetBounds(cfg.AdaptivePollingBaseInterval, cfg.AdaptivePollingMaxInterval)
|
|
taskQueue := NewTaskQueue()
|
|
deadLetterQueue := NewTaskQueue()
|
|
breakers := make(map[string]*circuitBreaker)
|
|
failureCounts := make(map[string]int)
|
|
lastOutcome := make(map[string]taskOutcome)
|
|
backoff := backoffConfig{
|
|
Initial: 5 * time.Second,
|
|
Multiplier: 2,
|
|
Jitter: 0.2,
|
|
Max: 5 * time.Minute,
|
|
}
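// Illustrative note (assumes backoffConfig.nextDelay applies the usual
// Initial*Multiplier^attempt growth with +/-Jitter randomisation and a cap at
// Max; the helper itself lives elsewhere in this package):
//
//	attempt 0: ~5s  (4s-6s with 20% jitter)
//	attempt 1: ~10s (8s-12s)
//	attempt 2: ~20s (16s-24s)
//	...
//	capped once the raw delay exceeds the 5m Max.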
|
|
|
|
if cfg.AdaptivePollingEnabled && cfg.AdaptivePollingMaxInterval > 0 && cfg.AdaptivePollingMaxInterval <= 15*time.Second {
|
|
backoff.Initial = 750 * time.Millisecond
|
|
backoff.Max = 6 * time.Second
|
|
}
|
|
|
|
var scheduler *AdaptiveScheduler
|
|
if cfg.AdaptivePollingEnabled {
|
|
scheduler = NewAdaptiveScheduler(SchedulerConfig{
|
|
BaseInterval: cfg.AdaptivePollingBaseInterval,
|
|
MinInterval: cfg.AdaptivePollingMinInterval,
|
|
MaxInterval: cfg.AdaptivePollingMaxInterval,
|
|
}, stalenessTracker, nil, nil)
|
|
}
|
|
|
|
minRefresh := cfg.GuestMetadataMinRefreshInterval
|
|
if minRefresh <= 0 {
|
|
minRefresh = config.DefaultGuestMetadataMinRefresh
|
|
}
|
|
jitter := cfg.GuestMetadataRefreshJitter
|
|
if jitter < 0 {
|
|
jitter = 0
|
|
}
|
|
retryBackoff := cfg.GuestMetadataRetryBackoff
|
|
if retryBackoff <= 0 {
|
|
retryBackoff = config.DefaultGuestMetadataRetryBackoff
|
|
}
|
|
concurrency := cfg.GuestMetadataMaxConcurrent
|
|
if concurrency <= 0 {
|
|
concurrency = config.DefaultGuestMetadataMaxConcurrent
|
|
}
|
|
holdDuration := defaultGuestMetadataHold
|
|
|
|
// Load guest agent timeout configuration from environment variables (refs #592)
|
|
guestAgentFSInfoTimeout := parseDurationEnv("GUEST_AGENT_FSINFO_TIMEOUT", defaultGuestAgentFSInfoTimeout)
|
|
guestAgentNetworkTimeout := parseDurationEnv("GUEST_AGENT_NETWORK_TIMEOUT", defaultGuestAgentNetworkTimeout)
|
|
guestAgentOSInfoTimeout := parseDurationEnv("GUEST_AGENT_OSINFO_TIMEOUT", defaultGuestAgentOSInfoTimeout)
|
|
guestAgentVersionTimeout := parseDurationEnv("GUEST_AGENT_VERSION_TIMEOUT", defaultGuestAgentVersionTimeout)
|
|
guestAgentRetries := parseIntEnv("GUEST_AGENT_RETRIES", defaultGuestAgentRetries)
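// Example override (illustrative; assumes parseDurationEnv accepts Go
// duration strings such as "15s" and parseIntEnv plain integers, both
// presumably falling back to the defaults passed here when unset or
// unparsable):
//
//	GUEST_AGENT_FSINFO_TIMEOUT=15s GUEST_AGENT_RETRIES=3
//
// set in the service environment before Pulse starts.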
|
|
|
|
m := &Monitor{
|
|
config: cfg,
|
|
state: models.NewState(),
|
|
pveClients: make(map[string]PVEClientInterface),
|
|
pbsClients: make(map[string]*pbs.Client),
|
|
pmgClients: make(map[string]*pmg.Client),
|
|
pollMetrics: getPollMetrics(),
|
|
scheduler: scheduler,
|
|
stalenessTracker: stalenessTracker,
|
|
taskQueue: taskQueue,
|
|
pollTimeout: derivePollTimeout(cfg),
|
|
deadLetterQueue: deadLetterQueue,
|
|
circuitBreakers: breakers,
|
|
failureCounts: failureCounts,
|
|
lastOutcome: lastOutcome,
|
|
backoffCfg: backoff,
|
|
rng: rand.New(rand.NewSource(time.Now().UnixNano())),
|
|
maxRetryAttempts: 5,
|
|
tempCollector: tempCollector,
|
|
guestMetadataStore: config.NewGuestMetadataStore(cfg.DataPath),
|
|
dockerMetadataStore: config.NewDockerMetadataStore(cfg.DataPath),
|
|
startTime: time.Now(),
|
|
rateTracker: NewRateTracker(),
|
|
metricsHistory: NewMetricsHistory(1000, 24*time.Hour), // Keep up to 1000 points or 24 hours
|
|
alertManager: alerts.NewManager(),
|
|
notificationMgr: notifications.NewNotificationManager(cfg.PublicURL),
|
|
configPersist: config.NewConfigPersistence(cfg.DataPath),
|
|
discoveryService: nil, // Will be initialized in Start()
|
|
authFailures: make(map[string]int),
|
|
lastAuthAttempt: make(map[string]time.Time),
|
|
lastClusterCheck: make(map[string]time.Time),
|
|
lastPhysicalDiskPoll: make(map[string]time.Time),
|
|
lastPVEBackupPoll: make(map[string]time.Time),
|
|
lastPBSBackupPoll: make(map[string]time.Time),
|
|
persistence: config.NewConfigPersistence(cfg.DataPath),
|
|
pbsBackupPollers: make(map[string]bool),
|
|
nodeSnapshots: make(map[string]NodeMemorySnapshot),
|
|
guestSnapshots: make(map[string]GuestMemorySnapshot),
|
|
nodeRRDMemCache: make(map[string]rrdMemCacheEntry),
|
|
removedDockerHosts: make(map[string]time.Time),
|
|
dockerTokenBindings: make(map[string]string),
|
|
hostTokenBindings: make(map[string]string),
|
|
dockerCommands: make(map[string]*dockerHostCommand),
|
|
dockerCommandIndex: make(map[string]string),
|
|
guestMetadataCache: make(map[string]guestMetadataCacheEntry),
|
|
guestMetadataLimiter: make(map[string]time.Time),
|
|
guestMetadataMinRefresh: minRefresh,
|
|
guestMetadataRefreshJitter: jitter,
|
|
guestMetadataRetryBackoff: retryBackoff,
|
|
guestMetadataHoldDuration: holdDuration,
|
|
guestAgentFSInfoTimeout: guestAgentFSInfoTimeout,
|
|
guestAgentNetworkTimeout: guestAgentNetworkTimeout,
|
|
guestAgentOSInfoTimeout: guestAgentOSInfoTimeout,
|
|
guestAgentVersionTimeout: guestAgentVersionTimeout,
|
|
guestAgentRetries: guestAgentRetries,
|
|
instanceInfoCache: make(map[string]*instanceInfo),
|
|
pollStatusMap: make(map[string]*pollStatus),
|
|
dlqInsightMap: make(map[string]*dlqInsight),
|
|
nodeLastOnline: make(map[string]time.Time),
|
|
}
|
|
|
|
m.breakerBaseRetry = 5 * time.Second
|
|
m.breakerMaxDelay = 5 * time.Minute
|
|
m.breakerHalfOpenWindow = 30 * time.Second
|
|
|
|
if cfg.AdaptivePollingEnabled && cfg.AdaptivePollingMaxInterval > 0 && cfg.AdaptivePollingMaxInterval <= 15*time.Second {
|
|
m.breakerBaseRetry = 2 * time.Second
|
|
m.breakerMaxDelay = 10 * time.Second
|
|
m.breakerHalfOpenWindow = 2 * time.Second
|
|
}
|
|
|
|
m.executor = newRealExecutor(m)
|
|
m.buildInstanceInfoCache(cfg)
|
|
|
|
// Initialize state with config values
|
|
m.state.TemperatureMonitoringEnabled = cfg.TemperatureMonitoringEnabled
|
|
|
|
if m.pollMetrics != nil {
|
|
m.pollMetrics.ResetQueueDepth(0)
|
|
}
|
|
|
|
// Load saved configurations
|
|
if alertConfig, err := m.configPersist.LoadAlertConfig(); err == nil {
|
|
m.alertManager.UpdateConfig(*alertConfig)
|
|
// Apply schedule settings to notification manager
|
|
m.notificationMgr.SetCooldown(alertConfig.Schedule.Cooldown)
|
|
groupWindow := alertConfig.Schedule.Grouping.Window
|
|
if groupWindow == 0 && alertConfig.Schedule.GroupingWindow != 0 {
|
|
groupWindow = alertConfig.Schedule.GroupingWindow
|
|
}
|
|
m.notificationMgr.SetGroupingWindow(groupWindow)
|
|
m.notificationMgr.SetGroupingOptions(
|
|
alertConfig.Schedule.Grouping.ByNode,
|
|
alertConfig.Schedule.Grouping.ByGuest,
|
|
)
|
|
m.notificationMgr.SetNotifyOnResolve(alertConfig.Schedule.NotifyOnResolve)
|
|
} else {
|
|
log.Warn().Err(err).Msg("Failed to load alert configuration")
|
|
}
|
|
|
|
if emailConfig, err := m.configPersist.LoadEmailConfig(); err == nil {
|
|
m.notificationMgr.SetEmailConfig(*emailConfig)
|
|
} else {
|
|
log.Warn().Err(err).Msg("Failed to load email configuration")
|
|
}
|
|
|
|
if concurrency > 0 {
|
|
m.guestMetadataSlots = make(chan struct{}, concurrency)
|
|
}
|
|
|
|
if appriseConfig, err := m.configPersist.LoadAppriseConfig(); err == nil {
|
|
m.notificationMgr.SetAppriseConfig(*appriseConfig)
|
|
} else {
|
|
log.Warn().Err(err).Msg("Failed to load Apprise configuration")
|
|
}
|
|
|
|
// Migrate webhooks if needed (from unencrypted to encrypted)
|
|
if err := m.configPersist.MigrateWebhooksIfNeeded(); err != nil {
|
|
log.Warn().Err(err).Msg("Failed to migrate webhooks")
|
|
}
|
|
|
|
if webhooks, err := m.configPersist.LoadWebhooks(); err == nil {
|
|
for _, webhook := range webhooks {
|
|
m.notificationMgr.AddWebhook(webhook)
|
|
}
|
|
} else {
|
|
log.Warn().Err(err).Msg("Failed to load webhook configuration")
|
|
}
|
|
|
|
// Check if mock mode is enabled before initializing clients
|
|
mockEnabled := mock.IsMockEnabled()
|
|
|
|
if mockEnabled {
|
|
log.Info().Msg("Mock mode enabled - skipping PVE/PBS client initialization")
|
|
} else {
|
|
// Initialize PVE clients
|
|
log.Info().Int("count", len(cfg.PVEInstances)).Msg("Initializing PVE clients")
|
|
for _, pve := range cfg.PVEInstances {
|
|
log.Info().
|
|
Str("name", pve.Name).
|
|
Str("host", pve.Host).
|
|
Str("user", pve.User).
|
|
Bool("hasToken", pve.TokenName != "").
|
|
Msg("Configuring PVE instance")
|
|
|
|
// Check if this is a cluster
|
|
if pve.IsCluster && len(pve.ClusterEndpoints) > 0 {
|
|
// For clusters, check if endpoints have IPs/resolvable hosts
|
|
// If not, use the main host for all connections (Proxmox will route cluster API calls)
|
|
hasValidEndpoints := false
|
|
endpoints := make([]string, 0, len(pve.ClusterEndpoints))
|
|
|
|
for _, ep := range pve.ClusterEndpoints {
|
|
hasFingerprint := pve.Fingerprint != ""
|
|
effectiveURL := clusterEndpointEffectiveURL(ep, pve.VerifySSL, hasFingerprint)
|
|
if effectiveURL == "" {
|
|
log.Warn().
|
|
Str("node", ep.NodeName).
|
|
Msg("Skipping cluster endpoint with no host/IP")
|
|
continue
|
|
}
|
|
|
|
if parsed, err := url.Parse(effectiveURL); err == nil {
|
|
hostname := parsed.Hostname()
|
|
if hostname != "" && (strings.Contains(hostname, ".") || net.ParseIP(hostname) != nil) {
|
|
hasValidEndpoints = true
|
|
}
|
|
} else {
|
|
hostname := normalizeEndpointHost(effectiveURL)
|
|
if hostname != "" && (strings.Contains(hostname, ".") || net.ParseIP(hostname) != nil) {
|
|
hasValidEndpoints = true
|
|
}
|
|
}
|
|
|
|
endpoints = append(endpoints, effectiveURL)
|
|
}
|
|
|
|
// If endpoints are just node names (not FQDNs or IPs), use main host only
|
|
// This is common when cluster nodes are discovered but not directly reachable
|
|
if !hasValidEndpoints || len(endpoints) == 0 {
|
|
log.Info().
|
|
Str("instance", pve.Name).
|
|
Str("mainHost", pve.Host).
|
|
Msg("Cluster endpoints are not resolvable, using main host for all cluster operations")
|
|
fallback := ensureClusterEndpointURL(pve.Host)
endpoints = []string{fallback}
|
|
}
|
|
|
|
log.Info().
|
|
Str("cluster", pve.ClusterName).
|
|
Strs("endpoints", endpoints).
|
|
Msg("Creating cluster-aware client")
|
|
|
|
clientConfig := config.CreateProxmoxConfig(&pve)
|
|
clientConfig.Timeout = cfg.ConnectionTimeout
|
|
clusterClient := proxmox.NewClusterClient(
|
|
pve.Name,
|
|
clientConfig,
|
|
endpoints,
|
|
)
|
|
m.pveClients[pve.Name] = clusterClient
|
|
log.Info().
|
|
Str("instance", pve.Name).
|
|
Str("cluster", pve.ClusterName).
|
|
Int("endpoints", len(endpoints)).
|
|
Msg("Cluster client created successfully")
|
|
// Set initial connection health to true for cluster
|
|
m.state.SetConnectionHealth(pve.Name, true)
|
|
} else {
|
|
// Create regular client
|
|
clientConfig := config.CreateProxmoxConfig(&pve)
|
|
clientConfig.Timeout = cfg.ConnectionTimeout
|
|
client, err := proxmox.NewClient(clientConfig)
|
|
if err != nil {
|
|
monErr := errors.WrapConnectionError("create_pve_client", pve.Name, err)
|
|
log.Error().
|
|
Err(monErr).
|
|
Str("instance", pve.Name).
|
|
Str("host", pve.Host).
|
|
Str("user", pve.User).
|
|
Bool("hasPassword", pve.Password != "").
|
|
Bool("hasToken", pve.TokenValue != "").
|
|
Msg("Failed to create PVE client - node will show as disconnected")
|
|
// Set initial connection health to false for this node
|
|
m.state.SetConnectionHealth(pve.Name, false)
|
|
continue
|
|
}
|
|
m.pveClients[pve.Name] = client
|
|
log.Info().Str("instance", pve.Name).Msg("PVE client created successfully")
|
|
// Set initial connection health to true
|
|
m.state.SetConnectionHealth(pve.Name, true)
|
|
}
|
|
}
|
|
|
|
// Initialize PBS clients
|
|
log.Info().Int("count", len(cfg.PBSInstances)).Msg("Initializing PBS clients")
|
|
for _, pbsInst := range cfg.PBSInstances {
|
|
log.Info().
|
|
Str("name", pbsInst.Name).
|
|
Str("host", pbsInst.Host).
|
|
Str("user", pbsInst.User).
|
|
Bool("hasToken", pbsInst.TokenName != "").
|
|
Msg("Configuring PBS instance")
|
|
|
|
clientConfig := config.CreatePBSConfig(&pbsInst)
|
|
clientConfig.Timeout = 60 * time.Second // Very generous timeout for slow PBS servers
|
|
client, err := pbs.NewClient(clientConfig)
|
|
if err != nil {
|
|
monErr := errors.WrapConnectionError("create_pbs_client", pbsInst.Name, err)
|
|
log.Error().
|
|
Err(monErr).
|
|
Str("instance", pbsInst.Name).
|
|
Str("host", pbsInst.Host).
|
|
Str("user", pbsInst.User).
|
|
Bool("hasPassword", pbsInst.Password != "").
|
|
Bool("hasToken", pbsInst.TokenValue != "").
|
|
Msg("Failed to create PBS client - node will show as disconnected")
|
|
// Set initial connection health to false for this node
|
|
m.state.SetConnectionHealth("pbs-"+pbsInst.Name, false)
|
|
continue
|
|
}
|
|
m.pbsClients[pbsInst.Name] = client
|
|
log.Info().Str("instance", pbsInst.Name).Msg("PBS client created successfully")
|
|
// Set initial connection health to true
|
|
m.state.SetConnectionHealth("pbs-"+pbsInst.Name, true)
|
|
}
|
|
|
|
// Initialize PMG clients
|
|
log.Info().Int("count", len(cfg.PMGInstances)).Msg("Initializing PMG clients")
|
|
for _, pmgInst := range cfg.PMGInstances {
|
|
log.Info().
|
|
Str("name", pmgInst.Name).
|
|
Str("host", pmgInst.Host).
|
|
Str("user", pmgInst.User).
|
|
Bool("hasToken", pmgInst.TokenName != "").
|
|
Msg("Configuring PMG instance")
|
|
|
|
clientConfig := config.CreatePMGConfig(&pmgInst)
|
|
if clientConfig.Timeout <= 0 {
|
|
clientConfig.Timeout = 45 * time.Second
|
|
}
|
|
|
|
client, err := pmg.NewClient(clientConfig)
|
|
if err != nil {
|
|
monErr := errors.WrapConnectionError("create_pmg_client", pmgInst.Name, err)
|
|
log.Error().
|
|
Err(monErr).
|
|
Str("instance", pmgInst.Name).
|
|
Str("host", pmgInst.Host).
|
|
Str("user", pmgInst.User).
|
|
Bool("hasPassword", pmgInst.Password != "").
|
|
Bool("hasToken", pmgInst.TokenValue != "").
|
|
Msg("Failed to create PMG client - gateway will show as disconnected")
|
|
m.state.SetConnectionHealth("pmg-"+pmgInst.Name, false)
|
|
continue
|
|
}
|
|
|
|
m.pmgClients[pmgInst.Name] = client
|
|
log.Info().Str("instance", pmgInst.Name).Msg("PMG client created successfully")
|
|
m.state.SetConnectionHealth("pmg-"+pmgInst.Name, true)
|
|
}
|
|
} // End of else block for mock mode check
|
|
|
|
// Initialize state stats
|
|
m.state.Stats = models.Stats{
|
|
StartTime: m.startTime,
|
|
Version: "2.0.0-go",
|
|
}
|
|
|
|
return m, nil
|
|
}
|
|
|
|
// SetExecutor allows tests to override the poll executor; passing nil restores the default executor.
|
|
func (m *Monitor) SetExecutor(exec PollExecutor) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if exec == nil {
|
|
m.executor = newRealExecutor(m)
|
|
return
|
|
}
|
|
|
|
m.executor = exec
|
|
}
|
|
|
|
func (m *Monitor) buildInstanceInfoCache(cfg *config.Config) {
|
|
if m == nil || cfg == nil {
|
|
return
|
|
}
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.instanceInfoCache == nil {
|
|
m.instanceInfoCache = make(map[string]*instanceInfo)
|
|
}
|
|
|
|
add := func(instType InstanceType, name string, displayName string, connection string, metadata map[string]string) {
|
|
key := schedulerKey(instType, name)
|
|
m.instanceInfoCache[key] = &instanceInfo{
|
|
Key: key,
|
|
Type: instType,
|
|
DisplayName: displayName,
|
|
Connection: connection,
|
|
Metadata: metadata,
|
|
}
|
|
}
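// The cache is keyed the same way the scheduler and breaker maps are, via
// schedulerKey(type, name). Judging by the strings.SplitN(key, "::", 2)
// parsing in SchedulerHealth below, a key looks roughly like "pve::pve-main"
// (assuming InstanceTypePVE stringifies to "pve"), so callers can recover
// both halves from the key alone.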
|
|
|
|
// PVE instances
|
|
for _, inst := range cfg.PVEInstances {
|
|
name := strings.TrimSpace(inst.Name)
|
|
if name == "" {
|
|
name = strings.TrimSpace(inst.Host)
|
|
}
|
|
if name == "" {
|
|
name = "pve-instance"
|
|
}
|
|
display := name
|
|
if display == "" {
|
|
display = strings.TrimSpace(inst.Host)
|
|
}
|
|
connection := strings.TrimSpace(inst.Host)
|
|
add(InstanceTypePVE, name, display, connection, nil)
|
|
}
|
|
|
|
// PBS instances
|
|
for _, inst := range cfg.PBSInstances {
|
|
name := strings.TrimSpace(inst.Name)
|
|
if name == "" {
|
|
name = strings.TrimSpace(inst.Host)
|
|
}
|
|
if name == "" {
|
|
name = "pbs-instance"
|
|
}
|
|
display := name
|
|
if display == "" {
|
|
display = strings.TrimSpace(inst.Host)
|
|
}
|
|
connection := strings.TrimSpace(inst.Host)
|
|
add(InstanceTypePBS, name, display, connection, nil)
|
|
}
|
|
|
|
// PMG instances
|
|
for _, inst := range cfg.PMGInstances {
|
|
name := strings.TrimSpace(inst.Name)
|
|
if name == "" {
|
|
name = strings.TrimSpace(inst.Host)
|
|
}
|
|
if name == "" {
|
|
name = "pmg-instance"
|
|
}
|
|
display := name
|
|
if display == "" {
|
|
display = strings.TrimSpace(inst.Host)
|
|
}
|
|
connection := strings.TrimSpace(inst.Host)
|
|
add(InstanceTypePMG, name, display, connection, nil)
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) getExecutor() PollExecutor {
|
|
m.mu.RLock()
|
|
exec := m.executor
|
|
m.mu.RUnlock()
|
|
return exec
|
|
}
|
|
|
|
func clampInterval(value, min, max time.Duration) time.Duration {
|
|
if value <= 0 {
|
|
return min
|
|
}
|
|
if min > 0 && value < min {
|
|
return min
|
|
}
|
|
if max > 0 && value > max {
|
|
return max
|
|
}
|
|
return value
|
|
}
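// Worked examples of the clamping above:
//
//	clampInterval(0, 10*time.Second, time.Hour)              // -> 10s (non-positive values take the minimum)
//	clampInterval(5*time.Second, 10*time.Second, time.Hour)  // -> 10s
//	clampInterval(30*time.Second, 10*time.Second, time.Hour) // -> 30s (already in range)
//	clampInterval(2*time.Hour, 10*time.Second, time.Hour)    // -> 1h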
|
|
|
|
func (m *Monitor) effectivePVEPollingInterval() time.Duration {
|
|
const minInterval = 10 * time.Second
|
|
const maxInterval = time.Hour
|
|
|
|
interval := minInterval
|
|
if m != nil && m.config != nil && m.config.PVEPollingInterval > 0 {
|
|
interval = m.config.PVEPollingInterval
|
|
}
|
|
if interval < minInterval {
|
|
interval = minInterval
|
|
}
|
|
if interval > maxInterval {
|
|
interval = maxInterval
|
|
}
|
|
return interval
|
|
}
|
|
|
|
func (m *Monitor) baseIntervalForInstanceType(instanceType InstanceType) time.Duration {
|
|
if m == nil || m.config == nil {
|
|
return DefaultSchedulerConfig().BaseInterval
|
|
}
|
|
|
|
switch instanceType {
|
|
case InstanceTypePVE:
|
|
return m.effectivePVEPollingInterval()
|
|
case InstanceTypePBS:
|
|
return clampInterval(m.config.PBSPollingInterval, 10*time.Second, time.Hour)
|
|
case InstanceTypePMG:
|
|
return clampInterval(m.config.PMGPollingInterval, 10*time.Second, time.Hour)
|
|
default:
|
|
base := m.config.AdaptivePollingBaseInterval
|
|
if base <= 0 {
|
|
base = DefaultSchedulerConfig().BaseInterval
|
|
}
|
|
return clampInterval(base, time.Second, 0)
|
|
}
|
|
}
|
|
|
|
// Start begins the monitoring loop
|
|
func (m *Monitor) Start(ctx context.Context, wsHub *websocket.Hub) {
|
|
pollingInterval := m.effectivePVEPollingInterval()
|
|
log.Info().
|
|
Dur("pollingInterval", pollingInterval).
|
|
Msg("Starting monitoring loop")
|
|
|
|
m.mu.Lock()
|
|
m.runtimeCtx = ctx
|
|
m.wsHub = wsHub
|
|
m.mu.Unlock()
|
|
|
|
// Initialize and start discovery service if enabled
|
|
if mock.IsMockEnabled() {
|
|
log.Info().Msg("Mock mode enabled - skipping discovery service")
|
|
m.discoveryService = nil
|
|
} else if m.config.DiscoveryEnabled {
|
|
discoverySubnet := m.config.DiscoverySubnet
|
|
if discoverySubnet == "" {
|
|
discoverySubnet = "auto"
|
|
}
|
|
cfgProvider := func() config.DiscoveryConfig {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
if m.config == nil {
|
|
return config.DefaultDiscoveryConfig()
|
|
}
|
|
return config.CloneDiscoveryConfig(m.config.Discovery)
|
|
}
|
|
m.discoveryService = discovery.NewService(wsHub, 5*time.Minute, discoverySubnet, cfgProvider)
|
|
if m.discoveryService != nil {
|
|
m.discoveryService.Start(ctx)
|
|
log.Info().Msg("Discovery service initialized and started")
|
|
} else {
|
|
log.Error().Msg("Failed to initialize discovery service")
|
|
}
|
|
} else {
|
|
log.Info().Msg("Discovery service disabled by configuration")
|
|
m.discoveryService = nil
|
|
}
|
|
|
|
// Set up alert callbacks
|
|
m.alertManager.SetAlertCallback(func(alert *alerts.Alert) {
|
|
wsHub.BroadcastAlert(alert)
|
|
// Send notifications
|
|
log.Debug().
|
|
Str("alertID", alert.ID).
|
|
Str("level", string(alert.Level)).
|
|
Msg("Alert raised, sending to notification manager")
|
|
go m.notificationMgr.SendAlert(alert)
|
|
})
|
|
m.alertManager.SetResolvedCallback(func(alertID string) {
|
|
wsHub.BroadcastAlertResolved(alertID)
|
|
m.notificationMgr.CancelAlert(alertID)
|
|
if m.notificationMgr.GetNotifyOnResolve() {
|
|
if resolved := m.alertManager.GetResolvedAlert(alertID); resolved != nil {
|
|
go m.notificationMgr.SendResolvedAlert(resolved)
|
|
}
|
|
}
|
|
// Don't broadcast full state here - it causes a cascade with many guests.
|
|
// The frontend will get the updated alerts through the regular broadcast ticker.
|
|
})
|
|
m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {
|
|
log.Info().
|
|
Str("alertID", alert.ID).
|
|
Int("level", level).
|
|
Msg("Alert escalated - sending notifications")
|
|
|
|
// Get escalation config
|
|
config := m.alertManager.GetConfig()
|
|
if level <= 0 || level > len(config.Schedule.Escalation.Levels) {
|
|
return
|
|
}
|
|
|
|
escalationLevel := config.Schedule.Escalation.Levels[level-1]
|
|
|
|
// Send notifications based on escalation level
|
|
switch escalationLevel.Notify {
|
|
case "email":
|
|
// Only send email
|
|
if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled {
|
|
m.notificationMgr.SendAlert(alert)
|
|
}
|
|
case "webhook":
|
|
// Only send webhooks
|
|
for _, webhook := range m.notificationMgr.GetWebhooks() {
|
|
if webhook.Enabled {
|
|
m.notificationMgr.SendAlert(alert)
|
|
break
|
|
}
|
|
}
|
|
case "all":
|
|
// Send all notifications
|
|
m.notificationMgr.SendAlert(alert)
|
|
}
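// Illustrative escalation setup (values are made up): with
// Levels[0].Notify = "email" and Levels[1].Notify = "all", the first
// escalation re-sends the alert via email only and the second fans out to
// every configured channel.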
|
|
|
|
// Update WebSocket with escalation
|
|
wsHub.BroadcastAlert(alert)
|
|
})
|
|
|
|
// Create separate tickers for polling and broadcasting using the configured cadence
|
|
|
|
workerCount := len(m.pveClients) + len(m.pbsClients) + len(m.pmgClients)
|
|
m.startTaskWorkers(ctx, workerCount)
|
|
|
|
pollTicker := time.NewTicker(pollingInterval)
|
|
defer pollTicker.Stop()
|
|
|
|
broadcastTicker := time.NewTicker(pollingInterval)
|
|
defer broadcastTicker.Stop()
|
|
|
|
// Start connection retry mechanism for failed clients
|
|
// This handles cases where network/Proxmox isn't ready on initial startup
|
|
if !mock.IsMockEnabled() {
|
|
go m.retryFailedConnections(ctx)
|
|
}
|
|
|
|
// Do an immediate poll on start (only if not in mock mode)
|
|
if mock.IsMockEnabled() {
|
|
log.Info().Msg("Mock mode enabled - skipping real node polling")
|
|
go m.checkMockAlerts()
|
|
} else {
|
|
go m.poll(ctx, wsHub)
|
|
}
|
|
|
|
for {
|
|
select {
|
|
case <-pollTicker.C:
|
|
now := time.Now()
|
|
m.evaluateDockerAgents(now)
|
|
m.evaluateHostAgents(now)
|
|
m.cleanupRemovedDockerHosts(now)
|
|
m.cleanupGuestMetadataCache(now)
|
|
m.cleanupDiagnosticSnapshots(now)
|
|
m.cleanupRRDCache(now)
|
|
if mock.IsMockEnabled() {
|
|
// In mock mode, keep synthetic alerts fresh
|
|
go m.checkMockAlerts()
|
|
} else {
|
|
// Poll real infrastructure
|
|
go m.poll(ctx, wsHub)
|
|
}
|
|
|
|
case <-broadcastTicker.C:
|
|
// Broadcast current state regardless of polling status
|
|
// Use GetState() instead of m.state.GetSnapshot() to respect mock mode
|
|
state := m.GetState()
|
|
log.Info().
|
|
Int("nodes", len(state.Nodes)).
|
|
Int("vms", len(state.VMs)).
|
|
Int("containers", len(state.Containers)).
|
|
Int("pbs", len(state.PBSInstances)).
|
|
Int("pbsBackups", len(state.Backups.PBS)).
|
|
Int("physicalDisks", len(state.PhysicalDisks)).
|
|
Msg("Broadcasting state update (ticker)")
|
|
// Convert to frontend format before broadcasting (converts time.Time to int64, etc.)
|
|
wsHub.BroadcastState(state.ToFrontend())
|
|
|
|
case <-ctx.Done():
|
|
log.Info().Msg("Monitoring loop stopped")
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// retryFailedConnections attempts to recreate clients that failed during initialization
|
|
// This handles cases where Proxmox/network isn't ready when Pulse starts
|
|
func (m *Monitor) retryFailedConnections(ctx context.Context) {
|
|
defer recoverFromPanic("retryFailedConnections")
|
|
|
|
// Retry schedule: 5s, 10s, 20s, 40s, 60s, then every 60s for up to 5 minutes total
|
|
retryDelays := []time.Duration{
|
|
5 * time.Second,
|
|
10 * time.Second,
|
|
20 * time.Second,
|
|
40 * time.Second,
|
|
60 * time.Second,
|
|
}
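// With this schedule the loop below makes roughly eight reconnection passes
// inside the five-minute window: at about 5s, 15s, 35s, 75s and 135s after
// startup, then every ~60s until maxRetryDuration expires (ignoring the time
// the reconnection attempts themselves take).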
|
|
|
|
maxRetryDuration := 5 * time.Minute
|
|
startTime := time.Now()
|
|
retryIndex := 0
|
|
|
|
for {
|
|
// Stop retrying after max duration or if context is cancelled
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
default:
|
|
}
|
|
|
|
if time.Since(startTime) > maxRetryDuration {
|
|
log.Info().Msg("Connection retry period expired")
|
|
return
|
|
}
|
|
|
|
// Calculate next retry delay
|
|
var delay time.Duration
|
|
if retryIndex < len(retryDelays) {
|
|
delay = retryDelays[retryIndex]
|
|
retryIndex++
|
|
} else {
|
|
delay = 60 * time.Second // Continue retrying every 60s
|
|
}
|
|
|
|
// Wait before retry
|
|
select {
|
|
case <-time.After(delay):
|
|
case <-ctx.Done():
|
|
return
|
|
}
|
|
|
|
// Check for missing clients and try to recreate them
|
|
m.mu.Lock()
|
|
missingPVE := []config.PVEInstance{}
|
|
missingPBS := []config.PBSInstance{}
|
|
|
|
// Find PVE instances without clients
|
|
for _, pve := range m.config.PVEInstances {
|
|
if _, exists := m.pveClients[pve.Name]; !exists {
|
|
missingPVE = append(missingPVE, pve)
|
|
}
|
|
}
|
|
|
|
// Find PBS instances without clients
|
|
for _, pbs := range m.config.PBSInstances {
|
|
if _, exists := m.pbsClients[pbs.Name]; !exists {
|
|
missingPBS = append(missingPBS, pbs)
|
|
}
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
// If no missing clients, we're done
|
|
if len(missingPVE) == 0 && len(missingPBS) == 0 {
|
|
log.Info().Msg("All client connections established successfully")
|
|
return
|
|
}
|
|
|
|
log.Info().
|
|
Int("missingPVE", len(missingPVE)).
|
|
Int("missingPBS", len(missingPBS)).
|
|
Dur("nextRetry", delay).
|
|
Msg("Attempting to reconnect failed clients")
|
|
|
|
// Try to recreate PVE clients
|
|
for _, pve := range missingPVE {
|
|
if pve.IsCluster && len(pve.ClusterEndpoints) > 0 {
|
|
// Create cluster client
|
|
hasValidEndpoints := false
|
|
endpoints := make([]string, 0, len(pve.ClusterEndpoints))
|
|
|
|
for _, ep := range pve.ClusterEndpoints {
|
|
host := ep.IP
|
|
if host == "" {
|
|
host = ep.Host
|
|
}
|
|
if host == "" {
|
|
continue
|
|
}
|
|
if strings.Contains(host, ".") || net.ParseIP(host) != nil {
|
|
hasValidEndpoints = true
|
|
}
|
|
if !strings.HasPrefix(host, "http") {
|
|
host = fmt.Sprintf("https://%s:8006", host)
|
|
}
|
|
endpoints = append(endpoints, host)
|
|
}
|
|
|
|
if !hasValidEndpoints || len(endpoints) == 0 {
|
|
endpoints = []string{pve.Host}
|
|
if !strings.HasPrefix(endpoints[0], "http") {
|
|
endpoints[0] = fmt.Sprintf("https://%s:8006", endpoints[0])
|
|
}
|
|
}
|
|
|
|
clientConfig := config.CreateProxmoxConfig(&pve)
|
|
clientConfig.Timeout = m.config.ConnectionTimeout
|
|
clusterClient := proxmox.NewClusterClient(pve.Name, clientConfig, endpoints)
|
|
|
|
m.mu.Lock()
|
|
m.pveClients[pve.Name] = clusterClient
|
|
m.state.SetConnectionHealth(pve.Name, true)
|
|
m.mu.Unlock()
|
|
|
|
log.Info().
|
|
Str("instance", pve.Name).
|
|
Str("cluster", pve.ClusterName).
|
|
Msg("Successfully reconnected cluster client")
|
|
} else {
|
|
// Create regular client
|
|
clientConfig := config.CreateProxmoxConfig(&pve)
|
|
clientConfig.Timeout = m.config.ConnectionTimeout
|
|
client, err := proxmox.NewClient(clientConfig)
|
|
if err != nil {
|
|
log.Warn().
|
|
Err(err).
|
|
Str("instance", pve.Name).
|
|
Msg("Failed to reconnect PVE client, will retry")
|
|
continue
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.pveClients[pve.Name] = client
|
|
m.state.SetConnectionHealth(pve.Name, true)
|
|
m.mu.Unlock()
|
|
|
|
log.Info().
|
|
Str("instance", pve.Name).
|
|
Msg("Successfully reconnected PVE client")
|
|
}
|
|
}
|
|
|
|
// Try to recreate PBS clients
|
|
for _, pbsInst := range missingPBS {
|
|
clientConfig := config.CreatePBSConfig(&pbsInst)
|
|
clientConfig.Timeout = 60 * time.Second
|
|
client, err := pbs.NewClient(clientConfig)
|
|
if err != nil {
|
|
log.Warn().
|
|
Err(err).
|
|
Str("instance", pbsInst.Name).
|
|
Msg("Failed to reconnect PBS client, will retry")
|
|
continue
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.pbsClients[pbsInst.Name] = client
|
|
m.state.SetConnectionHealth("pbs-"+pbsInst.Name, true)
|
|
m.mu.Unlock()
|
|
|
|
log.Info().
|
|
Str("instance", pbsInst.Name).
|
|
Msg("Successfully reconnected PBS client")
|
|
}
|
|
}
|
|
}
|
|
|
|
// poll fetches data from all configured instances
|
|
func (m *Monitor) poll(ctx context.Context, wsHub *websocket.Hub) {
|
|
defer recoverFromPanic("poll")
|
|
|
|
// Limit concurrent polls to 2 to prevent resource exhaustion
|
|
currentCount := atomic.AddInt32(&m.activePollCount, 1)
|
|
if currentCount > 2 {
|
|
atomic.AddInt32(&m.activePollCount, -1)
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Int32("activePolls", currentCount-1).Msg("Too many concurrent polls, skipping")
|
|
}
|
|
return
|
|
}
|
|
defer atomic.AddInt32(&m.activePollCount, -1)
|
|
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Msg("Starting polling cycle")
|
|
}
|
|
startTime := time.Now()
|
|
now := startTime
|
|
|
|
plannedTasks := m.buildScheduledTasks(now)
|
|
for _, task := range plannedTasks {
|
|
m.taskQueue.Upsert(task)
|
|
}
|
|
m.updateQueueDepthMetric()
|
|
|
|
// Update performance metrics
|
|
m.state.Performance.LastPollDuration = time.Since(startTime).Seconds()
|
|
m.state.Stats.PollingCycles++
|
|
m.state.Stats.Uptime = int64(time.Since(m.startTime).Seconds())
|
|
m.state.Stats.WebSocketClients = wsHub.GetClientCount()
|
|
|
|
// Sync alert state so broadcasts include the latest acknowledgement data
|
|
m.syncAlertsToState()
|
|
|
|
// Increment poll counter
|
|
m.mu.Lock()
|
|
m.pollCounter++
|
|
m.mu.Unlock()
|
|
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Dur("duration", time.Since(startTime)).Msg("Polling cycle completed")
|
|
}
|
|
|
|
// Broadcasting is now handled by the timer in Start()
|
|
}
|
|
|
|
// syncAlertsToState copies the latest alert manager data into the shared state snapshot.
|
|
// This keeps WebSocket broadcasts aligned with in-memory acknowledgement updates.
|
|
func (m *Monitor) syncAlertsToState() {
|
|
if m.pruneStaleDockerAlerts() {
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Msg("Pruned stale docker alerts during sync")
|
|
}
|
|
}
|
|
|
|
activeAlerts := m.alertManager.GetActiveAlerts()
|
|
modelAlerts := make([]models.Alert, 0, len(activeAlerts))
|
|
for _, alert := range activeAlerts {
|
|
modelAlerts = append(modelAlerts, models.Alert{
|
|
ID: alert.ID,
|
|
Type: alert.Type,
|
|
Level: string(alert.Level),
|
|
ResourceID: alert.ResourceID,
|
|
ResourceName: alert.ResourceName,
|
|
Node: alert.Node,
|
|
Instance: alert.Instance,
|
|
Message: alert.Message,
|
|
Value: alert.Value,
|
|
Threshold: alert.Threshold,
|
|
StartTime: alert.StartTime,
|
|
Acknowledged: alert.Acknowledged,
|
|
AckTime: alert.AckTime,
|
|
AckUser: alert.AckUser,
|
|
})
|
|
if alert.Acknowledged && logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Str("alertID", alert.ID).Interface("ackTime", alert.AckTime).Msg("Syncing acknowledged alert")
|
|
}
|
|
}
|
|
m.state.UpdateActiveAlerts(modelAlerts)
|
|
|
|
recentlyResolved := m.alertManager.GetRecentlyResolved()
|
|
if len(recentlyResolved) > 0 {
|
|
log.Info().Int("count", len(recentlyResolved)).Msg("Syncing recently resolved alerts")
|
|
}
|
|
m.state.UpdateRecentlyResolved(recentlyResolved)
|
|
}
|
|
|
|
// SyncAlertState is the exported wrapper used by APIs that mutate alerts outside the poll loop.
|
|
func (m *Monitor) SyncAlertState() {
|
|
m.syncAlertsToState()
|
|
}
|
|
|
|
// pruneStaleDockerAlerts removes docker alerts that reference hosts no longer present in state.
|
|
func (m *Monitor) pruneStaleDockerAlerts() bool {
|
|
if m.alertManager == nil {
|
|
return false
|
|
}
|
|
|
|
hosts := m.state.GetDockerHosts()
|
|
knownHosts := make(map[string]struct{}, len(hosts))
|
|
for _, host := range hosts {
|
|
id := strings.TrimSpace(host.ID)
|
|
if id != "" {
|
|
knownHosts[id] = struct{}{}
|
|
}
|
|
}
|
|
|
|
// Even when no docker hosts remain in state, keep going so stale alert
// entries can still be cleared below.
|
|
|
|
active := m.alertManager.GetActiveAlerts()
|
|
processed := make(map[string]struct{})
|
|
cleared := false
|
|
|
|
for _, alert := range active {
|
|
var hostID string
|
|
|
|
switch {
|
|
case alert.Type == "docker-host-offline":
|
|
hostID = strings.TrimPrefix(alert.ID, "docker-host-offline-")
|
|
case strings.HasPrefix(alert.ResourceID, "docker:"):
|
|
resource := strings.TrimPrefix(alert.ResourceID, "docker:")
|
|
if idx := strings.Index(resource, "/"); idx >= 0 {
|
|
hostID = resource[:idx]
|
|
} else {
|
|
hostID = resource
|
|
}
|
|
default:
|
|
continue
|
|
}
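// Examples of the identifiers handled above (host and container names are
// illustrative):
//
//	alert.ID         "docker-host-offline-abc123" -> hostID "abc123"
//	alert.ResourceID "docker:abc123/nginx"        -> hostID "abc123"
//	alert.ResourceID "docker:abc123"              -> hostID "abc123"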
|
|
|
|
hostID = strings.TrimSpace(hostID)
|
|
if hostID == "" {
|
|
continue
|
|
}
|
|
|
|
if _, known := knownHosts[hostID]; known {
|
|
continue
|
|
}
|
|
if _, alreadyCleared := processed[hostID]; alreadyCleared {
|
|
continue
|
|
}
|
|
|
|
host := models.DockerHost{
|
|
ID: hostID,
|
|
DisplayName: alert.ResourceName,
|
|
Hostname: alert.Node,
|
|
}
|
|
if host.DisplayName == "" {
|
|
host.DisplayName = hostID
|
|
}
|
|
if host.Hostname == "" {
|
|
host.Hostname = hostID
|
|
}
|
|
|
|
m.alertManager.HandleDockerHostRemoved(host)
|
|
processed[hostID] = struct{}{}
|
|
cleared = true
|
|
}
|
|
|
|
return cleared
|
|
}
|
|
|
|
func (m *Monitor) startTaskWorkers(ctx context.Context, workers int) {
|
|
if m.taskQueue == nil {
|
|
return
|
|
}
|
|
if workers < 1 {
|
|
workers = 1
|
|
}
|
|
if workers > 10 {
|
|
workers = 10
|
|
}
|
|
for i := 0; i < workers; i++ {
|
|
go m.taskWorker(ctx, i)
|
|
}
|
|
}
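// Start derives the worker count from the number of configured PVE, PBS and
// PMG clients, so e.g. 2 PVE + 1 PBS instances start 3 workers, no configured
// instances still start 1, and anything above 10 is capped at 10.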
|
|
|
|
func (m *Monitor) taskWorker(ctx context.Context, id int) {
|
|
defer recoverFromPanic(fmt.Sprintf("taskWorker-%d", id))
|
|
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Int("worker", id).Msg("Task worker started")
|
|
}
|
|
for {
|
|
task, ok := m.taskQueue.WaitNext(ctx)
|
|
if !ok {
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Int("worker", id).Msg("Task worker stopping")
|
|
}
|
|
return
|
|
}
|
|
|
|
m.executeScheduledTask(ctx, task)
|
|
|
|
m.rescheduleTask(task)
|
|
m.updateQueueDepthMetric()
|
|
}
|
|
}
|
|
|
|
func derivePollTimeout(cfg *config.Config) time.Duration {
|
|
timeout := defaultTaskTimeout
|
|
if cfg != nil && cfg.ConnectionTimeout > 0 {
|
|
timeout = cfg.ConnectionTimeout * 2
|
|
}
|
|
if timeout < minTaskTimeout {
|
|
timeout = minTaskTimeout
|
|
}
|
|
if timeout > maxTaskTimeout {
|
|
timeout = maxTaskTimeout
|
|
}
|
|
return timeout
|
|
}
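// Worked examples: a nil config (or ConnectionTimeout <= 0) yields the 90s
// default; ConnectionTimeout=10s doubles to 20s and is raised to the 30s
// floor; 60s doubles to 120s and is used as-is; 120s doubles to 240s and is
// capped at the 3-minute ceiling.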
|
|
|
|
func (m *Monitor) taskExecutionTimeout(instanceType InstanceType) time.Duration {
|
|
if m == nil {
|
|
return defaultTaskTimeout
|
|
}
|
|
timeout := m.pollTimeout
|
|
if timeout <= 0 {
|
|
timeout = defaultTaskTimeout
|
|
}
|
|
return timeout
|
|
}
|
|
|
|
func (m *Monitor) executeScheduledTask(ctx context.Context, task ScheduledTask) {
|
|
if !m.allowExecution(task) {
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Msg("Task blocked by circuit breaker")
|
|
}
|
|
return
|
|
}
|
|
|
|
if m.pollMetrics != nil {
|
|
wait := time.Duration(0)
|
|
if !task.NextRun.IsZero() {
|
|
wait = time.Since(task.NextRun)
|
|
if wait < 0 {
|
|
wait = 0
|
|
}
|
|
}
|
|
instanceType := string(task.InstanceType)
|
|
if strings.TrimSpace(instanceType) == "" {
|
|
instanceType = "unknown"
|
|
}
|
|
m.pollMetrics.RecordQueueWait(instanceType, wait)
|
|
}
|
|
|
|
executor := m.getExecutor()
|
|
if executor == nil {
|
|
log.Error().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Msg("No poll executor configured; skipping task")
|
|
return
|
|
}
|
|
|
|
pollTask := PollTask{
|
|
InstanceName: task.InstanceName,
|
|
InstanceType: string(task.InstanceType),
|
|
}
|
|
|
|
switch task.InstanceType {
|
|
case InstanceTypePVE:
|
|
client, ok := m.pveClients[task.InstanceName]
|
|
if !ok || client == nil {
|
|
log.Warn().Str("instance", task.InstanceName).Msg("PVE client missing for scheduled task")
|
|
return
|
|
}
|
|
pollTask.PVEClient = client
|
|
case InstanceTypePBS:
|
|
client, ok := m.pbsClients[task.InstanceName]
|
|
if !ok || client == nil {
|
|
log.Warn().Str("instance", task.InstanceName).Msg("PBS client missing for scheduled task")
|
|
return
|
|
}
|
|
pollTask.PBSClient = client
|
|
case InstanceTypePMG:
|
|
client, ok := m.pmgClients[task.InstanceName]
|
|
if !ok || client == nil {
|
|
log.Warn().Str("instance", task.InstanceName).Msg("PMG client missing for scheduled task")
|
|
return
|
|
}
|
|
pollTask.PMGClient = client
|
|
default:
|
|
log.Debug().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Msg("Skipping unsupported task type")
|
|
return
|
|
}
|
|
|
|
taskCtx := ctx
|
|
var cancel context.CancelFunc
|
|
timeout := m.taskExecutionTimeout(task.InstanceType)
|
|
if timeout > 0 {
|
|
taskCtx, cancel = context.WithTimeout(ctx, timeout)
|
|
defer cancel()
|
|
}
|
|
|
|
executor.Execute(taskCtx, pollTask)
|
|
|
|
if timeout > 0 && stderrors.Is(taskCtx.Err(), context.DeadlineExceeded) {
|
|
log.Warn().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Dur("timeout", timeout).
|
|
Msg("Polling task timed out; rescheduling with fresh worker")
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) rescheduleTask(task ScheduledTask) {
|
|
if m.taskQueue == nil {
|
|
return
|
|
}
|
|
|
|
key := schedulerKey(task.InstanceType, task.InstanceName)
|
|
m.mu.Lock()
|
|
outcome, hasOutcome := m.lastOutcome[key]
|
|
failureCount := m.failureCounts[key]
|
|
m.mu.Unlock()
|
|
|
|
if hasOutcome && !outcome.success {
|
|
if !outcome.transient || failureCount >= m.maxRetryAttempts {
|
|
m.sendToDeadLetter(task, outcome.err)
|
|
return
|
|
}
|
|
delay := m.backoffCfg.nextDelay(failureCount-1, m.randomFloat())
|
|
if delay <= 0 {
|
|
delay = 5 * time.Second
|
|
}
|
|
if m.config != nil && m.config.AdaptivePollingEnabled && m.config.AdaptivePollingMaxInterval > 0 && m.config.AdaptivePollingMaxInterval <= 15*time.Second {
|
|
maxDelay := 4 * time.Second
|
|
if delay > maxDelay {
|
|
delay = maxDelay
|
|
}
|
|
}
|
|
next := task
|
|
next.Interval = delay
|
|
next.NextRun = time.Now().Add(delay)
|
|
m.taskQueue.Upsert(next)
|
|
return
|
|
}
|
|
|
|
if m.scheduler == nil {
|
|
baseInterval := m.baseIntervalForInstanceType(task.InstanceType)
|
|
nextInterval := task.Interval
|
|
if nextInterval <= 0 {
|
|
nextInterval = baseInterval
|
|
}
|
|
if nextInterval <= 0 {
|
|
nextInterval = DefaultSchedulerConfig().BaseInterval
|
|
}
|
|
next := task
|
|
next.NextRun = time.Now().Add(nextInterval)
|
|
next.Interval = nextInterval
|
|
m.taskQueue.Upsert(next)
|
|
return
|
|
}
|
|
|
|
desc := InstanceDescriptor{
|
|
Name: task.InstanceName,
|
|
Type: task.InstanceType,
|
|
LastInterval: task.Interval,
|
|
LastScheduled: task.NextRun,
|
|
}
|
|
if m.stalenessTracker != nil {
|
|
if snap, ok := m.stalenessTracker.snapshot(task.InstanceType, task.InstanceName); ok {
|
|
desc.LastSuccess = snap.LastSuccess
|
|
desc.LastFailure = snap.LastError
|
|
if snap.ChangeHash != "" {
|
|
desc.Metadata = map[string]any{"changeHash": snap.ChangeHash}
|
|
}
|
|
}
|
|
}
|
|
|
|
tasks := m.scheduler.BuildPlan(time.Now(), []InstanceDescriptor{desc}, m.taskQueue.Size())
|
|
if len(tasks) == 0 {
|
|
next := task
|
|
nextInterval := task.Interval
|
|
if nextInterval <= 0 && m.config != nil {
|
|
nextInterval = m.config.AdaptivePollingBaseInterval
|
|
}
|
|
if nextInterval <= 0 {
|
|
nextInterval = DefaultSchedulerConfig().BaseInterval
|
|
}
|
|
next.Interval = nextInterval
|
|
next.NextRun = time.Now().Add(nextInterval)
|
|
m.taskQueue.Upsert(next)
|
|
return
|
|
}
|
|
for _, next := range tasks {
|
|
m.taskQueue.Upsert(next)
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) sendToDeadLetter(task ScheduledTask, err error) {
|
|
if m.deadLetterQueue == nil {
|
|
log.Error().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Err(err).
|
|
Msg("Dead-letter queue unavailable; dropping task")
|
|
return
|
|
}
|
|
|
|
log.Error().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Err(err).
|
|
Msg("Routing task to dead-letter queue after repeated failures")
|
|
|
|
next := task
|
|
next.Interval = 30 * time.Minute
|
|
next.NextRun = time.Now().Add(next.Interval)
|
|
m.deadLetterQueue.Upsert(next)
|
|
m.updateDeadLetterMetrics()
|
|
|
|
key := schedulerKey(task.InstanceType, task.InstanceName)
|
|
now := time.Now()
|
|
|
|
m.mu.Lock()
|
|
if m.dlqInsightMap == nil {
|
|
m.dlqInsightMap = make(map[string]*dlqInsight)
|
|
}
|
|
info, ok := m.dlqInsightMap[key]
|
|
if !ok {
|
|
info = &dlqInsight{}
|
|
m.dlqInsightMap[key] = info
|
|
}
|
|
if info.FirstAttempt.IsZero() {
|
|
info.FirstAttempt = now
|
|
}
|
|
info.LastAttempt = now
|
|
info.RetryCount++
|
|
info.NextRetry = next.NextRun
|
|
if err != nil {
|
|
info.Reason = classifyDLQReason(err)
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
// classifyDLQReason labels why a task ended up in the dead-letter queue: an
// error that is still classified as retryable only lands here after the retry
// budget is exhausted, so it is reported as "max_retry_attempts"; everything
// else is treated as a permanent failure.
func classifyDLQReason(err error) string {
|
|
if err == nil {
|
|
return ""
|
|
}
|
|
if errors.IsRetryableError(err) {
|
|
return "max_retry_attempts"
|
|
}
|
|
return "permanent_failure"
|
|
}
|
|
|
|
func (m *Monitor) updateDeadLetterMetrics() {
|
|
if m.pollMetrics == nil || m.deadLetterQueue == nil {
|
|
return
|
|
}
|
|
|
|
size := m.deadLetterQueue.Size()
|
|
if size <= 0 {
|
|
m.pollMetrics.UpdateDeadLetterCounts(nil)
|
|
return
|
|
}
|
|
|
|
tasks := m.deadLetterQueue.PeekAll(size)
|
|
m.pollMetrics.UpdateDeadLetterCounts(tasks)
|
|
}
|
|
|
|
func (m *Monitor) updateBreakerMetric(instanceType InstanceType, instance string, breaker *circuitBreaker) {
|
|
if m.pollMetrics == nil || breaker == nil {
|
|
return
|
|
}
|
|
|
|
state, failures, retryAt, _, _ := breaker.stateDetails()
|
|
m.pollMetrics.SetBreakerState(string(instanceType), instance, state, failures, retryAt)
|
|
}
|
|
|
|
func (m *Monitor) randomFloat() float64 {
|
|
if m.rng == nil {
|
|
m.rng = rand.New(rand.NewSource(time.Now().UnixNano()))
|
|
}
|
|
return m.rng.Float64()
|
|
}
|
|
|
|
func (m *Monitor) updateQueueDepthMetric() {
|
|
if m.pollMetrics == nil || m.taskQueue == nil {
|
|
return
|
|
}
|
|
snapshot := m.taskQueue.Snapshot()
|
|
m.pollMetrics.SetQueueDepth(snapshot.Depth)
|
|
m.pollMetrics.UpdateQueueSnapshot(snapshot)
|
|
}
|
|
|
|
func (m *Monitor) allowExecution(task ScheduledTask) bool {
|
|
if m.circuitBreakers == nil {
|
|
return true
|
|
}
|
|
key := schedulerKey(task.InstanceType, task.InstanceName)
|
|
breaker := m.ensureBreaker(key)
|
|
allowed := breaker.allow(time.Now())
|
|
m.updateBreakerMetric(task.InstanceType, task.InstanceName, breaker)
|
|
return allowed
|
|
}
|
|
|
|
func (m *Monitor) ensureBreaker(key string) *circuitBreaker {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.circuitBreakers == nil {
|
|
m.circuitBreakers = make(map[string]*circuitBreaker)
|
|
}
|
|
if breaker, ok := m.circuitBreakers[key]; ok {
|
|
return breaker
|
|
}
|
|
baseRetry := m.breakerBaseRetry
|
|
if baseRetry <= 0 {
|
|
baseRetry = 5 * time.Second
|
|
}
|
|
maxDelay := m.breakerMaxDelay
|
|
if maxDelay <= 0 {
|
|
maxDelay = 5 * time.Minute
|
|
}
|
|
halfOpen := m.breakerHalfOpenWindow
|
|
if halfOpen <= 0 {
|
|
halfOpen = 30 * time.Second
|
|
}
|
|
breaker := newCircuitBreaker(3, baseRetry, maxDelay, halfOpen)
|
|
m.circuitBreakers[key] = breaker
|
|
return breaker
|
|
}
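// Sketch of how the breaker created above is used (the exact transition rules
// live in the circuitBreaker type; this only reflects the call sites in this
// file): allowExecution asks breaker.allow(now) before running a task,
// recordTaskResult feeds recordSuccess/recordFailure back in, and with a
// failure threshold of 3 the breaker presumably opens after three consecutive
// failures, waits somewhere between breakerBaseRetry and breakerMaxDelay, then
// lets a probe through during the half-open window before closing again on
// success.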
|
|
|
|
func (m *Monitor) recordTaskResult(instanceType InstanceType, instance string, pollErr error) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
|
|
key := schedulerKey(instanceType, instance)
|
|
now := time.Now()
|
|
|
|
breaker := m.ensureBreaker(key)
|
|
|
|
m.mu.Lock()
|
|
status, ok := m.pollStatusMap[key]
|
|
if !ok {
|
|
status = &pollStatus{}
|
|
m.pollStatusMap[key] = status
|
|
}
|
|
|
|
if pollErr == nil {
|
|
if m.failureCounts != nil {
|
|
m.failureCounts[key] = 0
|
|
}
|
|
if m.lastOutcome != nil {
|
|
m.lastOutcome[key] = taskOutcome{
|
|
success: true,
|
|
transient: true,
|
|
err: nil,
|
|
recordedAt: now,
|
|
}
|
|
}
|
|
status.LastSuccess = now
|
|
status.ConsecutiveFailures = 0
|
|
status.FirstFailureAt = time.Time{}
|
|
m.mu.Unlock()
|
|
if breaker != nil {
|
|
breaker.recordSuccess()
|
|
m.updateBreakerMetric(instanceType, instance, breaker)
|
|
}
|
|
return
|
|
}
|
|
|
|
transient := isTransientError(pollErr)
|
|
category := "permanent"
|
|
if transient {
|
|
category = "transient"
|
|
}
|
|
if m.failureCounts != nil {
|
|
m.failureCounts[key] = m.failureCounts[key] + 1
|
|
}
|
|
if m.lastOutcome != nil {
|
|
m.lastOutcome[key] = taskOutcome{
|
|
success: false,
|
|
transient: transient,
|
|
err: pollErr,
|
|
recordedAt: now,
|
|
}
|
|
}
|
|
status.LastErrorAt = now
|
|
status.LastErrorMessage = pollErr.Error()
|
|
status.LastErrorCategory = category
|
|
status.ConsecutiveFailures++
|
|
if status.ConsecutiveFailures == 1 {
|
|
status.FirstFailureAt = now
|
|
}
|
|
m.mu.Unlock()
|
|
if breaker != nil {
|
|
breaker.recordFailure(now)
|
|
m.updateBreakerMetric(instanceType, instance, breaker)
|
|
}
|
|
}
|
|
|
|
// SchedulerHealthResponse contains complete scheduler health data for API exposure.
|
|
type SchedulerHealthResponse struct {
|
|
UpdatedAt time.Time `json:"updatedAt"`
|
|
Enabled bool `json:"enabled"`
|
|
Queue QueueSnapshot `json:"queue"`
|
|
DeadLetter DeadLetterSnapshot `json:"deadLetter"`
|
|
Breakers []BreakerSnapshot `json:"breakers,omitempty"`
|
|
Staleness []StalenessSnapshot `json:"staleness,omitempty"`
|
|
Instances []InstanceHealth `json:"instances"`
|
|
}
|
|
|
|
// DeadLetterSnapshot contains dead-letter queue data.
|
|
type DeadLetterSnapshot struct {
|
|
Count int `json:"count"`
|
|
Tasks []DeadLetterTask `json:"tasks"`
|
|
}
|
|
|
|
// SchedulerHealth returns a complete snapshot of scheduler health for API exposure.
|
|
func (m *Monitor) SchedulerHealth() SchedulerHealthResponse {
|
|
response := SchedulerHealthResponse{
|
|
UpdatedAt: time.Now(),
|
|
Enabled: m.config != nil && m.config.AdaptivePollingEnabled,
|
|
}
|
|
|
|
// Queue snapshot
|
|
if m.taskQueue != nil {
|
|
response.Queue = m.taskQueue.Snapshot()
|
|
if m.pollMetrics != nil {
|
|
m.pollMetrics.UpdateQueueSnapshot(response.Queue)
|
|
}
|
|
}
|
|
|
|
// Dead-letter queue snapshot
|
|
if m.deadLetterQueue != nil {
|
|
deadLetterTasks := m.deadLetterQueue.PeekAll(25) // limit to top 25
|
|
m.mu.RLock()
|
|
for i := range deadLetterTasks {
|
|
key := schedulerKey(InstanceType(deadLetterTasks[i].Type), deadLetterTasks[i].Instance)
|
|
if outcome, ok := m.lastOutcome[key]; ok && outcome.err != nil {
|
|
deadLetterTasks[i].LastError = outcome.err.Error()
|
|
}
|
|
if count, ok := m.failureCounts[key]; ok {
|
|
deadLetterTasks[i].Failures = count
|
|
}
|
|
}
|
|
m.mu.RUnlock()
|
|
response.DeadLetter = DeadLetterSnapshot{
|
|
Count: m.deadLetterQueue.Size(),
|
|
Tasks: deadLetterTasks,
|
|
}
|
|
m.updateDeadLetterMetrics()
|
|
}
|
|
|
|
// Circuit breaker snapshots
|
|
m.mu.RLock()
|
|
breakerSnapshots := make([]BreakerSnapshot, 0, len(m.circuitBreakers))
|
|
for key, breaker := range m.circuitBreakers {
|
|
state, failures, retryAt := breaker.State()
|
|
// Only include breakers that are not in default closed state with 0 failures
|
|
if state != "closed" || failures > 0 {
|
|
// Parse instance type and name from key
|
|
parts := strings.SplitN(key, "::", 2)
|
|
instanceType, instanceName := "unknown", key
|
|
if len(parts) == 2 {
|
|
instanceType, instanceName = parts[0], parts[1]
|
|
}
|
|
breakerSnapshots = append(breakerSnapshots, BreakerSnapshot{
|
|
Instance: instanceName,
|
|
Type: instanceType,
|
|
State: state,
|
|
Failures: failures,
|
|
RetryAt: retryAt,
|
|
})
|
|
}
|
|
}
|
|
m.mu.RUnlock()
|
|
response.Breakers = breakerSnapshots
|
|
|
|
// Staleness snapshots
|
|
if m.stalenessTracker != nil {
|
|
response.Staleness = m.stalenessTracker.Snapshot()
|
|
}
|
|
|
|
instanceInfos := make(map[string]*instanceInfo)
|
|
pollStatuses := make(map[string]pollStatus)
|
|
dlqInsights := make(map[string]dlqInsight)
|
|
breakerRefs := make(map[string]*circuitBreaker)
|
|
|
|
m.mu.RLock()
|
|
for k, v := range m.instanceInfoCache {
|
|
if v == nil {
|
|
continue
|
|
}
|
|
copyVal := *v
instanceInfos[k] = &copyVal
|
|
}
|
|
for k, v := range m.pollStatusMap {
|
|
if v == nil {
|
|
continue
|
|
}
|
|
pollStatuses[k] = *v
|
|
}
|
|
for k, v := range m.dlqInsightMap {
|
|
if v == nil {
|
|
continue
|
|
}
|
|
dlqInsights[k] = *v
|
|
}
|
|
for k, v := range m.circuitBreakers {
|
|
if v != nil {
|
|
breakerRefs[k] = v
|
|
}
|
|
}
|
|
m.mu.RUnlock()
|
|
for key, breaker := range breakerRefs {
|
|
instanceType := InstanceType("unknown")
|
|
instanceName := key
|
|
if parts := strings.SplitN(key, "::", 2); len(parts) == 2 {
|
|
if parts[0] != "" {
|
|
instanceType = InstanceType(parts[0])
|
|
}
|
|
if parts[1] != "" {
|
|
instanceName = parts[1]
|
|
}
|
|
}
|
|
m.updateBreakerMetric(instanceType, instanceName, breaker)
|
|
}
|
|
|
|
keySet := make(map[string]struct{})
|
|
for k := range instanceInfos {
|
|
if k != "" {
|
|
keySet[k] = struct{}{}
|
|
}
|
|
}
|
|
for k := range pollStatuses {
|
|
if k != "" {
|
|
keySet[k] = struct{}{}
|
|
}
|
|
}
|
|
for k := range dlqInsights {
|
|
if k != "" {
|
|
keySet[k] = struct{}{}
|
|
}
|
|
}
|
|
for k := range breakerRefs {
|
|
if k != "" {
|
|
keySet[k] = struct{}{}
|
|
}
|
|
}
|
|
for _, task := range response.DeadLetter.Tasks {
|
|
if task.Instance == "" {
|
|
continue
|
|
}
|
|
keySet[schedulerKey(InstanceType(task.Type), task.Instance)] = struct{}{}
|
|
}
|
|
for _, snap := range response.Staleness {
|
|
if snap.Instance == "" {
|
|
continue
|
|
}
|
|
keySet[schedulerKey(InstanceType(snap.Type), snap.Instance)] = struct{}{}
|
|
}
|
|
|
|
if len(keySet) > 0 {
|
|
keys := make([]string, 0, len(keySet))
|
|
for k := range keySet {
|
|
keys = append(keys, k)
|
|
}
|
|
sort.Strings(keys)
|
|
|
|
instances := make([]InstanceHealth, 0, len(keys))
|
|
for _, key := range keys {
|
|
instType := "unknown"
|
|
instName := key
|
|
if parts := strings.SplitN(key, "::", 2); len(parts) == 2 {
|
|
if parts[0] != "" {
|
|
instType = parts[0]
|
|
}
|
|
if parts[1] != "" {
|
|
instName = parts[1]
|
|
}
|
|
}
|
|
instType = strings.TrimSpace(instType)
|
|
instName = strings.TrimSpace(instName)
|
|
|
|
info := instanceInfos[key]
|
|
display := instName
|
|
connection := ""
|
|
if info != nil {
|
|
if instType == "unknown" || instType == "" {
|
|
if info.Type != "" {
|
|
instType = string(info.Type)
|
|
}
|
|
}
|
|
if strings.Contains(info.Key, "::") {
|
|
if parts := strings.SplitN(info.Key, "::", 2); len(parts) == 2 {
|
|
if instName == key {
|
|
instName = parts[1]
|
|
}
|
|
if (instType == "" || instType == "unknown") && parts[0] != "" {
|
|
instType = parts[0]
|
|
}
|
|
}
|
|
}
|
|
if info.DisplayName != "" {
|
|
display = info.DisplayName
|
|
}
|
|
if info.Connection != "" {
|
|
connection = info.Connection
|
|
}
|
|
}
|
|
display = strings.TrimSpace(display)
|
|
connection = strings.TrimSpace(connection)
|
|
if display == "" {
|
|
display = instName
|
|
}
|
|
if display == "" {
|
|
display = connection
|
|
}
|
|
if instType == "" {
|
|
instType = "unknown"
|
|
}
|
|
if instName == "" {
|
|
instName = key
|
|
}
|
|
|
|
status, hasStatus := pollStatuses[key]
|
|
instanceStatus := InstancePollStatus{}
|
|
if hasStatus {
|
|
instanceStatus.ConsecutiveFailures = status.ConsecutiveFailures
|
|
instanceStatus.LastSuccess = timePtr(status.LastSuccess)
|
|
if !status.FirstFailureAt.IsZero() {
|
|
instanceStatus.FirstFailureAt = timePtr(status.FirstFailureAt)
|
|
}
|
|
if !status.LastErrorAt.IsZero() && status.LastErrorMessage != "" {
|
|
instanceStatus.LastError = &ErrorDetail{
|
|
At: status.LastErrorAt,
|
|
Message: status.LastErrorMessage,
|
|
Category: status.LastErrorCategory,
|
|
}
|
|
}
|
|
}
|
|
|
|
breakerInfo := InstanceBreaker{
|
|
State: "closed",
|
|
FailureCount: 0,
|
|
}
|
|
if br, ok := breakerRefs[key]; ok && br != nil {
|
|
state, failures, retryAt, since, lastTransition := br.stateDetails()
|
|
if state != "" {
|
|
breakerInfo.State = state
|
|
}
|
|
breakerInfo.FailureCount = failures
|
|
breakerInfo.RetryAt = timePtr(retryAt)
|
|
breakerInfo.Since = timePtr(since)
|
|
breakerInfo.LastTransition = timePtr(lastTransition)
|
|
}
|
|
|
|
dlqInfo := InstanceDLQ{Present: false}
|
|
if dlq, ok := dlqInsights[key]; ok {
|
|
dlqInfo.Present = true
|
|
dlqInfo.Reason = dlq.Reason
|
|
dlqInfo.FirstAttempt = timePtr(dlq.FirstAttempt)
|
|
dlqInfo.LastAttempt = timePtr(dlq.LastAttempt)
|
|
dlqInfo.RetryCount = dlq.RetryCount
|
|
dlqInfo.NextRetry = timePtr(dlq.NextRetry)
|
|
}
|
|
|
|
instances = append(instances, InstanceHealth{
|
|
Key: key,
|
|
Type: instType,
|
|
DisplayName: display,
|
|
Instance: instName,
|
|
Connection: connection,
|
|
PollStatus: instanceStatus,
|
|
Breaker: breakerInfo,
|
|
DeadLetter: dlqInfo,
|
|
})
|
|
}
|
|
|
|
response.Instances = instances
|
|
} else {
|
|
response.Instances = []InstanceHealth{}
|
|
}
|
|
|
|
return response
|
|
}
|
|
|
|
func isTransientError(err error) bool {
|
|
if err == nil {
|
|
return true
|
|
}
|
|
if errors.IsRetryableError(err) {
|
|
return true
|
|
}
|
|
if stderrors.Is(err, context.Canceled) || stderrors.Is(err, context.DeadlineExceeded) {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func shouldTryPortlessFallback(err error) bool {
|
|
if err == nil {
|
|
return false
|
|
}
|
|
msg := strings.ToLower(err.Error())
|
|
if strings.Contains(msg, "connection refused") ||
|
|
strings.Contains(msg, "connection reset") ||
|
|
strings.Contains(msg, "no such host") ||
|
|
strings.Contains(msg, "client.timeout exceeded") ||
|
|
strings.Contains(msg, "i/o timeout") ||
|
|
strings.Contains(msg, "context deadline exceeded") {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// retryPVEPortFallback handles the case where a normalized :8006 host is unreachable
|
|
// because the actual endpoint is fronted by a reverse proxy on 443. If the initial
|
|
// GetNodes call fails with a connection error and the host has the default PVE port,
|
|
// retry without the default port to hit the proxy. On success, swap the client so
|
|
// subsequent polls reuse the working endpoint.
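//
// Illustrative example (hostnames are made up): a node configured as
// "https://pve.example.com:8006" that actually sits behind a reverse proxy on
// 443 fails GetNodes with e.g. "connection refused"; the fallback retries
// "https://pve.example.com", and on success the host is persisted as
// "https://pve.example.com:443" so reloads do not normalize it back to :8006.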
|
|
func (m *Monitor) retryPVEPortFallback(ctx context.Context, instanceName string, instanceCfg *config.PVEInstance, currentClient PVEClientInterface, cause error) ([]proxmox.Node, PVEClientInterface, error) {
|
|
if instanceCfg == nil || !shouldTryPortlessFallback(cause) {
|
|
return nil, currentClient, cause
|
|
}
|
|
|
|
fallbackHost := config.StripDefaultPort(instanceCfg.Host, config.DefaultPVEPort)
|
|
if fallbackHost == "" || fallbackHost == instanceCfg.Host {
|
|
return nil, currentClient, cause
|
|
}
|
|
|
|
clientCfg := config.CreateProxmoxConfigWithHost(instanceCfg, fallbackHost, false)
|
|
if clientCfg.Timeout <= 0 {
|
|
clientCfg.Timeout = m.config.ConnectionTimeout
|
|
}
|
|
|
|
fallbackClient, err := proxmox.NewClient(clientCfg)
|
|
if err != nil {
|
|
return nil, currentClient, cause
|
|
}
|
|
|
|
fallbackNodes, err := fallbackClient.GetNodes(ctx)
|
|
if err != nil {
|
|
return nil, currentClient, cause
|
|
}
|
|
|
|
// Switch to the working host for the remainder of the poll (and future polls)
|
|
primaryHost := instanceCfg.Host
|
|
|
|
// Persist with an explicit port to avoid re-normalization back to :8006 on reloads.
|
|
persistHost := fallbackHost
|
|
if parsed, err := url.Parse(fallbackHost); err == nil && parsed.Host != "" && parsed.Port() == "" {
|
|
port := "443"
|
|
if strings.EqualFold(parsed.Scheme, "http") {
|
|
port = "80"
|
|
}
|
|
parsed.Host = net.JoinHostPort(parsed.Hostname(), port)
|
|
persistHost = parsed.Scheme + "://" + parsed.Host
|
|
}
|
|
|
|
instanceCfg.Host = persistHost
|
|
m.pveClients[instanceName] = fallbackClient
|
|
|
|
// Update in-memory config so subsequent polls build clients against the working port.
|
|
for i := range m.config.PVEInstances {
|
|
if m.config.PVEInstances[i].Name == instanceName {
|
|
m.config.PVEInstances[i].Host = persistHost
|
|
break
|
|
}
|
|
}
|
|
|
|
// Persist to disk so restarts keep the working endpoint.
|
|
if m.persistence != nil {
|
|
if err := m.persistence.SaveNodesConfig(m.config.PVEInstances, m.config.PBSInstances, m.config.PMGInstances); err != nil {
|
|
log.Warn().Err(err).Str("instance", instanceName).Msg("Failed to persist fallback PVE host")
|
|
}
|
|
}
|
|
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Str("primary", primaryHost).
|
|
Str("fallback", persistHost).
|
|
Msg("Primary PVE host failed; using fallback without default port")
|
|
|
|
return fallbackNodes, fallbackClient, nil
|
|
}
|
|
|
|
// pollPVEInstance polls a single PVE instance
|
|
func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, client PVEClientInterface) {
|
|
defer recoverFromPanic(fmt.Sprintf("pollPVEInstance-%s", instanceName))
|
|
|
|
start := time.Now()
|
|
debugEnabled := logging.IsLevelEnabled(zerolog.DebugLevel)
|
|
var pollErr error
|
|
if m.pollMetrics != nil {
|
|
m.pollMetrics.IncInFlight("pve")
|
|
defer m.pollMetrics.DecInFlight("pve")
|
|
defer func() {
|
|
m.pollMetrics.RecordResult(PollResult{
|
|
InstanceName: instanceName,
|
|
InstanceType: "pve",
|
|
Success: pollErr == nil,
|
|
Error: pollErr,
|
|
StartTime: start,
|
|
EndTime: time.Now(),
|
|
})
|
|
}()
|
|
}
|
|
if m.stalenessTracker != nil {
|
|
defer func() {
|
|
if pollErr == nil {
|
|
m.stalenessTracker.UpdateSuccess(InstanceTypePVE, instanceName, nil)
|
|
} else {
|
|
m.stalenessTracker.UpdateError(InstanceTypePVE, instanceName)
|
|
}
|
|
}()
|
|
}
|
|
defer m.recordTaskResult(InstanceTypePVE, instanceName, pollErr)
|
|
|
|
// Check if context is cancelled
|
|
select {
|
|
case <-ctx.Done():
|
|
pollErr = ctx.Err()
|
|
if debugEnabled {
|
|
log.Debug().Str("instance", instanceName).Msg("Polling cancelled")
|
|
}
|
|
return
|
|
default:
|
|
}
|
|
|
|
if debugEnabled {
|
|
log.Debug().Str("instance", instanceName).Msg("Polling PVE instance")
|
|
}
|
|
|
|
// Get instance config
|
|
var instanceCfg *config.PVEInstance
|
|
for _, cfg := range m.config.PVEInstances {
|
|
if cfg.Name == instanceName {
|
|
instanceCfg = &cfg
|
|
break
|
|
}
|
|
}
|
|
if instanceCfg == nil {
|
|
pollErr = fmt.Errorf("pve instance config not found for %s", instanceName)
|
|
return
|
|
}
|
|
|
|
// Poll nodes
|
|
nodes, err := client.GetNodes(ctx)
|
|
if err != nil {
|
|
if fallbackNodes, fallbackClient, fallbackErr := m.retryPVEPortFallback(ctx, instanceName, instanceCfg, client, err); fallbackErr == nil {
|
|
client = fallbackClient
|
|
nodes = fallbackNodes
|
|
} else {
|
|
monErr := errors.WrapConnectionError("poll_nodes", instanceName, err)
|
|
pollErr = monErr
|
|
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to get nodes")
|
|
m.state.SetConnectionHealth(instanceName, false)
|
|
|
|
// Track auth failure if it's an authentication error
|
|
if errors.IsAuthError(err) {
|
|
m.recordAuthFailure(instanceName, "pve")
|
|
}
|
|
return
|
|
}
|
|
}
|
|
|
|
// Reset auth failures on successful connection
|
|
m.resetAuthFailures(instanceName, "pve")
|
|
|
|
// Check if client is a ClusterClient to determine health status
|
|
connectionHealthStr := "healthy"
|
|
if clusterClient, ok := client.(*proxmox.ClusterClient); ok {
|
|
// For cluster clients, check if all endpoints are healthy
|
|
healthStatus := clusterClient.GetHealthStatus()
|
|
healthyCount := 0
|
|
totalCount := len(healthStatus)
|
|
|
|
for _, isHealthy := range healthStatus {
|
|
if isHealthy {
|
|
healthyCount++
|
|
}
|
|
}
|
|
|
|
if healthyCount == 0 {
|
|
// All endpoints are down
|
|
connectionHealthStr = "error"
|
|
m.state.SetConnectionHealth(instanceName, false)
|
|
} else if healthyCount < totalCount {
|
|
// Some endpoints are down - degraded state
|
|
connectionHealthStr = "degraded"
|
|
m.state.SetConnectionHealth(instanceName, true) // Still functional but degraded
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Int("healthy", healthyCount).
|
|
Int("total", totalCount).
|
|
Msg("Cluster is in degraded state - some nodes are unreachable")
|
|
} else {
|
|
// All endpoints are healthy
|
|
connectionHealthStr = "healthy"
|
|
m.state.SetConnectionHealth(instanceName, true)
|
|
}
|
|
} else {
|
|
// Regular client - simple healthy/unhealthy
|
|
m.state.SetConnectionHealth(instanceName, true)
|
|
}
|
|
|
|
// Capture previous memory metrics so we can preserve them if detailed status fails
|
|
prevState := m.GetState()
|
|
prevNodeMemory := make(map[string]models.Memory)
|
|
prevInstanceNodes := make([]models.Node, 0)
|
|
for _, existingNode := range prevState.Nodes {
|
|
if existingNode.Instance != instanceName {
|
|
continue
|
|
}
|
|
prevNodeMemory[existingNode.ID] = existingNode.Memory
|
|
prevInstanceNodes = append(prevInstanceNodes, existingNode)
|
|
}
|
|
|
|
// Convert to models
|
|
var modelNodes []models.Node
|
|
nodeEffectiveStatus := make(map[string]string) // Track effective status (with grace period) for each node
|
|
// Parallel node polling
|
|
type nodePollResult struct {
|
|
node models.Node
|
|
effectiveStatus string
|
|
}
|
|
|
|
resultChan := make(chan nodePollResult, len(nodes))
|
|
var wg sync.WaitGroup
|
|
|
|
if debugEnabled {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Int("nodes", len(nodes)).
|
|
Msg("Starting parallel node polling")
|
|
}
|
|
|
|
for _, node := range nodes {
|
|
wg.Add(1)
|
|
go func(node proxmox.Node) {
|
|
defer wg.Done()
|
|
|
|
modelNode, effectiveStatus, _ := m.pollPVENode(ctx, instanceName, instanceCfg, client, node, connectionHealthStr, prevNodeMemory, prevInstanceNodes)
|
|
|
|
resultChan <- nodePollResult{
|
|
node: modelNode,
|
|
effectiveStatus: effectiveStatus,
|
|
}
|
|
}(node)
|
|
}
|
|
|
|
wg.Wait()
|
|
close(resultChan)
|
|
|
|
for res := range resultChan {
|
|
modelNodes = append(modelNodes, res.node)
|
|
nodeEffectiveStatus[res.node.Name] = res.effectiveStatus
|
|
}
|
|
|
|
if len(modelNodes) == 0 && len(prevInstanceNodes) > 0 {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Int("previousCount", len(prevInstanceNodes)).
|
|
Msg("No Proxmox nodes returned this cycle - preserving previous state")
|
|
|
|
// Mark connection health as degraded to reflect polling failure
|
|
m.state.SetConnectionHealth(instanceName, false)
|
|
|
|
preserved := make([]models.Node, 0, len(prevInstanceNodes))
|
|
for _, prevNode := range prevInstanceNodes {
|
|
nodeCopy := prevNode
|
|
nodeCopy.Status = "offline"
|
|
nodeCopy.ConnectionHealth = "error"
|
|
nodeCopy.Uptime = 0
|
|
nodeCopy.CPU = 0
|
|
preserved = append(preserved, nodeCopy)
|
|
}
|
|
modelNodes = preserved
|
|
}
|
|
|
|
// Update state first so we have nodes available
|
|
m.state.UpdateNodesForInstance(instanceName, modelNodes)
|
|
|
|
// Now get storage data to use as fallback for disk metrics if needed
|
|
storageByNode := make(map[string]models.Disk)
|
|
if instanceCfg.MonitorStorage {
|
|
_, err := client.GetAllStorage(ctx)
|
|
if err == nil {
|
|
for _, node := range nodes {
|
|
// Skip offline nodes to avoid 595 errors
|
|
if nodeEffectiveStatus[node.Node] != "online" {
|
|
continue
|
|
}
|
|
|
|
nodeStorages, err := client.GetStorage(ctx, node.Node)
|
|
if err == nil {
|
|
// Look for local or local-lvm storage as most stable disk metric
|
|
for _, storage := range nodeStorages {
|
|
if reason, skip := readOnlyFilesystemReason(storage.Type, storage.Total, storage.Used); skip {
|
|
log.Debug().
|
|
Str("node", node.Node).
|
|
Str("storage", storage.Storage).
|
|
Str("type", storage.Type).
|
|
Str("skipReason", reason).
|
|
Uint64("total", storage.Total).
|
|
Uint64("used", storage.Used).
|
|
Msg("Skipping read-only storage while building disk fallback")
|
|
continue
|
|
}
|
|
if storage.Storage == "local" || storage.Storage == "local-lvm" {
|
|
disk := models.Disk{
|
|
Total: int64(storage.Total),
|
|
Used: int64(storage.Used),
|
|
Free: int64(storage.Available),
|
|
Usage: safePercentage(float64(storage.Used), float64(storage.Total)),
|
|
}
|
|
// Prefer "local" over "local-lvm"
|
|
if _, exists := storageByNode[node.Node]; !exists || storage.Storage == "local" {
|
|
storageByNode[node.Node] = disk
|
|
log.Debug().
|
|
Str("node", node.Node).
|
|
Str("storage", storage.Storage).
|
|
Float64("usage", disk.Usage).
|
|
Msg("Using storage for disk metrics fallback")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Poll physical disks for health monitoring (enabled by default unless explicitly disabled)
|
|
// Skip if MonitorPhysicalDisks is explicitly set to false
|
|
if instanceCfg.MonitorPhysicalDisks != nil && !*instanceCfg.MonitorPhysicalDisks {
|
|
log.Debug().Str("instance", instanceName).Msg("Physical disk monitoring explicitly disabled")
|
|
// Keep any existing disk data visible (don't clear it)
|
|
} else {
|
|
// Enabled by default (when nil or true)
|
|
// Determine polling interval (default 5 minutes to avoid spinning up HDDs too frequently)
|
|
pollingInterval := 5 * time.Minute
|
|
if instanceCfg.PhysicalDiskPollingMinutes > 0 {
|
|
pollingInterval = time.Duration(instanceCfg.PhysicalDiskPollingMinutes) * time.Minute
|
|
}
|
|
|
|
// Check if enough time has elapsed since last poll
|
|
m.mu.Lock()
|
|
lastPoll, exists := m.lastPhysicalDiskPoll[instanceName]
|
|
shouldPoll := !exists || time.Since(lastPoll) >= pollingInterval
|
|
if shouldPoll {
|
|
m.lastPhysicalDiskPoll[instanceName] = time.Now()
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
if !shouldPoll {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Dur("sinceLastPoll", time.Since(lastPoll)).
|
|
Dur("interval", pollingInterval).
|
|
Msg("Skipping physical disk poll - interval not elapsed")
|
|
// Refresh NVMe temperatures using the latest sensor data even when we skip the disk poll
|
|
currentState := m.state.GetSnapshot()
|
|
existing := make([]models.PhysicalDisk, 0)
|
|
for _, disk := range currentState.PhysicalDisks {
|
|
if disk.Instance == instanceName {
|
|
existing = append(existing, disk)
|
|
}
|
|
}
|
|
if len(existing) > 0 {
|
|
updated := mergeNVMeTempsIntoDisks(existing, modelNodes)
|
|
m.state.UpdatePhysicalDisks(instanceName, updated)
|
|
}
|
|
} else {
|
|
log.Debug().
|
|
Int("nodeCount", len(nodes)).
|
|
Dur("interval", pollingInterval).
|
|
Msg("Starting disk health polling")
|
|
|
|
// Get existing disks from state to preserve data for offline nodes
|
|
currentState := m.state.GetSnapshot()
|
|
existingDisksMap := make(map[string]models.PhysicalDisk)
|
|
for _, disk := range currentState.PhysicalDisks {
|
|
if disk.Instance == instanceName {
|
|
existingDisksMap[disk.ID] = disk
|
|
}
|
|
}
|
|
|
|
var allDisks []models.PhysicalDisk
|
|
polledNodes := make(map[string]bool) // Track which nodes we successfully polled
|
|
|
|
for _, node := range nodes {
|
|
// Skip offline nodes but preserve their existing disk data
|
|
if nodeEffectiveStatus[node.Node] != "online" {
|
|
log.Debug().Str("node", node.Node).Msg("Skipping disk poll for offline node - preserving existing data")
|
|
continue
|
|
}
|
|
|
|
// Get disk list for this node
|
|
log.Debug().Str("node", node.Node).Msg("Getting disk list for node")
|
|
disks, err := client.GetDisks(ctx, node.Node)
|
|
if err != nil {
|
|
// Check if it's a permission error or if the endpoint doesn't exist
|
|
if strings.Contains(err.Error(), "401") || strings.Contains(err.Error(), "403") {
|
|
log.Warn().
|
|
Str("node", node.Node).
|
|
Err(err).
|
|
Msg("Insufficient permissions to access disk information - check API token permissions")
|
|
} else if strings.Contains(err.Error(), "404") || strings.Contains(err.Error(), "501") {
|
|
log.Info().
|
|
Str("node", node.Node).
|
|
Msg("Disk monitoring not available on this node (may be using non-standard storage)")
|
|
} else {
|
|
log.Warn().
|
|
Str("node", node.Node).
|
|
Err(err).
|
|
Msg("Failed to get disk list")
|
|
}
|
|
continue
|
|
}
|
|
|
|
log.Debug().
|
|
Str("node", node.Node).
|
|
Int("diskCount", len(disks)).
|
|
Msg("Got disk list for node")
|
|
|
|
// Mark this node as successfully polled
|
|
polledNodes[node.Node] = true
|
|
|
|
// Check each disk for health issues and add to state
|
|
for _, disk := range disks {
|
|
// Create PhysicalDisk model
|
|
diskID := fmt.Sprintf("%s-%s-%s", instanceName, node.Node, strings.ReplaceAll(disk.DevPath, "/", "-"))
|
|
physicalDisk := models.PhysicalDisk{
|
|
ID: diskID,
|
|
Node: node.Node,
|
|
Instance: instanceName,
|
|
DevPath: disk.DevPath,
|
|
Model: disk.Model,
|
|
Serial: disk.Serial,
|
|
WWN: disk.WWN,
|
|
Type: disk.Type,
|
|
Size: disk.Size,
|
|
Health: disk.Health,
|
|
Wearout: disk.Wearout,
|
|
RPM: disk.RPM,
|
|
Used: disk.Used,
|
|
LastChecked: time.Now(),
|
|
}
|
|
|
|
allDisks = append(allDisks, physicalDisk)
|
|
|
|
log.Debug().
|
|
Str("node", node.Node).
|
|
Str("disk", disk.DevPath).
|
|
Str("model", disk.Model).
|
|
Str("health", disk.Health).
|
|
Int("wearout", disk.Wearout).
|
|
Msg("Checking disk health")
|
|
|
|
normalizedHealth := strings.ToUpper(strings.TrimSpace(disk.Health))
|
|
if normalizedHealth != "" && normalizedHealth != "UNKNOWN" && normalizedHealth != "PASSED" && normalizedHealth != "OK" {
|
|
// Disk has failed or is failing - alert manager will handle this
|
|
log.Warn().
|
|
Str("node", node.Node).
|
|
Str("disk", disk.DevPath).
|
|
Str("model", disk.Model).
|
|
Str("health", disk.Health).
|
|
Int("wearout", disk.Wearout).
|
|
Msg("Disk health issue detected")
|
|
|
|
// Pass disk info to alert manager
|
|
m.alertManager.CheckDiskHealth(instanceName, node.Node, disk)
|
|
} else if disk.Wearout > 0 && disk.Wearout < 10 {
|
|
// Low wearout warning (less than 10% life remaining)
|
|
log.Warn().
|
|
Str("node", node.Node).
|
|
Str("disk", disk.DevPath).
|
|
Str("model", disk.Model).
|
|
Int("wearout", disk.Wearout).
|
|
Msg("SSD wearout critical - less than 10% life remaining")
|
|
|
|
// Pass to alert manager for wearout alert
|
|
m.alertManager.CheckDiskHealth(instanceName, node.Node, disk)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Preserve existing disk data for nodes that weren't polled (offline or error)
|
|
for _, existingDisk := range existingDisksMap {
|
|
// Only preserve if we didn't poll this node
|
|
if !polledNodes[existingDisk.Node] {
|
|
// Keep the existing disk data but update the LastChecked to indicate it's stale
|
|
allDisks = append(allDisks, existingDisk)
|
|
log.Debug().
|
|
Str("node", existingDisk.Node).
|
|
Str("disk", existingDisk.DevPath).
|
|
Msg("Preserving existing disk data for unpolled node")
|
|
}
|
|
}
|
|
|
|
allDisks = mergeNVMeTempsIntoDisks(allDisks, modelNodes)
|
|
|
|
// Update physical disks in state
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Int("diskCount", len(allDisks)).
|
|
Int("preservedCount", len(existingDisksMap)-len(polledNodes)).
|
|
Msg("Updating physical disks in state")
|
|
m.state.UpdatePhysicalDisks(instanceName, allDisks)
|
|
}
|
|
}
|
|
// Note: Physical disk monitoring is now enabled by default with a 5-minute polling interval.
|
|
// Users can explicitly disable it in node settings. Disk data is preserved between polls.
|
|
|
|
// Update nodes with storage fallback if rootfs was not available
|
|
for i := range modelNodes {
|
|
if modelNodes[i].Disk.Total == 0 {
|
|
if disk, exists := storageByNode[modelNodes[i].Name]; exists {
|
|
modelNodes[i].Disk = disk
|
|
log.Debug().
|
|
Str("node", modelNodes[i].Name).
|
|
Float64("usage", disk.Usage).
|
|
Msg("Applied storage fallback for disk metrics")
|
|
}
|
|
}
|
|
|
|
if modelNodes[i].Status == "online" {
|
|
// Record node metrics history only for online nodes
|
|
now := time.Now()
|
|
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "cpu", modelNodes[i].CPU*100, now)
|
|
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "memory", modelNodes[i].Memory.Usage, now)
|
|
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "disk", modelNodes[i].Disk.Usage, now)
|
|
}
|
|
|
|
// Check thresholds for alerts
|
|
m.alertManager.CheckNode(modelNodes[i])
|
|
}
|
|
|
|
// Update state again with corrected disk metrics
|
|
m.state.UpdateNodesForInstance(instanceName, modelNodes)
|
|
|
|
// Clean up alerts for nodes that no longer exist
|
|
// Get all nodes from the global state (includes all instances)
|
|
existingNodes := make(map[string]bool)
|
|
allState := m.state.GetSnapshot()
|
|
for _, node := range allState.Nodes {
|
|
existingNodes[node.Name] = true
|
|
}
|
|
m.alertManager.CleanupAlertsForNodes(existingNodes)
|
|
|
|
// Periodically re-check cluster status for nodes marked as standalone
|
|
// This addresses issue #437 where clusters aren't detected on first attempt
|
|
if !instanceCfg.IsCluster {
|
|
// Check every 5 minutes if this is actually a cluster
|
|
if time.Since(m.lastClusterCheck[instanceName]) > 5*time.Minute {
|
|
m.lastClusterCheck[instanceName] = time.Now()
|
|
|
|
// Try to detect if this is actually a cluster
|
|
isActuallyCluster, checkErr := client.IsClusterMember(ctx)
|
|
if checkErr == nil && isActuallyCluster {
|
|
// This node is actually part of a cluster!
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Msg("Detected that standalone node is actually part of a cluster - updating configuration")
|
|
|
|
// Update the configuration
|
|
for i := range m.config.PVEInstances {
|
|
if m.config.PVEInstances[i].Name == instanceName {
|
|
m.config.PVEInstances[i].IsCluster = true
|
|
// Note: We can't get the cluster name here without direct client access
|
|
// It will be detected on the next configuration update
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Msg("Marked node as cluster member - cluster name will be detected on next update")
|
|
|
|
// Save the updated configuration
|
|
if m.persistence != nil {
|
|
if err := m.persistence.SaveNodesConfig(m.config.PVEInstances, m.config.PBSInstances, m.config.PMGInstances); err != nil {
|
|
log.Warn().Err(err).Msg("Failed to persist updated node configuration")
|
|
}
|
|
}
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update cluster endpoint online status if this is a cluster
|
|
if instanceCfg.IsCluster && len(instanceCfg.ClusterEndpoints) > 0 {
|
|
// Create a map of online nodes from our polling results
|
|
onlineNodes := make(map[string]bool)
|
|
for _, node := range modelNodes {
|
|
// Node is online if we successfully got its data
|
|
onlineNodes[node.Name] = node.Status == "online"
|
|
}
|
|
|
|
// Get Pulse connectivity status from ClusterClient if available
|
|
var pulseHealth map[string]proxmox.EndpointHealth
|
|
if clusterClient, ok := client.(*proxmox.ClusterClient); ok {
|
|
pulseHealth = clusterClient.GetHealthStatusWithErrors()
|
|
}
|
|
|
|
// Update the online status for each cluster endpoint
|
|
hasFingerprint := instanceCfg.Fingerprint != ""
|
|
for i := range instanceCfg.ClusterEndpoints {
|
|
if online, exists := onlineNodes[instanceCfg.ClusterEndpoints[i].NodeName]; exists {
|
|
instanceCfg.ClusterEndpoints[i].Online = online
|
|
if online {
|
|
instanceCfg.ClusterEndpoints[i].LastSeen = time.Now()
|
|
}
|
|
}
|
|
|
|
// Update Pulse connectivity status
|
|
if pulseHealth != nil {
|
|
// Try to find the endpoint in the health map by matching the effective URL
|
|
endpointURL := clusterEndpointEffectiveURL(instanceCfg.ClusterEndpoints[i], instanceCfg.VerifySSL, hasFingerprint)
|
|
if health, exists := pulseHealth[endpointURL]; exists {
|
|
reachable := health.Healthy
|
|
instanceCfg.ClusterEndpoints[i].PulseReachable = &reachable
|
|
if !health.LastCheck.IsZero() {
|
|
instanceCfg.ClusterEndpoints[i].LastPulseCheck = &health.LastCheck
|
|
}
|
|
instanceCfg.ClusterEndpoints[i].PulseError = health.LastError
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update the config with the new online status
|
|
// This is needed so the UI can reflect the current status
|
|
for idx, cfg := range m.config.PVEInstances {
|
|
if cfg.Name == instanceName {
|
|
m.config.PVEInstances[idx].ClusterEndpoints = instanceCfg.ClusterEndpoints
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// Poll VMs and containers together using cluster/resources for efficiency
|
|
if instanceCfg.MonitorVMs || instanceCfg.MonitorContainers {
|
|
select {
|
|
case <-ctx.Done():
|
|
pollErr = ctx.Err()
|
|
return
|
|
default:
|
|
// Always try the efficient cluster/resources endpoint first
|
|
// This endpoint works on both clustered and standalone nodes
|
|
// Testing confirmed it works on standalone nodes like pimox
|
|
useClusterEndpoint := m.pollVMsAndContainersEfficient(ctx, instanceName, client, nodeEffectiveStatus)
|
|
|
|
if !useClusterEndpoint {
|
|
// Fall back to traditional polling only if cluster/resources not available
|
|
// This should be rare - only for very old Proxmox versions
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Msg("cluster/resources endpoint not available, using traditional polling")
|
|
|
|
// Check if configuration needs updating
|
|
if instanceCfg.IsCluster {
|
|
isActuallyCluster, checkErr := client.IsClusterMember(ctx)
|
|
if checkErr == nil && !isActuallyCluster {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Msg("Instance marked as cluster but is actually standalone - consider updating configuration")
|
|
instanceCfg.IsCluster = false
|
|
}
|
|
}
|
|
|
|
// Use optimized parallel polling for better performance
|
|
if instanceCfg.MonitorVMs {
|
|
m.pollVMsWithNodes(ctx, instanceName, client, nodes, nodeEffectiveStatus)
|
|
}
|
|
if instanceCfg.MonitorContainers {
|
|
m.pollContainersWithNodes(ctx, instanceName, client, nodes, nodeEffectiveStatus)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Poll storage if enabled
|
|
if instanceCfg.MonitorStorage {
|
|
select {
|
|
case <-ctx.Done():
|
|
pollErr = ctx.Err()
|
|
return
|
|
default:
|
|
m.pollStorageWithNodes(ctx, instanceName, client, nodes)
|
|
}
|
|
}
|
|
|
|
// Poll backups if enabled - respect configured interval or cycle gating
|
|
if instanceCfg.MonitorBackups {
|
|
if !m.config.EnableBackupPolling {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Msg("Skipping backup polling - globally disabled")
|
|
} else {
|
|
now := time.Now()
|
|
|
|
m.mu.RLock()
|
|
lastPoll := m.lastPVEBackupPoll[instanceName]
|
|
m.mu.RUnlock()
|
|
|
|
shouldPoll, reason, newLast := m.shouldRunBackupPoll(lastPoll, now)
|
|
if !shouldPoll {
|
|
if reason != "" {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("reason", reason).
|
|
Msg("Skipping PVE backup polling this cycle")
|
|
}
|
|
} else {
|
|
select {
|
|
case <-ctx.Done():
|
|
pollErr = ctx.Err()
|
|
return
|
|
default:
|
|
// Set initial timestamp before starting goroutine (prevents concurrent starts)
|
|
m.mu.Lock()
|
|
m.lastPVEBackupPoll[instanceName] = newLast
|
|
m.mu.Unlock()
|
|
|
|
// Run backup polling in a separate goroutine to avoid blocking real-time stats
|
|
go func(startTime time.Time, inst string, pveClient PVEClientInterface) {
|
|
timeout := m.calculateBackupOperationTimeout(inst)
|
|
log.Info().
|
|
Str("instance", inst).
|
|
Dur("timeout", timeout).
|
|
Msg("Starting background backup/snapshot polling")
|
|
|
|
// The per-cycle ctx is canceled as soon as the main polling loop finishes,
|
|
// so derive the backup poll context from the long-lived runtime context instead.
|
|
parentCtx := m.runtimeCtx
|
|
if parentCtx == nil {
|
|
parentCtx = context.Background()
|
|
}
|
|
|
|
backupCtx, cancel := context.WithTimeout(parentCtx, timeout)
|
|
defer cancel()
|
|
|
|
// Poll backup tasks
|
|
m.pollBackupTasks(backupCtx, inst, pveClient)
|
|
|
|
// Poll storage backups - pass nodes to avoid duplicate API calls
|
|
m.pollStorageBackupsWithNodes(backupCtx, inst, pveClient, nodes, nodeEffectiveStatus)
|
|
|
|
// Poll guest snapshots
|
|
m.pollGuestSnapshots(backupCtx, inst, pveClient)
|
|
|
|
duration := time.Since(startTime)
|
|
log.Info().
|
|
Str("instance", inst).
|
|
Dur("duration", duration).
|
|
Msg("Completed background backup/snapshot polling")
|
|
|
|
// Update timestamp after completion for accurate interval scheduling
|
|
m.mu.Lock()
|
|
m.lastPVEBackupPoll[inst] = time.Now()
|
|
m.mu.Unlock()
|
|
}(now, instanceName, client)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
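
// Illustrative sketch (hypothetical helper, not called anywhere in this file):
// the connection-health classification pollPVEInstance derives from a
// ClusterClient's per-endpoint health map. Every endpoint down reports
// "error", a partial outage reports "degraded", and a fully reachable cluster
// reports "healthy"; regular (non-cluster) clients skip this and are simply
// marked healthy once GetNodes succeeds.
func exampleClassifyClusterHealth(healthStatus map[string]bool) string {
	healthyCount := 0
	for _, isHealthy := range healthStatus {
		if isHealthy {
			healthyCount++
		}
	}
	switch {
	case healthyCount == 0:
		return "error"
	case healthyCount < len(healthStatus):
		return "degraded"
	default:
		return "healthy"
	}
}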
|
|
// pollVMsAndContainersEfficient uses the cluster/resources endpoint to get all VMs and containers in one call
|
|
// This works on both clustered and standalone nodes for efficient polling
|
|
func (m *Monitor) pollVMsAndContainersEfficient(ctx context.Context, instanceName string, client PVEClientInterface, nodeEffectiveStatus map[string]string) bool {
|
|
log.Info().Str("instance", instanceName).Msg("Polling VMs and containers using efficient cluster/resources endpoint")
|
|
|
|
// Get all resources in a single API call
|
|
resources, err := client.GetClusterResources(ctx, "vm")
|
|
if err != nil {
|
|
log.Debug().Err(err).Str("instance", instanceName).Msg("cluster/resources not available, falling back to traditional polling")
|
|
return false
|
|
}
|
|
|
|
var allVMs []models.VM
|
|
var allContainers []models.Container
|
|
|
|
for _, res := range resources {
|
|
// Avoid duplicating node name in ID when instance name equals node name
|
|
var guestID string
|
|
if instanceName == res.Node {
|
|
guestID = fmt.Sprintf("%s-%d", res.Node, res.VMID)
|
|
} else {
|
|
guestID = fmt.Sprintf("%s-%s-%d", instanceName, res.Node, res.VMID)
|
|
}
|
|
|
|
// Debug log the resource type
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("name", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Str("type", res.Type).
|
|
Msg("Processing cluster resource")
|
|
|
|
// Initialize I/O metrics from cluster resources (may be 0 for VMs)
|
|
diskReadBytes := int64(res.DiskRead)
|
|
diskWriteBytes := int64(res.DiskWrite)
|
|
networkInBytes := int64(res.NetIn)
|
|
networkOutBytes := int64(res.NetOut)
|
|
var individualDisks []models.Disk // Store individual filesystems for multi-disk monitoring
|
|
var ipAddresses []string
|
|
var networkInterfaces []models.GuestNetworkInterface
|
|
var osName, osVersion, agentVersion string
|
|
|
|
if res.Type == "qemu" {
|
|
// Skip templates if configured
|
|
if res.Template == 1 {
|
|
continue
|
|
}
|
|
|
|
memTotal := res.MaxMem
|
|
memUsed := res.Mem
|
|
memorySource := "cluster-resources"
|
|
guestRaw := VMMemoryRaw{
|
|
ListingMem: res.Mem,
|
|
ListingMaxMem: res.MaxMem,
|
|
}
|
|
var detailedStatus *proxmox.VMStatus
|
|
|
|
// Try to get actual disk usage from guest agent if VM is running
|
|
diskUsed := res.Disk
|
|
diskTotal := res.MaxDisk
|
|
diskFree := diskTotal - diskUsed
|
|
diskUsage := safePercentage(float64(diskUsed), float64(diskTotal))
|
|
|
|
// If a VM shows 0 disk usage but has an allocated disk, it's likely a guest agent issue
|
|
// Set to -1 to indicate "unknown" rather than showing misleading 0%
|
|
if res.Type == "qemu" && diskUsed == 0 && diskTotal > 0 && res.Status == "running" {
|
|
diskUsage = -1
|
|
}
|
|
|
|
// For running VMs, always try to get filesystem info from guest agent
|
|
// The cluster/resources endpoint often returns 0 or incorrect values for disk usage
|
|
// We should prefer guest agent data when available for accurate metrics
|
|
if res.Status == "running" && res.Type == "qemu" {
|
|
// First check if agent is enabled by getting VM status
|
|
status, err := client.GetVMStatus(ctx, res.Node, res.VMID)
|
|
if err != nil {
|
|
log.Debug().
|
|
Err(err).
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Msg("Could not get VM status to check guest agent availability")
|
|
} else if status != nil {
|
|
detailedStatus = status
|
|
guestRaw.StatusMaxMem = detailedStatus.MaxMem
|
|
guestRaw.StatusMem = detailedStatus.Mem
|
|
guestRaw.StatusFreeMem = detailedStatus.FreeMem
|
|
guestRaw.Balloon = detailedStatus.Balloon
|
|
guestRaw.BalloonMin = detailedStatus.BalloonMin
|
|
guestRaw.Agent = detailedStatus.Agent.Value
|
|
memAvailable := uint64(0)
|
|
if detailedStatus.MemInfo != nil {
|
|
guestRaw.MemInfoUsed = detailedStatus.MemInfo.Used
|
|
guestRaw.MemInfoFree = detailedStatus.MemInfo.Free
|
|
guestRaw.MemInfoTotal = detailedStatus.MemInfo.Total
|
|
guestRaw.MemInfoAvailable = detailedStatus.MemInfo.Available
|
|
guestRaw.MemInfoBuffers = detailedStatus.MemInfo.Buffers
|
|
guestRaw.MemInfoCached = detailedStatus.MemInfo.Cached
|
|
guestRaw.MemInfoShared = detailedStatus.MemInfo.Shared
|
|
|
|
switch {
|
|
case detailedStatus.MemInfo.Available > 0:
|
|
memAvailable = detailedStatus.MemInfo.Available
|
|
memorySource = "meminfo-available"
|
|
case detailedStatus.MemInfo.Free > 0 ||
|
|
detailedStatus.MemInfo.Buffers > 0 ||
|
|
detailedStatus.MemInfo.Cached > 0:
|
|
memAvailable = detailedStatus.MemInfo.Free +
|
|
detailedStatus.MemInfo.Buffers +
|
|
detailedStatus.MemInfo.Cached
|
|
memorySource = "meminfo-derived"
|
|
}
|
|
}
|
|
|
|
// Use actual disk I/O values from detailed status
|
|
diskReadBytes = int64(detailedStatus.DiskRead)
|
|
diskWriteBytes = int64(detailedStatus.DiskWrite)
|
|
networkInBytes = int64(detailedStatus.NetIn)
|
|
networkOutBytes = int64(detailedStatus.NetOut)
|
|
|
|
if detailedStatus.Balloon > 0 && detailedStatus.Balloon < detailedStatus.MaxMem {
|
|
memTotal = detailedStatus.Balloon
|
|
guestRaw.DerivedFromBall = true
|
|
} else if detailedStatus.MaxMem > 0 {
|
|
memTotal = detailedStatus.MaxMem
|
|
guestRaw.DerivedFromBall = false
|
|
}
|
|
|
|
switch {
|
|
case memAvailable > 0:
|
|
if memAvailable > memTotal {
|
|
memAvailable = memTotal
|
|
}
|
|
memUsed = memTotal - memAvailable
|
|
case detailedStatus.FreeMem > 0 && memTotal >= detailedStatus.FreeMem:
|
|
memUsed = memTotal - detailedStatus.FreeMem
|
|
memorySource = "status-freemem"
|
|
case detailedStatus.Mem > 0:
|
|
memUsed = detailedStatus.Mem
|
|
memorySource = "status-mem"
|
|
}
|
|
if memUsed > memTotal {
|
|
memUsed = memTotal
|
|
}
|
|
|
|
// Gather guest metadata from the agent when available
|
|
guestIPs, guestIfaces, guestOSName, guestOSVersion, guestAgentVersion := m.fetchGuestAgentMetadata(ctx, client, instanceName, res.Node, res.Name, res.VMID, detailedStatus)
|
|
if len(guestIPs) > 0 {
|
|
ipAddresses = guestIPs
|
|
}
|
|
if len(guestIfaces) > 0 {
|
|
networkInterfaces = guestIfaces
|
|
}
|
|
if guestOSName != "" {
|
|
osName = guestOSName
|
|
}
|
|
if guestOSVersion != "" {
|
|
osVersion = guestOSVersion
|
|
}
|
|
if guestAgentVersion != "" {
|
|
agentVersion = guestAgentVersion
|
|
}
|
|
|
|
// Always try to get filesystem info if agent is enabled
|
|
// Prefer guest agent data over cluster/resources data for accuracy
|
|
if detailedStatus.Agent.Value > 0 {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Int("agent", detailedStatus.Agent.Value).
|
|
Uint64("current_disk", diskUsed).
|
|
Uint64("current_maxdisk", diskTotal).
|
|
Msg("Guest agent enabled, querying filesystem info for accurate disk usage")
|
|
|
|
// Use retry logic for guest agent calls to handle transient timeouts (refs #630)
|
|
fsInfoRaw, err := m.retryGuestAgentCall(ctx, m.guestAgentFSInfoTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
|
|
return client.GetVMFSInfo(ctx, res.Node, res.VMID)
|
|
})
|
|
var fsInfo []proxmox.VMFileSystem
|
|
if err == nil {
|
|
if fs, ok := fsInfoRaw.([]proxmox.VMFileSystem); ok {
|
|
fsInfo = fs
|
|
}
|
|
}
|
|
if err != nil {
|
|
// Log more helpful error messages based on the error type
|
|
errMsg := err.Error()
|
|
if strings.Contains(errMsg, "500") || strings.Contains(errMsg, "QEMU guest agent is not running") {
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Msg("Guest agent enabled in VM config but not running inside guest OS. Install and start qemu-guest-agent in the VM")
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Msg("To verify: ssh into VM and run 'systemctl status qemu-guest-agent' or 'ps aux | grep qemu-ga'")
|
|
} else if strings.Contains(errMsg, "timeout") {
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Msg("Guest agent timeout - agent may be installed but not responding")
|
|
} else if strings.Contains(errMsg, "403") || strings.Contains(errMsg, "401") || strings.Contains(errMsg, "authentication error") {
|
|
// Permission error - user/token lacks required permissions
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Msg("VM disk monitoring permission denied. Check permissions:")
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Msg("• Proxmox 9: Ensure token/user has VM.GuestAgent.Audit privilege (Pulse setup adds this via PulseMonitor role)")
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Msg("• Proxmox 8: Ensure token/user has VM.Monitor privilege (Pulse setup adds this via PulseMonitor role)")
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Msg("• All versions: Sys.Audit is recommended for Ceph metrics and applied when available")
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Msg("• Re-run Pulse setup script if node was added before v4.7")
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Msg("• Verify guest agent is installed and running inside the VM")
|
|
} else {
|
|
log.Debug().
|
|
Err(err).
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Msg("Failed to get filesystem info from guest agent")
|
|
}
|
|
} else if len(fsInfo) == 0 {
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Msg("Guest agent returned no filesystem info - agent may need restart or VM may have no mounted filesystems")
|
|
} else {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("filesystems", len(fsInfo)).
|
|
Msg("Got filesystem info from guest agent")
|
|
|
|
// Aggregate disk usage from all filesystems AND preserve individual disk data
|
|
var totalBytes, usedBytes uint64
|
|
var skippedFS []string
|
|
var includedFS []string
|
|
|
|
// Log all filesystems received for debugging
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Int("filesystem_count", len(fsInfo)).
|
|
Msg("Processing filesystems from guest agent")
|
|
|
|
for _, fs := range fsInfo {
|
|
// Skip special filesystems and mounts
|
|
skipReasons := []string{}
|
|
reasonReadOnly := ""
|
|
shouldSkip := false
|
|
|
|
// Check filesystem type
|
|
fsTypeLower := strings.ToLower(fs.Type)
|
|
if reason, skip := readOnlyFilesystemReason(fs.Type, fs.TotalBytes, fs.UsedBytes); skip {
|
|
skipReasons = append(skipReasons, fmt.Sprintf("read-only-%s", reason))
|
|
reasonReadOnly = reason
|
|
shouldSkip = true
|
|
}
|
|
if fs.Type == "tmpfs" || fs.Type == "devtmpfs" ||
|
|
fs.Type == "cgroup" || fs.Type == "cgroup2" ||
|
|
fs.Type == "sysfs" || fs.Type == "proc" ||
|
|
fs.Type == "devpts" || fs.Type == "securityfs" ||
|
|
fs.Type == "debugfs" || fs.Type == "tracefs" ||
|
|
fs.Type == "fusectl" || fs.Type == "configfs" ||
|
|
fs.Type == "pstore" || fs.Type == "hugetlbfs" ||
|
|
fs.Type == "mqueue" || fs.Type == "bpf" ||
|
|
strings.Contains(fsTypeLower, "fuse") || // Skip FUSE mounts (often network/special)
|
|
strings.Contains(fsTypeLower, "9p") || // Skip 9p mounts (VM shared folders)
|
|
strings.Contains(fsTypeLower, "nfs") || // Skip NFS mounts
|
|
strings.Contains(fsTypeLower, "cifs") || // Skip CIFS/SMB mounts
|
|
strings.Contains(fsTypeLower, "smb") { // Skip SMB mounts
|
|
skipReasons = append(skipReasons, "special-fs-type")
|
|
shouldSkip = true
|
|
}
|
|
|
|
// Check mountpoint patterns
|
|
if strings.HasPrefix(fs.Mountpoint, "/dev") ||
|
|
strings.HasPrefix(fs.Mountpoint, "/proc") ||
|
|
strings.HasPrefix(fs.Mountpoint, "/sys") ||
|
|
strings.HasPrefix(fs.Mountpoint, "/run") ||
|
|
strings.HasPrefix(fs.Mountpoint, "/var/lib/docker") || // Skip Docker volumes
|
|
strings.HasPrefix(fs.Mountpoint, "/snap") || // Skip snap mounts
|
|
fs.Mountpoint == "/boot/efi" ||
|
|
fs.Mountpoint == "System Reserved" || // Windows System Reserved partition
|
|
strings.Contains(fs.Mountpoint, "System Reserved") { // Various Windows reserved formats
|
|
skipReasons = append(skipReasons, "special-mountpoint")
|
|
shouldSkip = true
|
|
}
|
|
|
|
if shouldSkip {
|
|
if reasonReadOnly != "" {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Str("mountpoint", fs.Mountpoint).
|
|
Str("type", fs.Type).
|
|
Float64("total_gb", float64(fs.TotalBytes)/1073741824).
|
|
Float64("used_gb", float64(fs.UsedBytes)/1073741824).
|
|
Msg("Skipping read-only filesystem from disk aggregation")
|
|
}
|
|
skippedFS = append(skippedFS, fmt.Sprintf("%s(%s,%s)",
|
|
fs.Mountpoint, fs.Type, strings.Join(skipReasons, ",")))
|
|
continue
|
|
}
|
|
|
|
// Only count real filesystems with valid data
|
|
// Some filesystems report 0 bytes (like unformatted or system partitions)
|
|
if fs.TotalBytes > 0 {
|
|
totalBytes += fs.TotalBytes
|
|
usedBytes += fs.UsedBytes
|
|
includedFS = append(includedFS, fmt.Sprintf("%s(%s,%.1fGB)",
|
|
fs.Mountpoint, fs.Type, float64(fs.TotalBytes)/1073741824))
|
|
|
|
// Add to individual disks array
|
|
individualDisks = append(individualDisks, models.Disk{
|
|
Total: int64(fs.TotalBytes),
|
|
Used: int64(fs.UsedBytes),
|
|
Free: int64(fs.TotalBytes - fs.UsedBytes),
|
|
Usage: safePercentage(float64(fs.UsedBytes), float64(fs.TotalBytes)),
|
|
Mountpoint: fs.Mountpoint,
|
|
Type: fs.Type,
|
|
Device: fs.Disk,
|
|
})
|
|
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Str("mountpoint", fs.Mountpoint).
|
|
Str("type", fs.Type).
|
|
Uint64("total", fs.TotalBytes).
|
|
Uint64("used", fs.UsedBytes).
|
|
Float64("total_gb", float64(fs.TotalBytes)/1073741824).
|
|
Float64("used_gb", float64(fs.UsedBytes)/1073741824).
|
|
Msg("Including filesystem in disk usage calculation")
|
|
} else if fs.TotalBytes == 0 && len(fs.Mountpoint) > 0 {
|
|
skippedFS = append(skippedFS, fmt.Sprintf("%s(%s,0GB)", fs.Mountpoint, fs.Type))
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Str("mountpoint", fs.Mountpoint).
|
|
Str("type", fs.Type).
|
|
Msg("Skipping filesystem with zero total bytes")
|
|
}
|
|
}
|
|
|
|
if len(skippedFS) > 0 {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Strs("skipped", skippedFS).
|
|
Msg("Skipped special filesystems")
|
|
}
|
|
|
|
if len(includedFS) > 0 {
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Strs("included", includedFS).
|
|
Msg("Filesystems included in disk calculation")
|
|
}
|
|
|
|
// If we got valid data from guest agent, use it
|
|
if totalBytes > 0 {
|
|
// Sanity check: if the reported disk is way larger than allocated disk,
|
|
// we might be getting host disk info somehow
|
|
allocatedDiskGB := float64(res.MaxDisk) / 1073741824
|
|
reportedDiskGB := float64(totalBytes) / 1073741824
|
|
|
|
// If reported disk is more than 2x the allocated disk, log a warning
|
|
// This could indicate we're getting host disk or network shares
|
|
if allocatedDiskGB > 0 && reportedDiskGB > allocatedDiskGB*2 {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Float64("allocated_gb", allocatedDiskGB).
|
|
Float64("reported_gb", reportedDiskGB).
|
|
Float64("ratio", reportedDiskGB/allocatedDiskGB).
|
|
Strs("filesystems", includedFS).
|
|
Msg("VM reports disk usage significantly larger than allocated disk - possible issue with filesystem detection")
|
|
}
|
|
|
|
diskTotal = totalBytes
|
|
diskUsed = usedBytes
|
|
diskFree = totalBytes - usedBytes
|
|
diskUsage = safePercentage(float64(usedBytes), float64(totalBytes))
|
|
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Uint64("totalBytes", totalBytes).
|
|
Uint64("usedBytes", usedBytes).
|
|
Float64("total_gb", float64(totalBytes)/1073741824).
|
|
Float64("used_gb", float64(usedBytes)/1073741824).
|
|
Float64("allocated_gb", allocatedDiskGB).
|
|
Float64("usage", diskUsage).
|
|
Uint64("old_disk", res.Disk).
|
|
Uint64("old_maxdisk", res.MaxDisk).
|
|
Msg("Using guest agent data for accurate disk usage (replacing cluster/resources data)")
|
|
} else {
|
|
// Only special filesystems found - show allocated disk size instead
|
|
if diskTotal > 0 {
|
|
diskUsage = -1 // Show as allocated size
|
|
}
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("filesystems_found", len(fsInfo)).
|
|
Msg("Guest agent provided filesystem info but no usable filesystems found (all were special mounts)")
|
|
}
|
|
}
|
|
} else {
|
|
// Agent disabled - show allocated disk size
|
|
if diskTotal > 0 {
|
|
diskUsage = -1 // Show as allocated size
|
|
}
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Int("agent", detailedStatus.Agent.Value).
|
|
Msg("VM does not have guest agent enabled in config")
|
|
}
|
|
} else {
|
|
// No vmStatus available - keep cluster/resources data
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("vm", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Msg("Could not get VM status, using cluster/resources disk data")
|
|
}
|
|
}
|
|
|
|
if res.Status != "running" {
|
|
memorySource = "powered-off"
|
|
memUsed = 0
|
|
}
|
|
|
|
memFree := uint64(0)
|
|
if memTotal >= memUsed {
|
|
memFree = memTotal - memUsed
|
|
}
|
|
|
|
sampleTime := time.Now()
|
|
currentMetrics := IOMetrics{
|
|
DiskRead: diskReadBytes,
|
|
DiskWrite: diskWriteBytes,
|
|
NetworkIn: networkInBytes,
|
|
NetworkOut: networkOutBytes,
|
|
Timestamp: sampleTime,
|
|
}
|
|
diskReadRate, diskWriteRate, netInRate, netOutRate := m.rateTracker.CalculateRates(guestID, currentMetrics)
|
|
|
|
memoryUsage := safePercentage(float64(memUsed), float64(memTotal))
|
|
memory := models.Memory{
|
|
Total: int64(memTotal),
|
|
Used: int64(memUsed),
|
|
Free: int64(memFree),
|
|
Usage: memoryUsage,
|
|
}
|
|
if memory.Free < 0 {
|
|
memory.Free = 0
|
|
}
|
|
if memory.Used > memory.Total {
|
|
memory.Used = memory.Total
|
|
}
|
|
if detailedStatus != nil && detailedStatus.Balloon > 0 {
|
|
memory.Balloon = int64(detailedStatus.Balloon)
|
|
}
|
|
|
|
vm := models.VM{
|
|
ID: guestID,
|
|
VMID: res.VMID,
|
|
Name: res.Name,
|
|
Node: res.Node,
|
|
Instance: instanceName,
|
|
Status: res.Status,
|
|
Type: "qemu",
|
|
CPU: safeFloat(res.CPU),
|
|
CPUs: res.MaxCPU,
|
|
Memory: memory,
|
|
Disk: models.Disk{
|
|
Total: int64(diskTotal),
|
|
Used: int64(diskUsed),
|
|
Free: int64(diskFree),
|
|
Usage: diskUsage,
|
|
},
|
|
Disks: individualDisks, // Individual filesystem data
|
|
IPAddresses: ipAddresses,
|
|
OSName: osName,
|
|
OSVersion: osVersion,
|
|
AgentVersion: agentVersion,
|
|
NetworkInterfaces: networkInterfaces,
|
|
NetworkIn: maxInt64(0, int64(netInRate)),
|
|
NetworkOut: maxInt64(0, int64(netOutRate)),
|
|
DiskRead: maxInt64(0, int64(diskReadRate)),
|
|
DiskWrite: maxInt64(0, int64(diskWriteRate)),
|
|
Uptime: int64(res.Uptime),
|
|
Template: res.Template == 1,
|
|
LastSeen: sampleTime,
|
|
}
|
|
|
|
// Parse tags
|
|
if res.Tags != "" {
|
|
vm.Tags = strings.Split(res.Tags, ";")
|
|
|
|
// Log if Pulse-specific tags are detected
|
|
for _, tag := range vm.Tags {
|
|
switch tag {
|
|
case "pulse-no-alerts", "pulse-monitor-only", "pulse-relaxed":
|
|
log.Info().
|
|
Str("vm", vm.Name).
|
|
Str("node", vm.Node).
|
|
Str("tag", tag).
|
|
Msg("Pulse control tag detected on VM")
|
|
}
|
|
}
|
|
}
|
|
|
|
allVMs = append(allVMs, vm)
|
|
|
|
m.recordGuestSnapshot(instanceName, vm.Type, res.Node, res.VMID, GuestMemorySnapshot{
|
|
Name: vm.Name,
|
|
Status: vm.Status,
|
|
RetrievedAt: sampleTime,
|
|
MemorySource: memorySource,
|
|
Memory: vm.Memory,
|
|
Raw: guestRaw,
|
|
})
|
|
|
|
// For non-running VMs, zero out resource usage metrics to prevent false alerts
|
|
// Proxmox may report stale or residual metrics for stopped VMs
|
|
if vm.Status != "running" {
|
|
log.Debug().
|
|
Str("vm", vm.Name).
|
|
Str("status", vm.Status).
|
|
Float64("originalCpu", vm.CPU).
|
|
Float64("originalMemUsage", vm.Memory.Usage).
|
|
Msg("Non-running VM detected - zeroing metrics")
|
|
|
|
// Zero out all usage metrics for stopped/paused/suspended VMs
|
|
vm.CPU = 0
|
|
vm.Memory.Usage = 0
|
|
vm.Disk.Usage = 0
|
|
vm.NetworkIn = 0
|
|
vm.NetworkOut = 0
|
|
vm.DiskRead = 0
|
|
vm.DiskWrite = 0
|
|
}
|
|
|
|
// Check thresholds for alerts
|
|
m.alertManager.CheckGuest(vm, instanceName)
|
|
|
|
} else if res.Type == "lxc" {
|
|
// Skip templates if configured
|
|
if res.Template == 1 {
|
|
continue
|
|
}
|
|
|
|
// Calculate I/O rates for container
|
|
sampleTime := time.Now()
|
|
currentMetrics := IOMetrics{
|
|
DiskRead: int64(res.DiskRead),
|
|
DiskWrite: int64(res.DiskWrite),
|
|
NetworkIn: int64(res.NetIn),
|
|
NetworkOut: int64(res.NetOut),
|
|
Timestamp: sampleTime,
|
|
}
|
|
diskReadRate, diskWriteRate, netInRate, netOutRate := m.rateTracker.CalculateRates(guestID, currentMetrics)
|
|
|
|
// Calculate cache-aware memory for LXC containers
|
|
// The cluster resources API returns mem from cgroup which includes cache/buffers (inflated).
|
|
// Try to get more accurate memory metrics from RRD data.
|
|
memTotal := res.MaxMem
|
|
memUsed := res.Mem
|
|
memorySource := "cluster-resources"
|
|
guestRaw := VMMemoryRaw{
|
|
ListingMem: res.Mem,
|
|
ListingMaxMem: res.MaxMem,
|
|
}
|
|
|
|
// For running containers, try to get RRD data for cache-aware memory calculation
|
|
if res.Status == "running" {
|
|
rrdCtx, rrdCancel := context.WithTimeout(ctx, 5*time.Second)
|
|
rrdPoints, err := client.GetLXCRRDData(rrdCtx, res.Node, res.VMID, "hour", "AVERAGE", []string{"memavailable", "memused", "maxmem"})
|
|
rrdCancel()
|
|
|
|
if err == nil && len(rrdPoints) > 0 {
|
|
// Use the most recent RRD point
|
|
point := rrdPoints[len(rrdPoints)-1]
|
|
|
|
if point.MaxMem != nil && *point.MaxMem > 0 {
|
|
guestRaw.StatusMaxMem = uint64(*point.MaxMem)
|
|
}
|
|
|
|
// Prefer memavailable-based calculation (excludes cache/buffers)
|
|
if point.MemAvailable != nil && *point.MemAvailable > 0 {
|
|
memAvailable := uint64(*point.MemAvailable)
|
|
if memAvailable <= memTotal {
|
|
memUsed = memTotal - memAvailable
|
|
memorySource = "rrd-memavailable"
|
|
guestRaw.MemInfoAvailable = memAvailable
|
|
log.Debug().
|
|
Str("container", res.Name).
|
|
Str("node", res.Node).
|
|
Uint64("total", memTotal).
|
|
Uint64("available", memAvailable).
|
|
Uint64("used", memUsed).
|
|
Float64("usage", safePercentage(float64(memUsed), float64(memTotal))).
|
|
Msg("LXC memory: using RRD memavailable (excludes reclaimable cache)")
|
|
}
|
|
} else if point.MemUsed != nil && *point.MemUsed > 0 {
|
|
// Fall back to memused from RRD if available
|
|
memUsed = uint64(*point.MemUsed)
|
|
if memUsed <= memTotal {
|
|
memorySource = "rrd-memused"
|
|
guestRaw.MemInfoUsed = memUsed
|
|
log.Debug().
|
|
Str("container", res.Name).
|
|
Str("node", res.Node).
|
|
Uint64("total", memTotal).
|
|
Uint64("used", memUsed).
|
|
Float64("usage", safePercentage(float64(memUsed), float64(memTotal))).
|
|
Msg("LXC memory: using RRD memused (excludes reclaimable cache)")
|
|
}
|
|
}
|
|
} else if err != nil {
|
|
log.Debug().
|
|
Err(err).
|
|
Str("instance", instanceName).
|
|
Str("container", res.Name).
|
|
Int("vmid", res.VMID).
|
|
Msg("RRD memory data unavailable for LXC, using cluster resources value")
|
|
}
|
|
}
|
|
|
|
container := models.Container{
|
|
ID: guestID,
|
|
VMID: res.VMID,
|
|
Name: res.Name,
|
|
Node: res.Node,
|
|
Instance: instanceName,
|
|
Status: res.Status,
|
|
Type: "lxc",
|
|
CPU: safeFloat(res.CPU),
|
|
CPUs: int(res.MaxCPU),
|
|
Memory: models.Memory{
|
|
Total: int64(memTotal),
|
|
Used: int64(memUsed),
|
|
Free: int64(memTotal - memUsed),
|
|
Usage: safePercentage(float64(memUsed), float64(memTotal)),
|
|
},
|
|
Disk: models.Disk{
|
|
Total: int64(res.MaxDisk),
|
|
Used: int64(res.Disk),
|
|
Free: int64(res.MaxDisk - res.Disk),
|
|
Usage: safePercentage(float64(res.Disk), float64(res.MaxDisk)),
|
|
},
|
|
NetworkIn: maxInt64(0, int64(netInRate)),
|
|
NetworkOut: maxInt64(0, int64(netOutRate)),
|
|
DiskRead: maxInt64(0, int64(diskReadRate)),
|
|
DiskWrite: maxInt64(0, int64(diskWriteRate)),
|
|
Uptime: int64(res.Uptime),
|
|
Template: res.Template == 1,
|
|
LastSeen: time.Now(),
|
|
}
|
|
|
|
// Parse tags
|
|
if res.Tags != "" {
|
|
container.Tags = strings.Split(res.Tags, ";")
|
|
|
|
// Log if Pulse-specific tags are detected
|
|
for _, tag := range container.Tags {
|
|
switch tag {
|
|
case "pulse-no-alerts", "pulse-monitor-only", "pulse-relaxed":
|
|
log.Info().
|
|
Str("container", container.Name).
|
|
Str("node", container.Node).
|
|
Str("tag", tag).
|
|
Msg("Pulse control tag detected on container")
|
|
}
|
|
}
|
|
}
|
|
|
|
m.enrichContainerMetadata(ctx, client, instanceName, res.Node, &container)
|
|
|
|
allContainers = append(allContainers, container)
|
|
|
|
m.recordGuestSnapshot(instanceName, container.Type, res.Node, res.VMID, GuestMemorySnapshot{
|
|
Name: container.Name,
|
|
Status: container.Status,
|
|
RetrievedAt: sampleTime,
|
|
MemorySource: memorySource,
|
|
Memory: container.Memory,
|
|
Raw: guestRaw,
|
|
})
|
|
|
|
// For non-running containers, zero out resource usage metrics to prevent false alerts
|
|
// Proxmox may report stale or residual metrics for stopped containers
|
|
if container.Status != "running" {
|
|
log.Debug().
|
|
Str("container", container.Name).
|
|
Str("status", container.Status).
|
|
Float64("originalCpu", container.CPU).
|
|
Float64("originalMemUsage", container.Memory.Usage).
|
|
Msg("Non-running container detected - zeroing metrics")
|
|
|
|
// Zero out all usage metrics for stopped/paused containers
|
|
container.CPU = 0
|
|
container.Memory.Usage = 0
|
|
container.Disk.Usage = 0
|
|
container.NetworkIn = 0
|
|
container.NetworkOut = 0
|
|
container.DiskRead = 0
|
|
container.DiskWrite = 0
|
|
}
|
|
|
|
// Check thresholds for alerts
|
|
m.alertManager.CheckGuest(container, instanceName)
|
|
}
|
|
}
|
|
|
|
// Preserve VMs and containers from nodes within grace period
|
|
// The cluster/resources endpoint doesn't return VMs/containers from nodes Proxmox considers offline,
|
|
// but we want to keep showing them if the node is within grace period
|
|
prevState := m.GetState()
|
|
|
|
// Count previous resources for this instance
|
|
prevVMCount := 0
|
|
prevContainerCount := 0
|
|
for _, vm := range prevState.VMs {
|
|
if vm.Instance == instanceName {
|
|
prevVMCount++
|
|
}
|
|
}
|
|
for _, container := range prevState.Containers {
|
|
if container.Instance == instanceName {
|
|
prevContainerCount++
|
|
}
|
|
}
|
|
|
|
// Build map of which nodes are covered by current resources
|
|
nodesWithResources := make(map[string]bool)
|
|
for _, res := range resources {
|
|
nodesWithResources[res.Node] = true
|
|
}
|
|
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Int("nodesInResources", len(nodesWithResources)).
|
|
Int("totalVMsFromResources", len(allVMs)).
|
|
Int("totalContainersFromResources", len(allContainers)).
|
|
Int("prevVMs", prevVMCount).
|
|
Int("prevContainers", prevContainerCount).
|
|
Msg("Cluster resources received, checking for grace period preservation")
|
|
|
|
// If we got ZERO resources but had resources before, and we have no node data,
|
|
// this likely means the cluster health check failed. Preserve everything.
|
|
if len(allVMs) == 0 && len(allContainers) == 0 &&
|
|
(prevVMCount > 0 || prevContainerCount > 0) &&
|
|
len(nodeEffectiveStatus) == 0 {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Int("prevVMs", prevVMCount).
|
|
Int("prevContainers", prevContainerCount).
|
|
Msg("Cluster returned zero resources but had resources before - likely cluster health issue, preserving all previous resources")
|
|
|
|
// Preserve all previous VMs and containers for this instance
|
|
for _, vm := range prevState.VMs {
|
|
if vm.Instance == instanceName {
|
|
allVMs = append(allVMs, vm)
|
|
}
|
|
}
|
|
for _, container := range prevState.Containers {
|
|
if container.Instance == instanceName {
|
|
allContainers = append(allContainers, container)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check for nodes that are within grace period but not in cluster/resources response
|
|
preservedVMCount := 0
|
|
preservedContainerCount := 0
|
|
for nodeName, effectiveStatus := range nodeEffectiveStatus {
|
|
if effectiveStatus == "online" && !nodesWithResources[nodeName] {
|
|
// This node is within grace period but Proxmox didn't return its resources
|
|
// Preserve previous VMs and containers from this node
|
|
vmsBefore := len(allVMs)
|
|
containersBefore := len(allContainers)
|
|
|
|
// Preserve VMs from this node
|
|
for _, vm := range prevState.VMs {
|
|
if vm.Instance == instanceName && vm.Node == nodeName {
|
|
allVMs = append(allVMs, vm)
|
|
}
|
|
}
|
|
|
|
// Preserve containers from this node
|
|
for _, container := range prevState.Containers {
|
|
if container.Instance == instanceName && container.Node == nodeName {
|
|
allContainers = append(allContainers, container)
|
|
}
|
|
}
|
|
|
|
vmsPreserved := len(allVMs) - vmsBefore
|
|
containersPreserved := len(allContainers) - containersBefore
|
|
preservedVMCount += vmsPreserved
|
|
preservedContainerCount += containersPreserved
|
|
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("node", nodeName).
|
|
Int("vmsPreserved", vmsPreserved).
|
|
Int("containersPreserved", containersPreserved).
|
|
Msg("Preserved VMs/containers from node in grace period")
|
|
}
|
|
}
|
|
|
|
if preservedVMCount > 0 || preservedContainerCount > 0 {
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Int("totalPreservedVMs", preservedVMCount).
|
|
Int("totalPreservedContainers", preservedContainerCount).
|
|
Msg("Grace period preservation complete")
|
|
}
|
|
|
|
// Always update state when using efficient polling path
|
|
// Even if arrays are empty, we need to update to clear out VMs from genuinely offline nodes
|
|
m.state.UpdateVMsForInstance(instanceName, allVMs)
|
|
m.state.UpdateContainersForInstance(instanceName, allContainers)
|
|
|
|
m.pollReplicationStatus(ctx, instanceName, client, allVMs)
|
|
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Int("vms", len(allVMs)).
|
|
Int("containers", len(allContainers)).
|
|
Msg("VMs and containers polled efficiently with cluster/resources")
|
|
|
|
return true
|
|
}
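
// Illustrative sketch (hypothetical helper, not called anywhere in this file):
// the guest ID convention used by pollVMsAndContainersEfficient. When the
// instance name matches the node name the node is not repeated (e.g. "pve1-101");
// otherwise the ID is "<instance>-<node>-<vmid>".
func exampleGuestID(instanceName, node string, vmid int) string {
	if instanceName == node {
		return fmt.Sprintf("%s-%d", node, vmid)
	}
	return fmt.Sprintf("%s-%s-%d", instanceName, node, vmid)
}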
|
|
// pollBackupTasks polls backup tasks from a PVE instance
func (m *Monitor) pollBackupTasks(ctx context.Context, instanceName string, client PVEClientInterface) {
	log.Debug().Str("instance", instanceName).Msg("Polling backup tasks")

	tasks, err := client.GetBackupTasks(ctx)
	if err != nil {
		monErr := errors.WrapAPIError("get_backup_tasks", instanceName, err, 0)
		log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to get backup tasks")
		return
	}

	var backupTasks []models.BackupTask
	for _, task := range tasks {
		// Extract VMID from task ID (format: "UPID:node:pid:starttime:type:vmid:user@realm:")
		vmid := 0
		if task.ID != "" {
			if vmidInt, err := strconv.Atoi(task.ID); err == nil {
				vmid = vmidInt
			}
		}

		taskID := fmt.Sprintf("%s-%s", instanceName, task.UPID)

		backupTask := models.BackupTask{
			ID:        taskID,
			Node:      task.Node,
			Type:      task.Type,
			VMID:      vmid,
			Status:    task.Status,
			StartTime: time.Unix(task.StartTime, 0),
		}

		if task.EndTime > 0 {
			backupTask.EndTime = time.Unix(task.EndTime, 0)
		}

		backupTasks = append(backupTasks, backupTask)
	}

	// Update state with new backup tasks for this instance
	m.state.UpdateBackupTasksForInstance(instanceName, backupTasks)
}
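
// Illustrative sketch (hypothetical helper, not called anywhere in this file):
// splitting a UPID into its colon-separated fields, assuming the field layout
// quoted in pollBackupTasks ("UPID:node:pid:starttime:type:vmid:user@realm:").
// Under that layout the VMID sits at index 5; Atoi fails harmlessly when the
// field does not hold a numeric guest ID.
func exampleVMIDFromUPID(upid string) int {
	parts := strings.Split(upid, ":")
	if len(parts) < 6 {
		return 0
	}
	vmid, err := strconv.Atoi(parts[5])
	if err != nil {
		return 0
	}
	return vmid
}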
|
|
// pollReplicationStatus polls storage replication jobs for a PVE instance.
func (m *Monitor) pollReplicationStatus(ctx context.Context, instanceName string, client PVEClientInterface, vms []models.VM) {
	log.Debug().Str("instance", instanceName).Msg("Polling replication status")

	jobs, err := client.GetReplicationStatus(ctx)
	if err != nil {
		errMsg := err.Error()
		lowerMsg := strings.ToLower(errMsg)
		if strings.Contains(errMsg, "501") || strings.Contains(errMsg, "404") || strings.Contains(lowerMsg, "not implemented") || strings.Contains(lowerMsg, "not supported") {
			log.Debug().
				Str("instance", instanceName).
				Msg("Replication API not available on this Proxmox instance")
			m.state.UpdateReplicationJobsForInstance(instanceName, []models.ReplicationJob{})
			return
		}

		monErr := errors.WrapAPIError("get_replication_status", instanceName, err, 0)
		log.Warn().
			Err(monErr).
			Str("instance", instanceName).
			Msg("Failed to get replication status")
		return
	}

	if len(jobs) == 0 {
		m.state.UpdateReplicationJobsForInstance(instanceName, []models.ReplicationJob{})
		return
	}

	vmByID := make(map[int]models.VM, len(vms))
	for _, vm := range vms {
		vmByID[vm.VMID] = vm
	}

	converted := make([]models.ReplicationJob, 0, len(jobs))
	now := time.Now()

	for idx, job := range jobs {
		guestID := job.GuestID
		if guestID == 0 {
			if parsed, err := strconv.Atoi(strings.TrimSpace(job.Guest)); err == nil {
				guestID = parsed
			}
		}

		guestName := ""
		guestType := ""
		guestNode := ""
		if guestID > 0 {
			if vm, ok := vmByID[guestID]; ok {
				guestName = vm.Name
				guestType = vm.Type
				guestNode = vm.Node
			}
		}
		if guestNode == "" {
			guestNode = strings.TrimSpace(job.Source)
		}

		sourceNode := strings.TrimSpace(job.Source)
		if sourceNode == "" {
			sourceNode = guestNode
		}

		targetNode := strings.TrimSpace(job.Target)

		var lastSyncTime *time.Time
		if job.LastSyncTime != nil && !job.LastSyncTime.IsZero() {
			t := job.LastSyncTime.UTC()
			lastSyncTime = &t
		}

		var nextSyncTime *time.Time
		if job.NextSyncTime != nil && !job.NextSyncTime.IsZero() {
			t := job.NextSyncTime.UTC()
			nextSyncTime = &t
		}

		lastSyncDurationHuman := job.LastSyncDurationHuman
		if lastSyncDurationHuman == "" && job.LastSyncDurationSeconds > 0 {
			lastSyncDurationHuman = formatSeconds(job.LastSyncDurationSeconds)
		}
		durationHuman := job.DurationHuman
		if durationHuman == "" && job.DurationSeconds > 0 {
			durationHuman = formatSeconds(job.DurationSeconds)
		}

		rateLimit := copyFloatPointer(job.RateLimitMbps)

		status := job.Status
		if status == "" {
			status = job.State
		}

		jobID := strings.TrimSpace(job.ID)
		if jobID == "" {
			if job.JobNumber > 0 && guestID > 0 {
				jobID = fmt.Sprintf("%d-%d", guestID, job.JobNumber)
			} else {
				jobID = fmt.Sprintf("job-%s-%d", instanceName, idx)
			}
		}

		uniqueID := fmt.Sprintf("%s-%s", instanceName, jobID)

		converted = append(converted, models.ReplicationJob{
			ID:                      uniqueID,
			Instance:                instanceName,
			JobID:                   jobID,
			JobNumber:               job.JobNumber,
			Guest:                   job.Guest,
			GuestID:                 guestID,
			GuestName:               guestName,
			GuestType:               guestType,
			GuestNode:               guestNode,
			SourceNode:              sourceNode,
			SourceStorage:           job.SourceStorage,
			TargetNode:              targetNode,
			TargetStorage:           job.TargetStorage,
			Schedule:                job.Schedule,
			Type:                    job.Type,
			Enabled:                 job.Enabled,
			State:                   job.State,
			Status:                  status,
			LastSyncStatus:          job.LastSyncStatus,
			LastSyncTime:            lastSyncTime,
			LastSyncUnix:            job.LastSyncUnix,
			LastSyncDurationSeconds: job.LastSyncDurationSeconds,
			LastSyncDurationHuman:   lastSyncDurationHuman,
			NextSyncTime:            nextSyncTime,
			NextSyncUnix:            job.NextSyncUnix,
			DurationSeconds:         job.DurationSeconds,
			DurationHuman:           durationHuman,
			FailCount:               job.FailCount,
			Error:                   job.Error,
			Comment:                 job.Comment,
			RemoveJob:               job.RemoveJob,
			RateLimitMbps:           rateLimit,
			LastPolled:              now,
		})
	}

	m.state.UpdateReplicationJobsForInstance(instanceName, converted)
}
|
|
|
|
func formatSeconds(total int) string {
|
|
if total <= 0 {
|
|
return ""
|
|
}
|
|
hours := total / 3600
|
|
minutes := (total % 3600) / 60
|
|
seconds := total % 60
|
|
return fmt.Sprintf("%02d:%02d:%02d", hours, minutes, seconds)
|
|
}
|
|
|
|
func copyFloatPointer(src *float64) *float64 {
|
|
if src == nil {
|
|
return nil
|
|
}
|
|
val := *src
|
|
return &val
|
|
}
|
|
|
|
// pollPBSInstance polls a single PBS instance
|
|
func (m *Monitor) pollPBSInstance(ctx context.Context, instanceName string, client *pbs.Client) {
|
|
defer recoverFromPanic(fmt.Sprintf("pollPBSInstance-%s", instanceName))
|
|
|
|
start := time.Now()
|
|
debugEnabled := logging.IsLevelEnabled(zerolog.DebugLevel)
|
|
var pollErr error
|
|
if m.pollMetrics != nil {
|
|
m.pollMetrics.IncInFlight("pbs")
|
|
defer m.pollMetrics.DecInFlight("pbs")
|
|
defer func() {
|
|
m.pollMetrics.RecordResult(PollResult{
|
|
InstanceName: instanceName,
|
|
InstanceType: "pbs",
|
|
Success: pollErr == nil,
|
|
Error: pollErr,
|
|
StartTime: start,
|
|
EndTime: time.Now(),
|
|
})
|
|
}()
|
|
}
|
|
if m.stalenessTracker != nil {
|
|
defer func() {
|
|
if pollErr == nil {
|
|
m.stalenessTracker.UpdateSuccess(InstanceTypePBS, instanceName, nil)
|
|
} else {
|
|
m.stalenessTracker.UpdateError(InstanceTypePBS, instanceName)
|
|
}
|
|
}()
|
|
}
|
|
defer m.recordTaskResult(InstanceTypePBS, instanceName, pollErr)
|
|
|
|
// Check if context is cancelled
|
|
select {
|
|
case <-ctx.Done():
|
|
pollErr = ctx.Err()
|
|
if debugEnabled {
|
|
log.Debug().Str("instance", instanceName).Msg("Polling cancelled")
|
|
}
|
|
return
|
|
default:
|
|
}
|
|
|
|
if debugEnabled {
|
|
log.Debug().Str("instance", instanceName).Msg("Polling PBS instance")
|
|
}
|
|
|
|
// Get instance config
|
|
var instanceCfg *config.PBSInstance
|
|
for _, cfg := range m.config.PBSInstances {
|
|
if cfg.Name == instanceName {
|
|
instanceCfg = &cfg
|
|
if debugEnabled {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Bool("monitorDatastores", cfg.MonitorDatastores).
|
|
Msg("Found PBS instance config")
|
|
}
|
|
break
|
|
}
|
|
}
|
|
if instanceCfg == nil {
|
|
log.Error().Str("instance", instanceName).Msg("PBS instance config not found")
|
|
return
|
|
}
|
|
|
|
// Initialize PBS instance with default values
|
|
pbsInst := models.PBSInstance{
|
|
ID: "pbs-" + instanceName,
|
|
Name: instanceName,
|
|
Host: instanceCfg.Host,
|
|
Status: "offline",
|
|
Version: "unknown",
|
|
ConnectionHealth: "unhealthy",
|
|
LastSeen: time.Now(),
|
|
}
|
|
|
|
// Try to get version first
|
|
version, versionErr := client.GetVersion(ctx)
|
|
if versionErr == nil {
|
|
pbsInst.Status = "online"
|
|
pbsInst.Version = version.Version
|
|
pbsInst.ConnectionHealth = "healthy"
|
|
m.resetAuthFailures(instanceName, "pbs")
|
|
m.state.SetConnectionHealth("pbs-"+instanceName, true)
|
|
|
|
if debugEnabled {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("version", version.Version).
|
|
Bool("monitorDatastores", instanceCfg.MonitorDatastores).
|
|
Msg("PBS version retrieved successfully")
|
|
}
|
|
} else {
|
|
if debugEnabled {
|
|
log.Debug().Err(versionErr).Str("instance", instanceName).Msg("Failed to get PBS version, trying fallback")
|
|
}
|
|
|
|
// Use parent context for proper cancellation chain
|
|
ctx2, cancel2 := context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel2()
|
|
_, datastoreErr := client.GetDatastores(ctx2)
|
|
if datastoreErr == nil {
|
|
pbsInst.Status = "online"
|
|
pbsInst.Version = "connected"
|
|
pbsInst.ConnectionHealth = "healthy"
|
|
m.resetAuthFailures(instanceName, "pbs")
|
|
m.state.SetConnectionHealth("pbs-"+instanceName, true)
|
|
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Msg("PBS connected (version unavailable but datastores accessible)")
|
|
} else {
|
|
pbsInst.Status = "offline"
|
|
pbsInst.ConnectionHealth = "error"
|
|
monErr := errors.WrapConnectionError("get_pbs_version", instanceName, versionErr)
|
|
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to connect to PBS")
|
|
m.state.SetConnectionHealth("pbs-"+instanceName, false)
|
|
|
|
if errors.IsAuthError(versionErr) || errors.IsAuthError(datastoreErr) {
|
|
m.recordAuthFailure(instanceName, "pbs")
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// Get node status (CPU, memory, etc.)
|
|
nodeStatus, err := client.GetNodeStatus(ctx)
|
|
if err != nil {
|
|
if debugEnabled {
|
|
log.Debug().Err(err).Str("instance", instanceName).Msg("Could not get PBS node status (may need Sys.Audit permission)")
|
|
}
|
|
} else if nodeStatus != nil {
|
|
pbsInst.CPU = nodeStatus.CPU
|
|
if nodeStatus.Memory.Total > 0 {
|
|
pbsInst.Memory = float64(nodeStatus.Memory.Used) / float64(nodeStatus.Memory.Total) * 100
|
|
pbsInst.MemoryUsed = nodeStatus.Memory.Used
|
|
pbsInst.MemoryTotal = nodeStatus.Memory.Total
|
|
}
|
|
pbsInst.Uptime = nodeStatus.Uptime
|
|
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Float64("cpu", pbsInst.CPU).
|
|
Float64("memory", pbsInst.Memory).
|
|
Int64("uptime", pbsInst.Uptime).
|
|
Msg("PBS node status retrieved")
|
|
}
|
|
|
|
// Poll datastores if enabled
|
|
if instanceCfg.MonitorDatastores {
|
|
datastores, err := client.GetDatastores(ctx)
|
|
if err != nil {
|
|
monErr := errors.WrapAPIError("get_datastores", instanceName, err, 0)
|
|
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to get datastores")
|
|
} else {
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Int("count", len(datastores)).
|
|
Msg("Got PBS datastores")
|
|
|
|
for _, ds := range datastores {
|
|
total := ds.Total
|
|
if total == 0 && ds.TotalSpace > 0 {
|
|
total = ds.TotalSpace
|
|
}
|
|
used := ds.Used
|
|
if used == 0 && ds.UsedSpace > 0 {
|
|
used = ds.UsedSpace
|
|
}
|
|
avail := ds.Avail
|
|
if avail == 0 && ds.AvailSpace > 0 {
|
|
avail = ds.AvailSpace
|
|
}
|
|
if total == 0 && used > 0 && avail > 0 {
|
|
total = used + avail
|
|
}
|
|
|
|
log.Debug().
|
|
Str("store", ds.Store).
|
|
Int64("total", total).
|
|
Int64("used", used).
|
|
Int64("avail", avail).
|
|
Int64("orig_total", ds.Total).
|
|
Int64("orig_total_space", ds.TotalSpace).
|
|
Msg("PBS datastore details")
|
|
|
|
modelDS := models.PBSDatastore{
|
|
Name: ds.Store,
|
|
Total: total,
|
|
Used: used,
|
|
Free: avail,
|
|
Usage: safePercentage(float64(used), float64(total)),
|
|
Status: "available",
|
|
DeduplicationFactor: ds.DeduplicationFactor,
|
|
}
|
|
|
|
namespaces, err := client.ListNamespaces(ctx, ds.Store, "", 0)
|
|
if err != nil {
|
|
log.Warn().Err(err).
|
|
Str("instance", instanceName).
|
|
Str("datastore", ds.Store).
|
|
Msg("Failed to list namespaces")
|
|
} else {
|
|
for _, ns := range namespaces {
|
|
nsPath := ns.NS
|
|
if nsPath == "" {
|
|
nsPath = ns.Path
|
|
}
|
|
if nsPath == "" {
|
|
nsPath = ns.Name
|
|
}
|
|
|
|
modelNS := models.PBSNamespace{
|
|
Path: nsPath,
|
|
Parent: ns.Parent,
|
|
Depth: strings.Count(nsPath, "/"),
|
|
}
|
|
modelDS.Namespaces = append(modelDS.Namespaces, modelNS)
|
|
}
|
|
|
|
hasRoot := false
|
|
for _, ns := range modelDS.Namespaces {
|
|
if ns.Path == "" {
|
|
hasRoot = true
|
|
break
|
|
}
|
|
}
|
|
if !hasRoot {
|
|
modelDS.Namespaces = append([]models.PBSNamespace{{Path: "", Depth: 0}}, modelDS.Namespaces...)
|
|
}
|
|
}
|
|
|
|
pbsInst.Datastores = append(pbsInst.Datastores, modelDS)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update state and run alerts
|
|
m.state.UpdatePBSInstance(pbsInst)
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("id", pbsInst.ID).
|
|
Int("datastores", len(pbsInst.Datastores)).
|
|
Msg("PBS instance updated in state")
|
|
|
|
if m.alertManager != nil {
|
|
m.alertManager.CheckPBS(pbsInst)
|
|
}
|
|
|
|
// Poll backups if enabled
|
|
if instanceCfg.MonitorBackups {
|
|
if len(pbsInst.Datastores) == 0 {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Msg("No PBS datastores available for backup polling")
|
|
} else if !m.config.EnableBackupPolling {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Msg("Skipping PBS backup polling - globally disabled")
|
|
} else {
|
|
now := time.Now()
|
|
|
|
m.mu.Lock()
|
|
lastPoll := m.lastPBSBackupPoll[instanceName]
|
|
if m.pbsBackupPollers == nil {
|
|
m.pbsBackupPollers = make(map[string]bool)
|
|
}
|
|
inProgress := m.pbsBackupPollers[instanceName]
|
|
m.mu.Unlock()
|
|
|
|
shouldPoll, reason, newLast := m.shouldRunBackupPoll(lastPoll, now)
|
|
if !shouldPoll {
|
|
if reason != "" {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("reason", reason).
|
|
Msg("Skipping PBS backup polling this cycle")
|
|
}
|
|
} else if inProgress {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Msg("PBS backup polling already in progress")
|
|
} else {
|
|
datastoreSnapshot := make([]models.PBSDatastore, len(pbsInst.Datastores))
|
|
copy(datastoreSnapshot, pbsInst.Datastores)
|
|
|
|
// Atomically check and set poller flag
|
|
m.mu.Lock()
|
|
if m.pbsBackupPollers[instanceName] {
|
|
// Race: another goroutine started between our check and lock
|
|
m.mu.Unlock()
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Msg("PBS backup polling started by another goroutine")
|
|
} else {
|
|
m.pbsBackupPollers[instanceName] = true
|
|
m.lastPBSBackupPoll[instanceName] = newLast
|
|
m.mu.Unlock()
|
|
|
|
go func(ds []models.PBSDatastore, inst string, start time.Time, pbsClient *pbs.Client) {
|
|
defer func() {
|
|
m.mu.Lock()
|
|
delete(m.pbsBackupPollers, inst)
|
|
m.lastPBSBackupPoll[inst] = time.Now()
|
|
m.mu.Unlock()
|
|
}()
|
|
|
|
log.Info().
|
|
Str("instance", inst).
|
|
Int("datastores", len(ds)).
|
|
Msg("Starting background PBS backup polling")
|
|
|
|
// Detached background poll: parent ctx may be cancelled when the main
|
|
// poll cycle finishes, so use a fresh context to let PBS polling
|
|
// complete unless the explicit timeout is reached.
|
|
backupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
|
defer cancel()
|
|
|
|
m.pollPBSBackups(backupCtx, inst, pbsClient, ds)
|
|
|
|
log.Info().
|
|
Str("instance", inst).
|
|
Dur("duration", time.Since(start)).
|
|
Msg("Completed background PBS backup polling")
|
|
}(datastoreSnapshot, instanceName, now, client)
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Msg("PBS backup monitoring disabled")
|
|
}
|
|
}
|
|
|
|
// pollPMGInstance polls a single Proxmox Mail Gateway instance
|
|
func (m *Monitor) pollPMGInstance(ctx context.Context, instanceName string, client *pmg.Client) {
|
|
defer recoverFromPanic(fmt.Sprintf("pollPMGInstance-%s", instanceName))
|
|
|
|
start := time.Now()
|
|
debugEnabled := logging.IsLevelEnabled(zerolog.DebugLevel)
|
|
var pollErr error
|
|
if m.pollMetrics != nil {
|
|
m.pollMetrics.IncInFlight("pmg")
|
|
defer m.pollMetrics.DecInFlight("pmg")
|
|
defer func() {
|
|
m.pollMetrics.RecordResult(PollResult{
|
|
InstanceName: instanceName,
|
|
InstanceType: "pmg",
|
|
Success: pollErr == nil,
|
|
Error: pollErr,
|
|
StartTime: start,
|
|
EndTime: time.Now(),
|
|
})
|
|
}()
|
|
}
|
|
if m.stalenessTracker != nil {
|
|
defer func() {
|
|
if pollErr == nil {
|
|
m.stalenessTracker.UpdateSuccess(InstanceTypePMG, instanceName, nil)
|
|
} else {
|
|
m.stalenessTracker.UpdateError(InstanceTypePMG, instanceName)
|
|
}
|
|
}()
|
|
}
|
|
defer m.recordTaskResult(InstanceTypePMG, instanceName, pollErr)
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
pollErr = ctx.Err()
|
|
if debugEnabled {
|
|
log.Debug().Str("instance", instanceName).Msg("PMG polling cancelled by context")
|
|
}
|
|
return
|
|
default:
|
|
}
|
|
|
|
if debugEnabled {
|
|
log.Debug().Str("instance", instanceName).Msg("Polling PMG instance")
|
|
}
|
|
|
|
var instanceCfg *config.PMGInstance
|
|
for idx := range m.config.PMGInstances {
|
|
if m.config.PMGInstances[idx].Name == instanceName {
|
|
instanceCfg = &m.config.PMGInstances[idx]
|
|
break
|
|
}
|
|
}
|
|
|
|
if instanceCfg == nil {
|
|
log.Error().Str("instance", instanceName).Msg("PMG instance config not found")
|
|
pollErr = fmt.Errorf("pmg instance config not found for %s", instanceName)
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
pmgInst := models.PMGInstance{
|
|
ID: "pmg-" + instanceName,
|
|
Name: instanceName,
|
|
Host: instanceCfg.Host,
|
|
Status: "offline",
|
|
ConnectionHealth: "unhealthy",
|
|
LastSeen: now,
|
|
LastUpdated: now,
|
|
}
|
|
|
|
version, err := client.GetVersion(ctx)
|
|
if err != nil {
|
|
monErr := errors.WrapConnectionError("pmg_get_version", instanceName, err)
|
|
pollErr = monErr
|
|
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to connect to PMG instance")
|
|
m.state.SetConnectionHealth("pmg-"+instanceName, false)
|
|
m.state.UpdatePMGInstance(pmgInst)
|
|
|
|
// Check PMG offline status against alert thresholds
|
|
if m.alertManager != nil {
|
|
m.alertManager.CheckPMG(pmgInst)
|
|
}
|
|
|
|
if errors.IsAuthError(err) {
|
|
m.recordAuthFailure(instanceName, "pmg")
|
|
}
|
|
return
|
|
}
|
|
|
|
pmgInst.Status = "online"
|
|
pmgInst.ConnectionHealth = "healthy"
|
|
if version != nil {
|
|
pmgInst.Version = strings.TrimSpace(version.Version)
|
|
}
|
|
m.state.SetConnectionHealth("pmg-"+instanceName, true)
|
|
m.resetAuthFailures(instanceName, "pmg")
|
|
|
|
cluster, err := client.GetClusterStatus(ctx, true)
|
|
if err != nil {
|
|
if debugEnabled {
|
|
log.Debug().Err(err).Str("instance", instanceName).Msg("Failed to retrieve PMG cluster status")
|
|
}
|
|
}
|
|
|
|
backupNodes := make(map[string]struct{})
|
|
|
|
if len(cluster) > 0 {
|
|
nodes := make([]models.PMGNodeStatus, 0, len(cluster))
|
|
for _, entry := range cluster {
|
|
status := strings.ToLower(strings.TrimSpace(entry.Type))
|
|
if status == "" {
|
|
status = "online"
|
|
}
|
|
node := models.PMGNodeStatus{
|
|
Name: entry.Name,
|
|
Status: status,
|
|
Role: entry.Type,
|
|
}
|
|
|
|
backupNodes[entry.Name] = struct{}{}
|
|
|
|
// Fetch queue status for this node
|
|
if queueData, qErr := client.GetQueueStatus(ctx, entry.Name); qErr != nil {
|
|
if debugEnabled {
|
|
log.Debug().Err(qErr).
|
|
Str("instance", instanceName).
|
|
Str("node", entry.Name).
|
|
Msg("Failed to fetch PMG queue status")
|
|
}
|
|
} else if queueData != nil {
|
|
total := queueData.Active.Int64() + queueData.Deferred.Int64() + queueData.Hold.Int64() + queueData.Incoming.Int64()
|
|
node.QueueStatus = &models.PMGQueueStatus{
|
|
Active: queueData.Active.Int(),
|
|
Deferred: queueData.Deferred.Int(),
|
|
Hold: queueData.Hold.Int(),
|
|
Incoming: queueData.Incoming.Int(),
|
|
Total: int(total),
|
|
OldestAge: queueData.OldestAge.Int64(),
|
|
UpdatedAt: time.Now(),
|
|
}
|
|
}
|
|
|
|
nodes = append(nodes, node)
|
|
}
|
|
pmgInst.Nodes = nodes
|
|
}
|
|
|
|
if len(backupNodes) == 0 {
|
|
trimmed := strings.TrimSpace(instanceName)
|
|
if trimmed != "" {
|
|
backupNodes[trimmed] = struct{}{}
|
|
}
|
|
}
|
|
|
|
pmgBackups := make([]models.PMGBackup, 0)
|
|
seenBackupIDs := make(map[string]struct{})
|
|
|
|
for nodeName := range backupNodes {
|
|
if ctx.Err() != nil {
|
|
break
|
|
}
|
|
|
|
backups, backupErr := client.ListBackups(ctx, nodeName)
|
|
if backupErr != nil {
|
|
if debugEnabled {
|
|
log.Debug().Err(backupErr).
|
|
Str("instance", instanceName).
|
|
Str("node", nodeName).
|
|
Msg("Failed to list PMG configuration backups")
|
|
}
|
|
continue
|
|
}
|
|
|
|
for _, b := range backups {
|
|
timestamp := b.Timestamp.Int64()
|
|
backupTime := time.Unix(timestamp, 0)
|
|
id := fmt.Sprintf("pmg-%s-%s-%d", instanceName, nodeName, timestamp)
|
|
if _, exists := seenBackupIDs[id]; exists {
|
|
continue
|
|
}
|
|
seenBackupIDs[id] = struct{}{}
|
|
pmgBackups = append(pmgBackups, models.PMGBackup{
|
|
ID: id,
|
|
Instance: instanceName,
|
|
Node: nodeName,
|
|
Filename: b.Filename,
|
|
BackupTime: backupTime,
|
|
Size: b.Size.Int64(),
|
|
})
|
|
}
|
|
}
|
|
|
|
if debugEnabled {
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Int("backupCount", len(pmgBackups)).
|
|
Msg("PMG backups polled")
|
|
}
|
|
|
|
if stats, err := client.GetMailStatistics(ctx, ""); err != nil {
|
|
log.Warn().Err(err).Str("instance", instanceName).Msg("Failed to fetch PMG mail statistics")
|
|
} else if stats != nil {
|
|
pmgInst.MailStats = &models.PMGMailStats{
|
|
Timeframe: "day",
|
|
CountTotal: stats.Count.Float64(),
|
|
CountIn: stats.CountIn.Float64(),
|
|
CountOut: stats.CountOut.Float64(),
|
|
SpamIn: stats.SpamIn.Float64(),
|
|
SpamOut: stats.SpamOut.Float64(),
|
|
VirusIn: stats.VirusIn.Float64(),
|
|
VirusOut: stats.VirusOut.Float64(),
|
|
BouncesIn: stats.BouncesIn.Float64(),
|
|
BouncesOut: stats.BouncesOut.Float64(),
|
|
BytesIn: stats.BytesIn.Float64(),
|
|
BytesOut: stats.BytesOut.Float64(),
|
|
GreylistCount: stats.GreylistCount.Float64(),
|
|
JunkIn: stats.JunkIn.Float64(),
|
|
AverageProcessTimeMs: stats.AvgProcessSec.Float64() * 1000,
|
|
RBLRejects: stats.RBLRejects.Float64(),
|
|
PregreetRejects: stats.Pregreet.Float64(),
|
|
UpdatedAt: time.Now(),
|
|
}
|
|
}
|
|
|
|
if counts, err := client.GetMailCount(ctx, 86400); err != nil {
|
|
if debugEnabled {
|
|
log.Debug().Err(err).Str("instance", instanceName).Msg("Failed to fetch PMG mail count data")
|
|
}
|
|
} else if len(counts) > 0 {
|
|
points := make([]models.PMGMailCountPoint, 0, len(counts))
|
|
for _, entry := range counts {
|
|
ts := time.Unix(entry.Time.Int64(), 0)
|
|
points = append(points, models.PMGMailCountPoint{
|
|
Timestamp: ts,
|
|
Count: entry.Count.Float64(),
|
|
CountIn: entry.CountIn.Float64(),
|
|
CountOut: entry.CountOut.Float64(),
|
|
SpamIn: entry.SpamIn.Float64(),
|
|
SpamOut: entry.SpamOut.Float64(),
|
|
VirusIn: entry.VirusIn.Float64(),
|
|
VirusOut: entry.VirusOut.Float64(),
|
|
RBLRejects: entry.RBLRejects.Float64(),
|
|
Pregreet: entry.PregreetReject.Float64(),
|
|
BouncesIn: entry.BouncesIn.Float64(),
|
|
BouncesOut: entry.BouncesOut.Float64(),
|
|
Greylist: entry.GreylistCount.Float64(),
|
|
Index: entry.Index.Int(),
|
|
Timeframe: "hour",
|
|
WindowStart: ts,
|
|
})
|
|
}
|
|
pmgInst.MailCount = points
|
|
}
|
|
|
|
if scores, err := client.GetSpamScores(ctx); err != nil {
|
|
if debugEnabled {
|
|
log.Debug().Err(err).Str("instance", instanceName).Msg("Failed to fetch PMG spam score distribution")
|
|
}
|
|
} else if len(scores) > 0 {
|
|
buckets := make([]models.PMGSpamBucket, 0, len(scores))
|
|
for _, bucket := range scores {
|
|
buckets = append(buckets, models.PMGSpamBucket{
|
|
Score: bucket.Level,
|
|
Count: float64(bucket.Count.Int()),
|
|
})
|
|
}
|
|
pmgInst.SpamDistribution = buckets
|
|
}
|
|
|
|
quarantine := models.PMGQuarantineTotals{}
|
|
if spamStatus, err := client.GetQuarantineStatus(ctx, "spam"); err == nil && spamStatus != nil {
|
|
quarantine.Spam = int(spamStatus.Count.Int64())
|
|
}
|
|
if virusStatus, err := client.GetQuarantineStatus(ctx, "virus"); err == nil && virusStatus != nil {
|
|
quarantine.Virus = int(virusStatus.Count.Int64())
|
|
}
|
|
pmgInst.Quarantine = &quarantine
|
|
|
|
m.state.UpdatePMGBackups(instanceName, pmgBackups)
|
|
m.state.UpdatePMGInstance(pmgInst)
|
|
log.Info().
|
|
Str("instance", instanceName).
|
|
Str("status", pmgInst.Status).
|
|
Int("nodes", len(pmgInst.Nodes)).
|
|
Msg("PMG instance updated in state")
|
|
|
|
// Check PMG metrics against alert thresholds
|
|
if m.alertManager != nil {
|
|
m.alertManager.CheckPMG(pmgInst)
|
|
}
|
|
}
|
|
|
|
// GetState returns the current state
|
|
func (m *Monitor) GetState() models.StateSnapshot {
|
|
// Check if mock mode is enabled
|
|
if mock.IsMockEnabled() {
|
|
state := mock.GetMockState()
|
|
if state.ActiveAlerts == nil {
|
|
// Populate snapshot lazily if the cache hasn't been filled yet.
|
|
mock.UpdateAlertSnapshots(m.alertManager.GetActiveAlerts(), m.alertManager.GetRecentlyResolved())
|
|
state = mock.GetMockState()
|
|
}
|
|
return state
|
|
}
|
|
return m.state.GetSnapshot()
|
|
}
|
|
|
|
// SetMockMode switches between mock data and real infrastructure data at runtime.
|
|
func (m *Monitor) SetMockMode(enable bool) {
|
|
current := mock.IsMockEnabled()
|
|
if current == enable {
|
|
log.Info().Bool("mockMode", enable).Msg("Mock mode already in desired state")
|
|
return
|
|
}
|
|
|
|
if enable {
|
|
mock.SetEnabled(true)
|
|
m.alertManager.ClearActiveAlerts()
|
|
m.mu.Lock()
|
|
m.resetStateLocked()
|
|
m.mu.Unlock()
|
|
m.StopDiscoveryService()
|
|
log.Info().Msg("Switched monitor to mock mode")
|
|
} else {
|
|
mock.SetEnabled(false)
|
|
m.alertManager.ClearActiveAlerts()
|
|
m.mu.Lock()
|
|
m.resetStateLocked()
|
|
m.mu.Unlock()
|
|
log.Info().Msg("Switched monitor to real data mode")
|
|
}
|
|
|
|
m.mu.RLock()
|
|
ctx := m.runtimeCtx
|
|
hub := m.wsHub
|
|
m.mu.RUnlock()
|
|
|
|
if hub != nil {
|
|
hub.BroadcastState(m.GetState().ToFrontend())
|
|
}
|
|
|
|
if !enable && ctx != nil && hub != nil {
|
|
// Kick off an immediate poll to repopulate state with live data
|
|
go m.poll(ctx, hub)
|
|
if m.config.DiscoveryEnabled {
|
|
go m.StartDiscoveryService(ctx, hub, m.config.DiscoverySubnet)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) resetStateLocked() {
|
|
m.state = models.NewState()
|
|
m.state.Stats = models.Stats{
|
|
StartTime: m.startTime,
|
|
Version: "2.0.0-go",
|
|
}
|
|
}
|
|
|
|
// GetStartTime returns the monitor start time
|
|
func (m *Monitor) GetStartTime() time.Time {
|
|
return m.startTime
|
|
}
|
|
|
|
// GetDiscoveryService returns the discovery service
|
|
func (m *Monitor) GetDiscoveryService() *discovery.Service {
|
|
return m.discoveryService
|
|
}
|
|
|
|
// StartDiscoveryService starts the discovery service if not already running
|
|
func (m *Monitor) StartDiscoveryService(ctx context.Context, wsHub *websocket.Hub, subnet string) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.discoveryService != nil {
|
|
log.Debug().Msg("Discovery service already running")
|
|
return
|
|
}
|
|
|
|
if subnet == "" {
|
|
subnet = "auto"
|
|
}
|
|
|
|
cfgProvider := func() config.DiscoveryConfig {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
if m.config == nil {
|
|
return config.DefaultDiscoveryConfig()
|
|
}
|
|
return config.CloneDiscoveryConfig(m.config.Discovery)
|
|
}
|
|
|
|
m.discoveryService = discovery.NewService(wsHub, 5*time.Minute, subnet, cfgProvider)
|
|
if m.discoveryService != nil {
|
|
m.discoveryService.Start(ctx)
|
|
log.Info().Str("subnet", subnet).Msg("Discovery service started")
|
|
} else {
|
|
log.Error().Msg("Failed to create discovery service")
|
|
}
|
|
}
|
|
|
|
// StopDiscoveryService stops the discovery service if running
|
|
func (m *Monitor) StopDiscoveryService() {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.discoveryService != nil {
|
|
m.discoveryService.Stop()
|
|
m.discoveryService = nil
|
|
log.Info().Msg("Discovery service stopped")
|
|
}
|
|
}
|
|
|
|
// EnableTemperatureMonitoring enables temperature data collection
|
|
func (m *Monitor) EnableTemperatureMonitoring() {
|
|
// Temperature collection is always enabled when tempCollector is initialized
|
|
// This method exists for interface compatibility
|
|
log.Info().Msg("Temperature monitoring enabled")
|
|
}
|
|
|
|
// DisableTemperatureMonitoring disables temperature data collection
|
|
func (m *Monitor) DisableTemperatureMonitoring() {
|
|
// Temperature collection is always enabled when tempCollector is initialized
|
|
// This method exists for interface compatibility
|
|
log.Info().Msg("Temperature monitoring disabled")
|
|
}
|
|
|
|
// GetGuestMetrics returns historical metrics for a guest
|
|
func (m *Monitor) GetGuestMetrics(guestID string, duration time.Duration) map[string][]MetricPoint {
|
|
return m.metricsHistory.GetAllGuestMetrics(guestID, duration)
|
|
}
|
|
|
|
// GetNodeMetrics returns historical metrics for a node
|
|
func (m *Monitor) GetNodeMetrics(nodeID string, metricType string, duration time.Duration) []MetricPoint {
|
|
return m.metricsHistory.GetNodeMetrics(nodeID, metricType, duration)
|
|
}
|
|
|
|
// GetStorageMetrics returns historical metrics for storage
|
|
func (m *Monitor) GetStorageMetrics(storageID string, duration time.Duration) map[string][]MetricPoint {
|
|
return m.metricsHistory.GetAllStorageMetrics(storageID, duration)
|
|
}
|
|
|
|
// GetAlertManager returns the alert manager
|
|
func (m *Monitor) GetAlertManager() *alerts.Manager {
|
|
return m.alertManager
|
|
}
|
|
|
|
// GetNotificationManager returns the notification manager
|
|
func (m *Monitor) GetNotificationManager() *notifications.NotificationManager {
|
|
return m.notificationMgr
|
|
}
|
|
|
|
// GetConfigPersistence returns the config persistence manager
|
|
func (m *Monitor) GetConfigPersistence() *config.ConfigPersistence {
|
|
return m.configPersist
|
|
}
|
|
|
|
// pollStorageBackupsWithNodes polls backups using a provided nodes list to avoid duplicate GetNodes calls
|
|
func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName string, client PVEClientInterface, nodes []proxmox.Node, nodeEffectiveStatus map[string]string) {
|
|
|
|
var allBackups []models.StorageBackup
|
|
seenVolids := make(map[string]bool) // Track seen volume IDs to avoid duplicates
|
|
hadSuccessfulNode := false // Track if at least one node responded successfully
|
|
storagesWithBackup := 0 // Number of storages that should contain backups
|
|
contentSuccess := 0 // Number of successful storage content fetches
|
|
contentFailures := 0 // Number of failed storage content fetches
|
|
storageQueryErrors := 0 // Number of nodes where storage list could not be queried
|
|
storagePreserveNeeded := map[string]struct{}{}
|
|
storageSuccess := map[string]struct{}{}
|
|
|
|
// Build guest lookup map to find actual node for each VMID
|
|
snapshot := m.state.GetSnapshot()
|
|
guestNodeMap := make(map[int]string) // VMID -> actual node name
|
|
for _, vm := range snapshot.VMs {
|
|
if vm.Instance == instanceName {
|
|
guestNodeMap[vm.VMID] = vm.Node
|
|
}
|
|
}
|
|
for _, ct := range snapshot.Containers {
|
|
if ct.Instance == instanceName {
|
|
guestNodeMap[int(ct.VMID)] = ct.Node
|
|
}
|
|
}
|
|
|
|
// For each node, get storage and check content
|
|
for _, node := range nodes {
|
|
if nodeEffectiveStatus[node.Node] != "online" {
|
|
for _, storageName := range storageNamesForNode(instanceName, node.Node, snapshot) {
|
|
storagePreserveNeeded[storageName] = struct{}{}
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Get storage for this node - retry once on timeout
|
|
var storages []proxmox.Storage
|
|
var err error
|
|
|
|
for attempt := 1; attempt <= 2; attempt++ {
|
|
storages, err = client.GetStorage(ctx, node.Node)
|
|
if err == nil {
|
|
break // Success
|
|
}
|
|
|
|
// Check if it's a timeout error
|
|
if strings.Contains(err.Error(), "timeout") || strings.Contains(err.Error(), "deadline exceeded") {
|
|
if attempt == 1 {
|
|
log.Warn().
|
|
Str("node", node.Node).
|
|
Str("instance", instanceName).
|
|
Msg("Storage query timed out, retrying with extended timeout...")
|
|
// Give it a bit more time on retry
|
|
time.Sleep(2 * time.Second)
|
|
continue
|
|
}
|
|
}
|
|
// Non-timeout error or second attempt failed
|
|
break
|
|
}
|
|
|
|
if err != nil {
|
|
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_storage_for_backups", instanceName, err).WithNode(node.Node)
|
|
log.Warn().Err(monErr).Str("node", node.Node).Msg("Failed to get storage for backups - skipping node")
|
|
for _, storageName := range storageNamesForNode(instanceName, node.Node, snapshot) {
|
|
storagePreserveNeeded[storageName] = struct{}{}
|
|
}
|
|
storageQueryErrors++
|
|
continue
|
|
}
|
|
|
|
hadSuccessfulNode = true
|
|
|
|
// For each storage that can contain backups or templates
|
|
for _, storage := range storages {
|
|
// Check if storage supports backup content
|
|
if !strings.Contains(storage.Content, "backup") {
|
|
continue
|
|
}
|
|
if !storageContentQueryable(storage) {
|
|
continue
|
|
}
|
|
|
|
storagesWithBackup++
|
|
|
|
// Get storage content
|
|
contents, err := client.GetStorageContent(ctx, node.Node, storage.Storage)
|
|
if err != nil {
|
|
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_storage_content", instanceName, err).WithNode(node.Node)
|
|
log.Debug().Err(monErr).
|
|
Str("node", node.Node).
|
|
Str("storage", storage.Storage).
|
|
Msg("Failed to get storage content")
|
|
if _, ok := storageSuccess[storage.Storage]; !ok {
|
|
storagePreserveNeeded[storage.Storage] = struct{}{}
|
|
}
|
|
contentFailures++
|
|
continue
|
|
}
|
|
|
|
contentSuccess++
|
|
storageSuccess[storage.Storage] = struct{}{}
|
|
delete(storagePreserveNeeded, storage.Storage)
|
|
|
|
// Convert to models
|
|
for _, content := range contents {
|
|
// Skip if we've already seen this item (shared storage duplicate)
|
|
if seenVolids[content.Volid] {
|
|
continue
|
|
}
|
|
seenVolids[content.Volid] = true
|
|
|
|
// Skip templates and ISOs - they're not backups
|
|
if content.Content == "vztmpl" || content.Content == "iso" {
|
|
continue
|
|
}
|
|
|
|
// Determine type from content type and VMID
|
|
backupType := "unknown"
|
|
if content.VMID == 0 {
|
|
backupType = "host"
|
|
} else if strings.Contains(content.Volid, "/vm/") || strings.Contains(content.Volid, "qemu") {
|
|
backupType = "qemu"
|
|
} else if strings.Contains(content.Volid, "/ct/") || strings.Contains(content.Volid, "lxc") {
|
|
backupType = "lxc"
|
|
} else if strings.Contains(content.Format, "pbs-ct") {
|
|
// PBS format check as fallback
|
|
backupType = "lxc"
|
|
} else if strings.Contains(content.Format, "pbs-vm") {
|
|
// PBS format check as fallback
|
|
backupType = "qemu"
|
|
}
|
|
|
|
// Determine the correct node: for guest backups (VMID > 0), use the actual guest's node
|
|
// For host backups (VMID == 0), use the node where the backup was found
|
|
backupNode := node.Node
|
|
if content.VMID > 0 {
|
|
if actualNode, found := guestNodeMap[content.VMID]; found {
|
|
backupNode = actualNode
|
|
}
|
|
// If not found in map, fall back to queried node (shouldn't happen normally)
|
|
}
|
|
isPBSStorage := strings.HasPrefix(storage.Storage, "pbs-") || storage.Type == "pbs"
|
|
|
|
// Check verification status for PBS backups
|
|
verified := false
|
|
verificationInfo := ""
|
|
if isPBSStorage {
|
|
// Check if verified flag is set
|
|
if content.Verified > 0 {
|
|
verified = true
|
|
}
|
|
// Also check verification map if available
|
|
if content.Verification != nil {
|
|
if state, ok := content.Verification["state"].(string); ok {
|
|
verified = (state == "ok")
|
|
verificationInfo = state
|
|
}
|
|
}
|
|
}
|
|
|
|
backup := models.StorageBackup{
|
|
ID: fmt.Sprintf("%s-%s", instanceName, content.Volid),
|
|
Storage: storage.Storage,
|
|
Node: backupNode,
|
|
Instance: instanceName,
|
|
Type: backupType,
|
|
VMID: content.VMID,
|
|
Time: time.Unix(content.CTime, 0),
|
|
CTime: content.CTime,
|
|
Size: int64(content.Size),
|
|
Format: content.Format,
|
|
Notes: content.Notes,
|
|
Protected: content.Protected > 0,
|
|
Volid: content.Volid,
|
|
IsPBS: isPBSStorage,
|
|
Verified: verified,
|
|
Verification: verificationInfo,
|
|
}
|
|
|
|
allBackups = append(allBackups, backup)
|
|
}
|
|
}
|
|
}
|
|
|
|
allBackups, preservedStorages := preserveFailedStorageBackups(instanceName, snapshot, storagePreserveNeeded, allBackups)
|
|
if len(preservedStorages) > 0 {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Strs("storages", preservedStorages).
|
|
Msg("Preserving previous storage backup data due to partial failures")
|
|
}
|
|
|
|
// Decide whether to keep existing backups when every query failed
|
|
if shouldPreserveBackups(len(nodes), hadSuccessfulNode, storagesWithBackup, contentSuccess) {
|
|
if len(nodes) > 0 && !hadSuccessfulNode {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Int("nodes", len(nodes)).
|
|
Int("errors", storageQueryErrors).
|
|
Msg("Failed to query storage on all nodes; keeping previous backup list")
|
|
} else if storagesWithBackup > 0 && contentSuccess == 0 {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Int("storages", storagesWithBackup).
|
|
Int("failures", contentFailures).
|
|
Msg("All storage content queries failed; keeping previous backup list")
|
|
}
|
|
return
|
|
}
|
|
|
|
// Update state with storage backups for this instance
|
|
m.state.UpdateStorageBackupsForInstance(instanceName, allBackups)
|
|
|
|
if m.alertManager != nil {
|
|
snapshot := m.state.GetSnapshot()
|
|
guestsByKey, guestsByVMID := buildGuestLookups(snapshot, m.guestMetadataStore)
|
|
pveStorage := snapshot.Backups.PVE.StorageBackups
|
|
if len(pveStorage) == 0 && len(snapshot.PVEBackups.StorageBackups) > 0 {
|
|
pveStorage = snapshot.PVEBackups.StorageBackups
|
|
}
|
|
pbsBackups := snapshot.Backups.PBS
|
|
if len(pbsBackups) == 0 && len(snapshot.PBSBackups) > 0 {
|
|
pbsBackups = snapshot.PBSBackups
|
|
}
|
|
pmgBackups := snapshot.Backups.PMG
|
|
if len(pmgBackups) == 0 && len(snapshot.PMGBackups) > 0 {
|
|
pmgBackups = snapshot.PMGBackups
|
|
}
|
|
m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)
|
|
}
|
|
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Int("count", len(allBackups)).
|
|
Msg("Storage backups polled")
|
|
}
|
|
|
|
func shouldPreserveBackups(nodeCount int, hadSuccessfulNode bool, storagesWithBackup, contentSuccess int) bool {
|
|
if nodeCount > 0 && !hadSuccessfulNode {
|
|
return true
|
|
}
|
|
if storagesWithBackup > 0 && contentSuccess == 0 {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func shouldPreservePBSBackups(datastoreCount, datastoreFetches int) bool {
|
|
// If there are datastores but all fetches failed, preserve existing backups
|
|
if datastoreCount > 0 && datastoreFetches == 0 {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func storageNamesForNode(instanceName, nodeName string, snapshot models.StateSnapshot) []string {
|
|
if nodeName == "" {
|
|
return nil
|
|
}
|
|
|
|
var storages []string
|
|
for _, storage := range snapshot.Storage {
|
|
if storage.Instance != instanceName {
|
|
continue
|
|
}
|
|
if storage.Name == "" {
|
|
continue
|
|
}
|
|
if !strings.Contains(storage.Content, "backup") {
|
|
continue
|
|
}
|
|
if storage.Node == nodeName {
|
|
storages = append(storages, storage.Name)
|
|
continue
|
|
}
|
|
for _, node := range storage.Nodes {
|
|
if node == nodeName {
|
|
storages = append(storages, storage.Name)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
return storages
|
|
}
|
|
|
|
func preserveFailedStorageBackups(instanceName string, snapshot models.StateSnapshot, storagesToPreserve map[string]struct{}, current []models.StorageBackup) ([]models.StorageBackup, []string) {
|
|
if len(storagesToPreserve) == 0 {
|
|
return current, nil
|
|
}
|
|
|
|
existing := make(map[string]struct{}, len(current))
|
|
for _, backup := range current {
|
|
existing[backup.ID] = struct{}{}
|
|
}
|
|
|
|
preserved := make(map[string]struct{})
|
|
for _, backup := range snapshot.PVEBackups.StorageBackups {
|
|
if backup.Instance != instanceName {
|
|
continue
|
|
}
|
|
if _, ok := storagesToPreserve[backup.Storage]; !ok {
|
|
continue
|
|
}
|
|
if _, duplicate := existing[backup.ID]; duplicate {
|
|
continue
|
|
}
|
|
current = append(current, backup)
|
|
existing[backup.ID] = struct{}{}
|
|
preserved[backup.Storage] = struct{}{}
|
|
}
|
|
|
|
if len(preserved) == 0 {
|
|
return current, nil
|
|
}
|
|
|
|
storages := make([]string, 0, len(preserved))
|
|
for storage := range preserved {
|
|
storages = append(storages, storage)
|
|
}
|
|
sort.Strings(storages)
|
|
return current, storages
|
|
}
|
|
|
|
func buildGuestLookups(snapshot models.StateSnapshot, metadataStore *config.GuestMetadataStore) (map[string]alerts.GuestLookup, map[string][]alerts.GuestLookup) {
|
|
byKey := make(map[string]alerts.GuestLookup)
|
|
byVMID := make(map[string][]alerts.GuestLookup)
|
|
|
|
for _, vm := range snapshot.VMs {
|
|
info := alerts.GuestLookup{
|
|
Name: vm.Name,
|
|
Instance: vm.Instance,
|
|
Node: vm.Node,
|
|
Type: vm.Type,
|
|
VMID: vm.VMID,
|
|
}
|
|
key := alerts.BuildGuestKey(vm.Instance, vm.Node, vm.VMID)
|
|
byKey[key] = info
|
|
|
|
vmidKey := fmt.Sprintf("%d", vm.VMID)
|
|
byVMID[vmidKey] = append(byVMID[vmidKey], info)
|
|
|
|
// Persist last-known name and type for this guest
|
|
if metadataStore != nil && vm.Name != "" {
|
|
persistGuestIdentity(metadataStore, key, vm.Name, vm.Type)
|
|
}
|
|
}
|
|
|
|
for _, ct := range snapshot.Containers {
|
|
info := alerts.GuestLookup{
|
|
Name: ct.Name,
|
|
Instance: ct.Instance,
|
|
Node: ct.Node,
|
|
Type: ct.Type,
|
|
VMID: int(ct.VMID),
|
|
}
|
|
key := alerts.BuildGuestKey(ct.Instance, ct.Node, int(ct.VMID))
|
|
if _, exists := byKey[key]; !exists {
|
|
byKey[key] = info
|
|
}
|
|
|
|
vmidKey := fmt.Sprintf("%d", ct.VMID)
|
|
byVMID[vmidKey] = append(byVMID[vmidKey], info)
|
|
|
|
// Persist last-known name and type for this guest
|
|
if metadataStore != nil && ct.Name != "" {
|
|
persistGuestIdentity(metadataStore, key, ct.Name, ct.Type)
|
|
}
|
|
}
|
|
|
|
// Augment byVMID with persisted metadata for deleted guests
|
|
if metadataStore != nil {
|
|
enrichWithPersistedMetadata(metadataStore, byVMID)
|
|
}
|
|
|
|
return byKey, byVMID
|
|
}
|
|
|
|
// enrichWithPersistedMetadata adds entries from the metadata store for guests
|
|
// that no longer exist in the live inventory but have persisted identity data
|
|
func enrichWithPersistedMetadata(metadataStore *config.GuestMetadataStore, byVMID map[string][]alerts.GuestLookup) {
|
|
allMetadata := metadataStore.GetAll()
|
|
for guestKey, meta := range allMetadata {
|
|
if meta.LastKnownName == "" {
|
|
continue // No name persisted, skip
|
|
}
|
|
|
|
// Parse the guest key (format: instance:node:vmid)
|
|
// We need to extract instance, node, and vmid
|
|
var instance, node string
|
|
var vmid int
|
|
if _, err := fmt.Sscanf(guestKey, "%[^:]:%[^:]:%d", &instance, &node, &vmid); err != nil {
|
|
continue // Invalid key format
|
|
}
|
|
|
|
vmidKey := fmt.Sprintf("%d", vmid)
|
|
|
|
// Check if we already have a live entry for this exact guest
|
|
hasLiveEntry := false
|
|
for _, existing := range byVMID[vmidKey] {
|
|
if existing.Instance == instance && existing.Node == node && existing.VMID == vmid {
|
|
hasLiveEntry = true
|
|
break
|
|
}
|
|
}
|
|
|
|
// Only add persisted metadata if no live entry exists
|
|
if !hasLiveEntry {
|
|
byVMID[vmidKey] = append(byVMID[vmidKey], alerts.GuestLookup{
|
|
Name: meta.LastKnownName,
|
|
Instance: instance,
|
|
Node: node,
|
|
Type: meta.LastKnownType,
|
|
VMID: vmid,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
// persistGuestIdentity updates the metadata store with the last-known name and type for a guest
|
|
func persistGuestIdentity(metadataStore *config.GuestMetadataStore, guestKey, name, guestType string) {
|
|
existing := metadataStore.Get(guestKey)
|
|
if existing == nil {
|
|
existing = &config.GuestMetadata{
|
|
ID: guestKey,
|
|
Tags: []string{},
|
|
}
|
|
}
|
|
|
|
// Only update if the name or type has changed
|
|
if existing.LastKnownName != name || existing.LastKnownType != guestType {
|
|
existing.LastKnownName = name
|
|
existing.LastKnownType = guestType
|
|
// Save asynchronously to avoid blocking the monitor
|
|
go func() {
|
|
if err := metadataStore.Set(guestKey, existing); err != nil {
|
|
log.Error().Err(err).Str("guestKey", guestKey).Msg("Failed to persist guest identity")
|
|
}
|
|
}()
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) calculateBackupOperationTimeout(instanceName string) time.Duration {
|
|
const (
|
|
minTimeout = 2 * time.Minute
|
|
maxTimeout = 5 * time.Minute
|
|
timeoutPerGuest = 2 * time.Second
|
|
)
|
|
|
|
timeout := minTimeout
|
|
snapshot := m.state.GetSnapshot()
|
|
|
|
guestCount := 0
|
|
for _, vm := range snapshot.VMs {
|
|
if vm.Instance == instanceName && !vm.Template {
|
|
guestCount++
|
|
}
|
|
}
|
|
for _, ct := range snapshot.Containers {
|
|
if ct.Instance == instanceName && !ct.Template {
|
|
guestCount++
|
|
}
|
|
}
|
|
|
|
if guestCount > 0 {
|
|
dynamic := time.Duration(guestCount) * timeoutPerGuest
|
|
if dynamic > timeout {
|
|
timeout = dynamic
|
|
}
|
|
}
|
|
|
|
if timeout > maxTimeout {
|
|
return maxTimeout
|
|
}
|
|
|
|
return timeout
|
|
}
|
|
|
|
// pollGuestSnapshots polls snapshots for all VMs and containers
|
|
func (m *Monitor) pollGuestSnapshots(ctx context.Context, instanceName string, client PVEClientInterface) {
|
|
log.Debug().Str("instance", instanceName).Msg("Polling guest snapshots")
|
|
|
|
// Get current VMs and containers from state for this instance
|
|
m.mu.RLock()
|
|
var vms []models.VM
|
|
for _, vm := range m.state.VMs {
|
|
if vm.Instance == instanceName {
|
|
vms = append(vms, vm)
|
|
}
|
|
}
|
|
var containers []models.Container
|
|
for _, ct := range m.state.Containers {
|
|
if ct.Instance == instanceName {
|
|
containers = append(containers, ct)
|
|
}
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
guestKey := func(instance, node string, vmid int) string {
|
|
if instance == node {
|
|
return fmt.Sprintf("%s-%d", node, vmid)
|
|
}
|
|
return fmt.Sprintf("%s-%s-%d", instance, node, vmid)
|
|
}
|
|
|
|
guestNames := make(map[string]string, len(vms)+len(containers))
|
|
for _, vm := range vms {
|
|
guestNames[guestKey(instanceName, vm.Node, vm.VMID)] = vm.Name
|
|
}
|
|
for _, ct := range containers {
|
|
guestNames[guestKey(instanceName, ct.Node, ct.VMID)] = ct.Name
|
|
}
|
|
|
|
activeGuests := 0
|
|
for _, vm := range vms {
|
|
if !vm.Template {
|
|
activeGuests++
|
|
}
|
|
}
|
|
for _, ct := range containers {
|
|
if !ct.Template {
|
|
activeGuests++
|
|
}
|
|
}
|
|
|
|
const (
|
|
minSnapshotTimeout = 60 * time.Second
|
|
maxSnapshotTimeout = 4 * time.Minute
|
|
snapshotTimeoutPerGuest = 2 * time.Second
|
|
)
|
|
|
|
timeout := minSnapshotTimeout
|
|
if activeGuests > 0 {
|
|
dynamic := time.Duration(activeGuests) * snapshotTimeoutPerGuest
|
|
if dynamic > timeout {
|
|
timeout = dynamic
|
|
}
|
|
}
|
|
if timeout > maxSnapshotTimeout {
|
|
timeout = maxSnapshotTimeout
|
|
}
|
|
|
|
if deadline, ok := ctx.Deadline(); ok {
|
|
remaining := time.Until(deadline)
|
|
if remaining <= 0 {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Msg("Skipping guest snapshot polling; backup context deadline exceeded")
|
|
return
|
|
}
|
|
if timeout > remaining {
|
|
timeout = remaining
|
|
}
|
|
}
|
|
|
|
snapshotCtx, cancel := context.WithTimeout(ctx, timeout)
|
|
defer cancel()
|
|
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Int("guestCount", activeGuests).
|
|
Dur("timeout", timeout).
|
|
Msg("Guest snapshot polling budget established")
|
|
|
|
var allSnapshots []models.GuestSnapshot
|
|
deadlineExceeded := false
|
|
|
|
// Poll VM snapshots
|
|
for _, vm := range vms {
|
|
// Skip templates
|
|
if vm.Template {
|
|
continue
|
|
}
|
|
|
|
snapshots, err := client.GetVMSnapshots(snapshotCtx, vm.Node, vm.VMID)
|
|
if err != nil {
|
|
if snapshotCtx.Err() != nil {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Str("node", vm.Node).
|
|
Int("vmid", vm.VMID).
|
|
Err(snapshotCtx.Err()).
|
|
Msg("Aborting guest snapshot polling due to context cancellation while fetching VM snapshots")
|
|
deadlineExceeded = true
|
|
break
|
|
}
|
|
// This is common for VMs without snapshots, so use debug level
|
|
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_vm_snapshots", instanceName, err).WithNode(vm.Node)
|
|
log.Debug().
|
|
Err(monErr).
|
|
Str("node", vm.Node).
|
|
Int("vmid", vm.VMID).
|
|
Msg("Failed to get VM snapshots")
|
|
continue
|
|
}
|
|
|
|
for _, snap := range snapshots {
|
|
snapshot := models.GuestSnapshot{
|
|
ID: fmt.Sprintf("%s-%s-%d-%s", instanceName, vm.Node, vm.VMID, snap.Name),
|
|
Name: snap.Name,
|
|
Node: vm.Node,
|
|
Instance: instanceName,
|
|
Type: "qemu",
|
|
VMID: vm.VMID,
|
|
Time: time.Unix(snap.SnapTime, 0),
|
|
Description: snap.Description,
|
|
Parent: snap.Parent,
|
|
VMState: true, // VM state support enabled
|
|
}
|
|
|
|
allSnapshots = append(allSnapshots, snapshot)
|
|
}
|
|
}
|
|
|
|
if deadlineExceeded {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Msg("Guest snapshot polling timed out before completing VM collection; retaining previous snapshots")
|
|
return
|
|
}
|
|
|
|
// Poll container snapshots
|
|
for _, ct := range containers {
|
|
// Skip templates
|
|
if ct.Template {
|
|
continue
|
|
}
|
|
|
|
snapshots, err := client.GetContainerSnapshots(snapshotCtx, ct.Node, ct.VMID)
|
|
if err != nil {
|
|
if snapshotCtx.Err() != nil {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Str("node", ct.Node).
|
|
Int("vmid", ct.VMID).
|
|
Err(snapshotCtx.Err()).
|
|
Msg("Aborting guest snapshot polling due to context cancellation while fetching container snapshots")
|
|
deadlineExceeded = true
|
|
break
|
|
}
|
|
// API error 596 means snapshots not supported/available - this is expected for many containers
|
|
errStr := err.Error()
|
|
if strings.Contains(errStr, "596") || strings.Contains(errStr, "not available") {
|
|
// Silently skip containers without snapshot support
|
|
continue
|
|
}
|
|
// Log other errors at debug level
|
|
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_container_snapshots", instanceName, err).WithNode(ct.Node)
|
|
log.Debug().
|
|
Err(monErr).
|
|
Str("node", ct.Node).
|
|
Int("vmid", ct.VMID).
|
|
Msg("Failed to get container snapshots")
|
|
continue
|
|
}
|
|
|
|
for _, snap := range snapshots {
|
|
snapshot := models.GuestSnapshot{
|
|
ID: fmt.Sprintf("%s-%s-%d-%s", instanceName, ct.Node, ct.VMID, snap.Name),
|
|
Name: snap.Name,
|
|
Node: ct.Node,
|
|
Instance: instanceName,
|
|
Type: "lxc",
|
|
VMID: ct.VMID,
|
|
Time: time.Unix(snap.SnapTime, 0),
|
|
Description: snap.Description,
|
|
Parent: snap.Parent,
|
|
VMState: false,
|
|
}
|
|
|
|
allSnapshots = append(allSnapshots, snapshot)
|
|
}
|
|
}
|
|
|
|
if deadlineExceeded || snapshotCtx.Err() != nil {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Msg("Guest snapshot polling timed out before completion; retaining previous snapshots")
|
|
return
|
|
}
|
|
|
|
if len(allSnapshots) > 0 {
|
|
sizeMap := m.collectSnapshotSizes(snapshotCtx, instanceName, client, allSnapshots)
|
|
if len(sizeMap) > 0 {
|
|
for i := range allSnapshots {
|
|
if size, ok := sizeMap[allSnapshots[i].ID]; ok && size > 0 {
|
|
allSnapshots[i].SizeBytes = size
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update state with guest snapshots for this instance
|
|
m.state.UpdateGuestSnapshotsForInstance(instanceName, allSnapshots)
|
|
|
|
if m.alertManager != nil {
|
|
m.alertManager.CheckSnapshotsForInstance(instanceName, allSnapshots, guestNames)
|
|
}
|
|
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Int("count", len(allSnapshots)).
|
|
Msg("Guest snapshots polled")
|
|
}
|
|
|
|
func (m *Monitor) collectSnapshotSizes(ctx context.Context, instanceName string, client PVEClientInterface, snapshots []models.GuestSnapshot) map[string]int64 {
|
|
sizes := make(map[string]int64, len(snapshots))
|
|
if len(snapshots) == 0 {
|
|
return sizes
|
|
}
|
|
|
|
validSnapshots := make(map[string]struct{}, len(snapshots))
|
|
nodes := make(map[string]struct{})
|
|
|
|
for _, snap := range snapshots {
|
|
validSnapshots[snap.ID] = struct{}{}
|
|
if snap.Node != "" {
|
|
nodes[snap.Node] = struct{}{}
|
|
}
|
|
}
|
|
|
|
if len(nodes) == 0 {
|
|
return sizes
|
|
}
|
|
|
|
seenVolids := make(map[string]struct{})
|
|
|
|
for nodeName := range nodes {
|
|
if ctx.Err() != nil {
|
|
break
|
|
}
|
|
|
|
storages, err := client.GetStorage(ctx, nodeName)
|
|
if err != nil {
|
|
log.Debug().
|
|
Err(err).
|
|
Str("node", nodeName).
|
|
Str("instance", instanceName).
|
|
Msg("Failed to get storage list for snapshot sizing")
|
|
continue
|
|
}
|
|
|
|
for _, storage := range storages {
|
|
if ctx.Err() != nil {
|
|
break
|
|
}
|
|
|
|
contentTypes := strings.ToLower(storage.Content)
|
|
if !strings.Contains(contentTypes, "images") && !strings.Contains(contentTypes, "rootdir") {
|
|
continue
|
|
}
|
|
if !storageContentQueryable(storage) {
|
|
continue
|
|
}
|
|
|
|
contents, err := client.GetStorageContent(ctx, nodeName, storage.Storage)
|
|
if err != nil {
|
|
log.Debug().
|
|
Err(err).
|
|
Str("node", nodeName).
|
|
Str("storage", storage.Storage).
|
|
Str("instance", instanceName).
|
|
Msg("Failed to get storage content for snapshot sizing")
|
|
continue
|
|
}
|
|
|
|
for _, item := range contents {
|
|
if item.VMID <= 0 {
|
|
continue
|
|
}
|
|
|
|
if _, seen := seenVolids[item.Volid]; seen {
|
|
continue
|
|
}
|
|
|
|
snapName := extractSnapshotName(item.Volid)
|
|
if snapName == "" {
|
|
continue
|
|
}
|
|
|
|
key := fmt.Sprintf("%s-%s-%d-%s", instanceName, nodeName, item.VMID, snapName)
|
|
if _, ok := validSnapshots[key]; !ok {
|
|
continue
|
|
}
|
|
|
|
seenVolids[item.Volid] = struct{}{}
|
|
|
|
size := int64(item.Size)
|
|
if size < 0 {
|
|
size = 0
|
|
}
|
|
|
|
sizes[key] += size
|
|
}
|
|
}
|
|
}
|
|
|
|
return sizes
|
|
}
|
|
|
|
func extractSnapshotName(volid string) string {
|
|
if volid == "" {
|
|
return ""
|
|
}
|
|
|
|
parts := strings.SplitN(volid, ":", 2)
|
|
remainder := volid
|
|
if len(parts) == 2 {
|
|
remainder = parts[1]
|
|
}
|
|
|
|
if idx := strings.Index(remainder, "@"); idx >= 0 && idx+1 < len(remainder) {
|
|
return strings.TrimSpace(remainder[idx+1:])
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
// Stop gracefully stops the monitor
|
|
func (m *Monitor) Stop() {
|
|
log.Info().Msg("Stopping monitor")
|
|
|
|
// Stop the alert manager to save history
|
|
if m.alertManager != nil {
|
|
m.alertManager.Stop()
|
|
}
|
|
|
|
// Stop notification manager
|
|
if m.notificationMgr != nil {
|
|
m.notificationMgr.Stop()
|
|
}
|
|
|
|
log.Info().Msg("Monitor stopped")
|
|
}
|
|
|
|
// recordAuthFailure records an authentication failure for a node
|
|
func (m *Monitor) recordAuthFailure(instanceName string, nodeType string) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
nodeID := instanceName
|
|
if nodeType != "" {
|
|
nodeID = nodeType + "-" + instanceName
|
|
}
|
|
|
|
// Increment failure count
|
|
m.authFailures[nodeID]++
|
|
m.lastAuthAttempt[nodeID] = time.Now()
|
|
|
|
log.Warn().
|
|
Str("node", nodeID).
|
|
Int("failures", m.authFailures[nodeID]).
|
|
Msg("Authentication failure recorded")
|
|
|
|
// If we've exceeded the threshold, remove the node
|
|
const maxAuthFailures = 5
|
|
if m.authFailures[nodeID] >= maxAuthFailures {
|
|
log.Error().
|
|
Str("node", nodeID).
|
|
Int("failures", m.authFailures[nodeID]).
|
|
Msg("Maximum authentication failures reached, removing node from state")
|
|
|
|
// Remove from state based on type
|
|
if nodeType == "pve" {
|
|
m.removeFailedPVENode(instanceName)
|
|
} else if nodeType == "pbs" {
|
|
m.removeFailedPBSNode(instanceName)
|
|
} else if nodeType == "pmg" {
|
|
m.removeFailedPMGInstance(instanceName)
|
|
}
|
|
|
|
// Reset the counter since we've removed the node
|
|
delete(m.authFailures, nodeID)
|
|
delete(m.lastAuthAttempt, nodeID)
|
|
}
|
|
}
|
|
|
|
// resetAuthFailures resets the failure count for a node after successful auth
|
|
func (m *Monitor) resetAuthFailures(instanceName string, nodeType string) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
nodeID := instanceName
|
|
if nodeType != "" {
|
|
nodeID = nodeType + "-" + instanceName
|
|
}
|
|
|
|
if count, exists := m.authFailures[nodeID]; exists && count > 0 {
|
|
log.Info().
|
|
Str("node", nodeID).
|
|
Int("previousFailures", count).
|
|
Msg("Authentication succeeded, resetting failure count")
|
|
|
|
delete(m.authFailures, nodeID)
|
|
delete(m.lastAuthAttempt, nodeID)
|
|
}
|
|
}
|
|
|
|
// removeFailedPVENode updates a PVE node to show failed authentication status
|
|
func (m *Monitor) removeFailedPVENode(instanceName string) {
|
|
// Get instance config to get host URL
|
|
var hostURL string
|
|
for _, cfg := range m.config.PVEInstances {
|
|
if cfg.Name == instanceName {
|
|
hostURL = cfg.Host
|
|
break
|
|
}
|
|
}
|
|
|
|
// Create a failed node entry to show in UI with error status
|
|
failedNode := models.Node{
|
|
ID: instanceName + "-failed",
|
|
Name: instanceName,
|
|
DisplayName: instanceName,
|
|
Instance: instanceName,
|
|
Host: hostURL, // Include host URL even for failed nodes
|
|
Status: "offline",
|
|
Type: "node",
|
|
ConnectionHealth: "error",
|
|
LastSeen: time.Now(),
|
|
// Set other fields to zero values to indicate no data
|
|
CPU: 0,
|
|
Memory: models.Memory{},
|
|
Disk: models.Disk{},
|
|
}
|
|
|
|
// Update with just the failed node
|
|
m.state.UpdateNodesForInstance(instanceName, []models.Node{failedNode})
|
|
|
|
// Remove all other resources associated with this instance
|
|
m.state.UpdateVMsForInstance(instanceName, []models.VM{})
|
|
m.state.UpdateContainersForInstance(instanceName, []models.Container{})
|
|
m.state.UpdateStorageForInstance(instanceName, []models.Storage{})
|
|
m.state.UpdateCephClustersForInstance(instanceName, []models.CephCluster{})
|
|
m.state.UpdateBackupTasksForInstance(instanceName, []models.BackupTask{})
|
|
m.state.UpdateStorageBackupsForInstance(instanceName, []models.StorageBackup{})
|
|
m.state.UpdateGuestSnapshotsForInstance(instanceName, []models.GuestSnapshot{})
|
|
|
|
// Set connection health to false
|
|
m.state.SetConnectionHealth(instanceName, false)
|
|
}
|
|
|
|
// removeFailedPBSNode removes a PBS node and all its resources from state
|
|
func (m *Monitor) removeFailedPBSNode(instanceName string) {
|
|
// Remove PBS instance by passing empty array
|
|
currentInstances := m.state.PBSInstances
|
|
var updatedInstances []models.PBSInstance
|
|
for _, inst := range currentInstances {
|
|
if inst.Name != instanceName {
|
|
updatedInstances = append(updatedInstances, inst)
|
|
}
|
|
}
|
|
m.state.UpdatePBSInstances(updatedInstances)
|
|
|
|
// Remove PBS backups
|
|
m.state.UpdatePBSBackups(instanceName, []models.PBSBackup{})
|
|
|
|
// Set connection health to false
|
|
m.state.SetConnectionHealth("pbs-"+instanceName, false)
|
|
}
|
|
|
|
// removeFailedPMGInstance removes PMG data from state when authentication fails repeatedly
|
|
func (m *Monitor) removeFailedPMGInstance(instanceName string) {
|
|
currentInstances := m.state.PMGInstances
|
|
updated := make([]models.PMGInstance, 0, len(currentInstances))
|
|
for _, inst := range currentInstances {
|
|
if inst.Name != instanceName {
|
|
updated = append(updated, inst)
|
|
}
|
|
}
|
|
|
|
m.state.UpdatePMGInstances(updated)
|
|
m.state.UpdatePMGBackups(instanceName, nil)
|
|
m.state.SetConnectionHealth("pmg-"+instanceName, false)
|
|
}
|
|
|
|
type pbsBackupGroupKey struct {
|
|
datastore string
|
|
namespace string
|
|
backupType string
|
|
backupID string
|
|
}
|
|
|
|
type cachedPBSGroup struct {
|
|
snapshots []models.PBSBackup
|
|
latest time.Time
|
|
}
|
|
|
|
type pbsBackupFetchRequest struct {
|
|
datastore string
|
|
namespace string
|
|
group pbs.BackupGroup
|
|
cached cachedPBSGroup
|
|
}

// pollPBSBackups fetches all backups from PBS datastores
func (m *Monitor) pollPBSBackups(ctx context.Context, instanceName string, client *pbs.Client, datastores []models.PBSDatastore) {
	log.Debug().Str("instance", instanceName).Msg("Polling PBS backups")

	// Cache existing PBS backups so we can avoid redundant API calls when no changes occurred.
	existingGroups := m.buildPBSBackupCache(instanceName)

	var allBackups []models.PBSBackup
	datastoreCount := len(datastores) // Number of datastores to query
	datastoreFetches := 0             // Number of successful datastore fetches
	datastoreErrors := 0              // Number of failed datastore fetches

	// Process each datastore
	for _, ds := range datastores {
		if ctx.Err() != nil {
			log.Warn().
				Str("instance", instanceName).
				Msg("PBS backup polling cancelled before completion")
			return
		}

		namespacePaths := namespacePathsForDatastore(ds)

		log.Info().
			Str("instance", instanceName).
			Str("datastore", ds.Name).
			Int("namespaces", len(namespacePaths)).
			Strs("namespace_paths", namespacePaths).
			Msg("Processing datastore namespaces")

		datastoreHadSuccess := false
		groupsReused := 0
		groupsRequested := 0

		for _, namespace := range namespacePaths {
			if ctx.Err() != nil {
				log.Warn().
					Str("instance", instanceName).
					Msg("PBS backup polling cancelled mid-datastore")
				return
			}

			groups, err := client.ListBackupGroups(ctx, ds.Name, namespace)
			if err != nil {
				log.Error().
					Err(err).
					Str("instance", instanceName).
					Str("datastore", ds.Name).
					Str("namespace", namespace).
					Msg("Failed to list PBS backup groups")
				continue
			}

			datastoreHadSuccess = true
			requests := make([]pbsBackupFetchRequest, 0, len(groups))

			for _, group := range groups {
				key := pbsBackupGroupKey{
					datastore:  ds.Name,
					namespace:  namespace,
					backupType: group.BackupType,
					backupID:   group.BackupID,
				}
				cached := existingGroups[key]

				// Group deleted (no backups left) - ensure cached data is dropped.
				if group.BackupCount == 0 {
					continue
				}

				lastBackupTime := time.Unix(group.LastBackup, 0)
				hasCachedData := len(cached.snapshots) > 0
				// Only re-fetch when the backup count changes or the most recent backup is newer.
				if hasCachedData &&
					len(cached.snapshots) == group.BackupCount &&
					!lastBackupTime.After(cached.latest) {

					allBackups = append(allBackups, cached.snapshots...)
					groupsReused++
					continue
				}

				requests = append(requests, pbsBackupFetchRequest{
					datastore: ds.Name,
					namespace: namespace,
					group:     group,
					cached:    cached,
				})
			}

			if len(requests) == 0 {
				continue
			}

			groupsRequested += len(requests)
			fetched := m.fetchPBSBackupSnapshots(ctx, client, instanceName, requests)
			if len(fetched) > 0 {
				allBackups = append(allBackups, fetched...)
			}
		}

		if datastoreHadSuccess {
			datastoreFetches++
			log.Info().
				Str("instance", instanceName).
				Str("datastore", ds.Name).
				Int("namespaces", len(namespacePaths)).
				Int("groups_reused", groupsReused).
				Int("groups_refreshed", groupsRequested).
				Msg("PBS datastore processed")
		} else {
			// Preserve cached data for this datastore if we couldn't fetch anything new.
			log.Warn().
				Str("instance", instanceName).
				Str("datastore", ds.Name).
				Msg("No namespaces succeeded for PBS datastore; using cached backups")
			for key, entry := range existingGroups {
				if key.datastore != ds.Name || len(entry.snapshots) == 0 {
					continue
				}
				allBackups = append(allBackups, entry.snapshots...)
			}
			datastoreErrors++
		}
	}

	log.Info().
		Str("instance", instanceName).
		Int("count", len(allBackups)).
		Msg("PBS backups fetched")

	// Decide whether to keep existing backups when all queries failed
	if shouldPreservePBSBackups(datastoreCount, datastoreFetches) {
		log.Warn().
			Str("instance", instanceName).
			Int("datastores", datastoreCount).
			Int("errors", datastoreErrors).
			Msg("All PBS datastore queries failed; keeping previous backup list")
		return
	}

	// Update state
	m.state.UpdatePBSBackups(instanceName, allBackups)

	if m.alertManager != nil {
		snapshot := m.state.GetSnapshot()
		guestsByKey, guestsByVMID := buildGuestLookups(snapshot, m.guestMetadataStore)
		pveStorage := snapshot.Backups.PVE.StorageBackups
		if len(pveStorage) == 0 && len(snapshot.PVEBackups.StorageBackups) > 0 {
			pveStorage = snapshot.PVEBackups.StorageBackups
		}
		pbsBackups := snapshot.Backups.PBS
		if len(pbsBackups) == 0 && len(snapshot.PBSBackups) > 0 {
			pbsBackups = snapshot.PBSBackups
		}
		pmgBackups := snapshot.Backups.PMG
		if len(pmgBackups) == 0 && len(snapshot.PMGBackups) > 0 {
			pmgBackups = snapshot.PMGBackups
		}
		m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)
	}
}
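
// Illustrative sketch only, not called by the monitor: the reuse test applied to
// each backup group inside pollPBSBackups, extracted for clarity. A cached group
// is reused when it has data, its snapshot count matches the count reported for
// the group, and the group's most recent backup is not newer than the cached one.
// The function name is hypothetical.
func exampleShouldReusePBSGroup(cached cachedPBSGroup, group pbs.BackupGroup) bool {
	lastBackupTime := time.Unix(group.LastBackup, 0)
	return len(cached.snapshots) > 0 &&
		len(cached.snapshots) == group.BackupCount &&
		!lastBackupTime.After(cached.latest)
}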

func (m *Monitor) buildPBSBackupCache(instanceName string) map[pbsBackupGroupKey]cachedPBSGroup {
	snapshot := m.state.GetSnapshot()
	cache := make(map[pbsBackupGroupKey]cachedPBSGroup)
	for _, backup := range snapshot.PBSBackups {
		if backup.Instance != instanceName {
			continue
		}
		key := pbsBackupGroupKey{
			datastore:  backup.Datastore,
			namespace:  normalizePBSNamespacePath(backup.Namespace),
			backupType: backup.BackupType,
			backupID:   backup.VMID,
		}
		entry := cache[key]
		entry.snapshots = append(entry.snapshots, backup)
		if backup.BackupTime.After(entry.latest) {
			entry.latest = backup.BackupTime
		}
		cache[key] = entry
	}
	return cache
}

func normalizePBSNamespacePath(ns string) string {
	if ns == "/" {
		return ""
	}
	return ns
}

func namespacePathsForDatastore(ds models.PBSDatastore) []string {
	if len(ds.Namespaces) == 0 {
		return []string{""}
	}

	seen := make(map[string]struct{}, len(ds.Namespaces))
	var paths []string
	for _, ns := range ds.Namespaces {
		path := normalizePBSNamespacePath(ns.Path)
		if _, ok := seen[path]; ok {
			continue
		}
		seen[path] = struct{}{}
		paths = append(paths, path)
	}
	if len(paths) == 0 {
		paths = append(paths, "")
	}
	return paths
}
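
// Illustrative sketch only, not called by the monitor: namespace paths as PBS may
// report them versus the normalized form used for cache keys. The root namespace
// can arrive as "/" and is normalized to the empty string; nested paths pass
// through unchanged. The sample values and function name are hypothetical.
func exampleNormalizedNamespacePaths() []string {
	raw := []string{"/", "", "prod", "prod/db"}
	normalized := make([]string, 0, len(raw))
	for _, ns := range raw {
		normalized = append(normalized, normalizePBSNamespacePath(ns))
	}
	return normalized // "", "", "prod", "prod/db"
}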

func (m *Monitor) fetchPBSBackupSnapshots(ctx context.Context, client *pbs.Client, instanceName string, requests []pbsBackupFetchRequest) []models.PBSBackup {
	if len(requests) == 0 {
		return nil
	}

	results := make(chan []models.PBSBackup, len(requests))
	var wg sync.WaitGroup
	sem := make(chan struct{}, 5)

	for _, req := range requests {
		req := req
		wg.Add(1)
		go func() {
			defer wg.Done()

			select {
			case sem <- struct{}{}:
			case <-ctx.Done():
				return
			}
			defer func() { <-sem }()

			log.Debug().
				Str("instance", instanceName).
				Str("datastore", req.datastore).
				Str("namespace", req.namespace).
				Str("type", req.group.BackupType).
				Str("id", req.group.BackupID).
				Msg("Refreshing PBS backup group")

			snapshots, err := client.ListBackupSnapshots(ctx, req.datastore, req.namespace, req.group.BackupType, req.group.BackupID)
			if err != nil {
				log.Error().
					Err(err).
					Str("instance", instanceName).
					Str("datastore", req.datastore).
					Str("namespace", req.namespace).
					Str("type", req.group.BackupType).
					Str("id", req.group.BackupID).
					Msg("Failed to list PBS backup snapshots")

				if len(req.cached.snapshots) > 0 {
					results <- req.cached.snapshots
				}
				return
			}

			results <- convertPBSSnapshots(instanceName, req.datastore, req.namespace, snapshots)
		}()
	}

	go func() {
		wg.Wait()
		close(results)
	}()

	var combined []models.PBSBackup
	for backups := range results {
		if len(backups) == 0 {
			continue
		}
		combined = append(combined, backups...)
	}

	return combined
}
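
// Illustrative sketch only, not called by the monitor: the bounded fan-out pattern
// used by fetchPBSBackupSnapshots, reduced to its essentials. A buffered channel
// acts as a semaphore capping concurrency at limit, and the results channel is
// closed once every worker finishes so the collector loop can terminate. The
// function name and the func-valued work items are hypothetical.
func exampleBoundedFanOut(ctx context.Context, limit int, work []func(context.Context) []models.PBSBackup) []models.PBSBackup {
	results := make(chan []models.PBSBackup, len(work))
	sem := make(chan struct{}, limit)
	var wg sync.WaitGroup
	for _, job := range work {
		job := job
		wg.Add(1)
		go func() {
			defer wg.Done()
			select {
			case sem <- struct{}{}: // acquire a concurrency slot
			case <-ctx.Done():
				return
			}
			defer func() { <-sem }() // release the slot
			results <- job(ctx)
		}()
	}
	go func() {
		wg.Wait()
		close(results)
	}()
	var combined []models.PBSBackup
	for batch := range results {
		combined = append(combined, batch...)
	}
	return combined
}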

func convertPBSSnapshots(instanceName, datastore, namespace string, snapshots []pbs.BackupSnapshot) []models.PBSBackup {
	backups := make([]models.PBSBackup, 0, len(snapshots))
	for _, snapshot := range snapshots {
		backupTime := time.Unix(snapshot.BackupTime, 0)
		id := fmt.Sprintf("pbs-%s-%s-%s-%s-%s-%d",
			instanceName, datastore, namespace,
			snapshot.BackupType, snapshot.BackupID,
			snapshot.BackupTime)

		var fileNames []string
		for _, file := range snapshot.Files {
			switch f := file.(type) {
			case string:
				fileNames = append(fileNames, f)
			case map[string]interface{}:
				if filename, ok := f["filename"].(string); ok {
					fileNames = append(fileNames, filename)
				}
			}
		}

		verified := false
		if snapshot.Verification != nil {
			switch v := snapshot.Verification.(type) {
			case string:
				verified = v == "ok"
			case map[string]interface{}:
				if state, ok := v["state"].(string); ok {
					verified = state == "ok"
				}
			}

			log.Debug().
				Str("vmid", snapshot.BackupID).
				Int64("time", snapshot.BackupTime).
				Interface("verification", snapshot.Verification).
				Bool("verified", verified).
				Msg("PBS backup verification status")
		}

		backups = append(backups, models.PBSBackup{
			ID:         id,
			Instance:   instanceName,
			Datastore:  datastore,
			Namespace:  namespace,
			BackupType: snapshot.BackupType,
			VMID:       snapshot.BackupID,
			BackupTime: backupTime,
			Size:       snapshot.Size,
			Protected:  snapshot.Protected,
			Verified:   verified,
			Comment:    snapshot.Comment,
			Files:      fileNames,
			Owner:      snapshot.Owner,
		})
	}

	return backups
}
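
// Illustrative sketch only, not called by the monitor: the two shapes of the PBS
// verification field handled by convertPBSSnapshots, exercised with hypothetical
// values. A bare string or an object with a "state" key counts as verified only
// when the value is "ok". The function name is hypothetical.
func examplePBSVerificationStates() map[string]bool {
	samples := map[string]interface{}{
		"string ok":     "ok",
		"string failed": "failed",
		"object ok":     map[string]interface{}{"state": "ok"},
		"object failed": map[string]interface{}{"state": "failed"},
	}
	verified := make(map[string]bool, len(samples))
	for name, value := range samples {
		ok := false
		switch v := value.(type) {
		case string:
			ok = v == "ok"
		case map[string]interface{}:
			if state, isString := v["state"].(string); isString {
				ok = state == "ok"
			}
		}
		verified[name] = ok
	}
	return verified
}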

// checkMockAlerts checks alerts for mock data
func (m *Monitor) checkMockAlerts() {
	defer recoverFromPanic("checkMockAlerts")

	log.Info().Bool("mockEnabled", mock.IsMockEnabled()).Msg("checkMockAlerts called")
	if !mock.IsMockEnabled() {
		log.Info().Msg("Mock mode not enabled, skipping mock alert check")
		return
	}

	// Get mock state
	state := mock.GetMockState()

	log.Info().
		Int("vms", len(state.VMs)).
		Int("containers", len(state.Containers)).
		Int("nodes", len(state.Nodes)).
		Msg("Checking alerts for mock data")

	// Clean up alerts for nodes that no longer exist
	existingNodes := make(map[string]bool)
	for _, node := range state.Nodes {
		existingNodes[node.Name] = true
		if node.Host != "" {
			existingNodes[node.Host] = true
		}
	}
	for _, pbsInst := range state.PBSInstances {
		existingNodes[pbsInst.Name] = true
		existingNodes["pbs-"+pbsInst.Name] = true
		if pbsInst.Host != "" {
			existingNodes[pbsInst.Host] = true
		}
	}
	log.Info().
		Int("trackedNodes", len(existingNodes)).
		Msg("Collecting resources for alert cleanup in mock mode")
	m.alertManager.CleanupAlertsForNodes(existingNodes)

	guestsByKey, guestsByVMID := buildGuestLookups(state, m.guestMetadataStore)
	pveStorage := state.Backups.PVE.StorageBackups
	if len(pveStorage) == 0 && len(state.PVEBackups.StorageBackups) > 0 {
		pveStorage = state.PVEBackups.StorageBackups
	}
	pbsBackups := state.Backups.PBS
	if len(pbsBackups) == 0 && len(state.PBSBackups) > 0 {
		pbsBackups = state.PBSBackups
	}
	pmgBackups := state.Backups.PMG
	if len(pmgBackups) == 0 && len(state.PMGBackups) > 0 {
		pmgBackups = state.PMGBackups
	}
	m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)

	// Limit how many guests we check per cycle to prevent blocking with large datasets
	const maxGuestsPerCycle = 50
	guestsChecked := 0

	// Check alerts for VMs (up to limit)
	for _, vm := range state.VMs {
		if guestsChecked >= maxGuestsPerCycle {
			log.Debug().
				Int("checked", guestsChecked).
				Int("total", len(state.VMs)+len(state.Containers)).
				Msg("Reached guest check limit for this cycle")
			break
		}
		m.alertManager.CheckGuest(vm, "mock")
		guestsChecked++
	}

	// Check alerts for containers (if we haven't hit the limit)
	for _, container := range state.Containers {
		if guestsChecked >= maxGuestsPerCycle {
			break
		}
		m.alertManager.CheckGuest(container, "mock")
		guestsChecked++
	}

	// Check alerts for each node
	for _, node := range state.Nodes {
		m.alertManager.CheckNode(node)
	}

	// Check alerts for storage
	log.Info().Int("storageCount", len(state.Storage)).Msg("Checking storage alerts")
	for _, storage := range state.Storage {
		log.Debug().
			Str("name", storage.Name).
			Float64("usage", storage.Usage).
			Msg("Checking storage for alerts")
		m.alertManager.CheckStorage(storage)
	}

	// Check alerts for PBS instances
	log.Info().Int("pbsCount", len(state.PBSInstances)).Msg("Checking PBS alerts")
	for _, pbsInst := range state.PBSInstances {
		m.alertManager.CheckPBS(pbsInst)
	}

	// Check alerts for PMG instances
	log.Info().Int("pmgCount", len(state.PMGInstances)).Msg("Checking PMG alerts")
	for _, pmgInst := range state.PMGInstances {
		m.alertManager.CheckPMG(pmgInst)
	}

	// Cache the latest alert snapshots directly in the mock data so the API can serve
	// mock state without needing to grab the alert manager lock again.
	mock.UpdateAlertSnapshots(m.alertManager.GetActiveAlerts(), m.alertManager.GetRecentlyResolved())
}
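
// Illustrative sketch only, not called by the monitor: the "prefer the unified
// Backups field, fall back to the legacy field" selection used for backup
// collections in both pollPBSBackups and checkMockAlerts, shown here for the PBS
// slice. The function name is hypothetical.
func examplePreferUnifiedPBSBackups(unified, legacy []models.PBSBackup) []models.PBSBackup {
	if len(unified) == 0 && len(legacy) > 0 {
		return legacy
	}
	return unified
}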

func isLegacyHostAgent(agentType string) bool {
	// The unified agent reports type="unified"; legacy standalone agents send an
	// empty (or otherwise unrecognized) type, so anything else is treated as legacy.
	return agentType != "unified"
}

func isLegacyDockerAgent(agentType string) bool {
	// The unified agent reports type="unified"; legacy standalone agents send an
	// empty (or otherwise unrecognized) type, so anything else is treated as legacy.
	return agentType != "unified"
}
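
// Illustrative sketch only, not called by the monitor: how the type-based check
// classifies report payloads. Only "unified" is treated as the unified agent; the
// other sample strings stand in for what legacy or unknown agents might send and
// are hypothetical, as is the function name.
func exampleClassifyAgentTypes() map[string]bool {
	samples := []string{"unified", "", "pulse-host-agent", "pulse-docker-agent"}
	legacy := make(map[string]bool, len(samples))
	for _, agentType := range samples {
		legacy[agentType] = isLegacyHostAgent(agentType)
	}
	return legacy // only "unified" maps to false
}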