Pulse/internal/monitoring/monitor.go
rcourtman 495e6c7945 feat: comprehensive diagnostics and observability improvements
Upgrade diagnostics infrastructure from 5/10 to 8/10 production readiness
with enhanced metrics, logging, and request correlation capabilities.

**Request Correlation**
- Wire request IDs through context in middleware
- Return X-Request-ID header in all API responses
- Enable downstream log correlation across the request lifecycle (sketched below)
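
A minimal sketch of how the request-ID wiring could look; the handler, context-key, and google/uuid usage are illustrative assumptions rather than the exact code in internal/api/middleware.go:

```go
package api

import (
	"context"
	"net/http"

	"github.com/google/uuid"
)

type ctxKey string

// requestIDKey is an illustrative context key name.
const requestIDKey ctxKey = "requestID"

// RequestIDMiddleware attaches a correlation ID to the request context and
// echoes it back to the caller in the X-Request-ID response header.
func RequestIDMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		id := r.Header.Get("X-Request-ID")
		if id == "" {
			id = uuid.NewString()
		}
		ctx := context.WithValue(r.Context(), requestIDKey, id)
		w.Header().Set("X-Request-ID", id)
		next.ServeHTTP(w, r.WithContext(ctx))
	})
}
```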

**HTTP/API Metrics** (18 new Prometheus metrics)
- pulse_http_request_duration_seconds - API latency histogram
- pulse_http_requests_total - request counter by method/route/status
- pulse_http_request_errors_total - error counter by type
- Path normalization to control label cardinality (see the sketch below)
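
A sketch of what the metric definitions and path normalization might look like; the label sets and the normalizePath regex are assumptions, and the real definitions live in internal/api/http_metrics.go:

```go
package api

import (
	"regexp"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var (
	// Latency histogram and request counter, labelled by method/route/status.
	httpRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
		Name: "pulse_http_request_duration_seconds",
		Help: "API request latency in seconds.",
	}, []string{"method", "route", "status"})

	httpRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "pulse_http_requests_total",
		Help: "Total API requests.",
	}, []string{"method", "route", "status"})
)

// idSegment collapses numeric and long hex/UUID-like path segments so that
// per-resource URLs do not explode label cardinality.
var idSegment = regexp.MustCompile(`/(\d+|[0-9a-fA-F-]{16,})`)

func normalizePath(path string) string {
	return idSegment.ReplaceAllString(path, "/:id")
}
```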

**Per-Node Poll Metrics**
- pulse_monitor_node_poll_duration_seconds - per-node timing
- pulse_monitor_node_poll_total - success/error counts per node
- pulse_monitor_node_poll_errors_total - error breakdown per node
- pulse_monitor_node_poll_last_success_timestamp - freshness tracking
- pulse_monitor_node_poll_staleness_seconds - age since last success
- Enables hotspot identification across multi-node deployments (example below)
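
A sketch of how the per-node series could be defined and fed from a poll attempt; the label names and the recordNodePoll helper are illustrative, and the staleness gauge would typically be derived from the last-success timestamp:

```go
package monitoring

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var (
	nodePollDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
		Name: "pulse_monitor_node_poll_duration_seconds",
		Help: "Per-node poll duration in seconds.",
	}, []string{"instance", "node"})

	nodePollTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "pulse_monitor_node_poll_total",
		Help: "Poll attempts per node by outcome.",
	}, []string{"instance", "node", "outcome"})

	nodePollLastSuccess = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Name: "pulse_monitor_node_poll_last_success_timestamp",
		Help: "Unix timestamp of the last successful poll per node.",
	}, []string{"instance", "node"})
)

// recordNodePoll shows how a single poll attempt could feed all three series.
func recordNodePoll(instance, node string, start time.Time, err error) {
	nodePollDuration.WithLabelValues(instance, node).Observe(time.Since(start).Seconds())
	outcome := "success"
	if err != nil {
		outcome = "error"
	}
	nodePollTotal.WithLabelValues(instance, node, outcome).Inc()
	if err == nil {
		nodePollLastSuccess.WithLabelValues(instance, node).SetToCurrentTime()
	}
}
```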

**Scheduler Health Metrics**
- pulse_scheduler_queue_due_soon - ready queue depth
- pulse_scheduler_queue_depth - queue depth by instance type
- pulse_scheduler_queue_wait_seconds - time in queue histogram
- pulse_scheduler_dead_letter_depth - failed task tracking
- pulse_scheduler_breaker_state - circuit breaker state
- pulse_scheduler_breaker_failure_count - consecutive failures
- pulse_scheduler_breaker_retry_seconds - time until retry
- Enable alerting on DLQ spikes, breaker opens, and queue backlogs (see the example below)
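
For the breaker gauge, one plausible encoding (an assumption, not necessarily the one used in internal/monitoring/metrics.go) maps states to numbers so an alert can simply match on a threshold:

```go
package monitoring

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// Illustrative numeric encoding: alert when the gauge holds 2 (open) for too long.
const (
	breakerClosed   = 0.0
	breakerHalfOpen = 1.0
	breakerOpen     = 2.0
)

var breakerStateGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
	Name: "pulse_scheduler_breaker_state",
	Help: "Circuit breaker state per instance (0=closed, 1=half-open, 2=open).",
}, []string{"instance"})

func setBreakerState(instance string, state float64) {
	breakerStateGauge.WithLabelValues(instance).Set(state)
}
```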

**Diagnostics Endpoint Caching**
- pulse_diagnostics_cache_hits_total - cache performance
- pulse_diagnostics_cache_misses_total - cache misses
- pulse_diagnostics_refresh_duration_seconds - probe timing
- 45-second TTL prevents thundering herd on /api/diagnostics
- Thread-safe with RWMutex
- X-Diagnostics-Cached-At header shows cache freshness (sketched below)
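
A rough sketch of the RWMutex-guarded TTL cache and the freshness header; the type and helper names here are illustrative, not the actual code in internal/api/diagnostics.go:

```go
package api

import (
	"net/http"
	"sync"
	"time"
)

const diagTTL = 45 * time.Second

// diagCache is an illustrative RWMutex-guarded snapshot cache.
type diagCache struct {
	mu        sync.RWMutex
	payload   []byte
	fetchedAt time.Time
}

func (c *diagCache) get(now time.Time) ([]byte, time.Time, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if c.payload == nil || now.Sub(c.fetchedAt) > diagTTL {
		return nil, time.Time{}, false
	}
	return c.payload, c.fetchedAt, true
}

func (c *diagCache) set(payload []byte, now time.Time) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.payload = payload
	c.fetchedAt = now
}

// writeCached shows the freshness header a handler could attach to cached responses.
func writeCached(w http.ResponseWriter, payload []byte, fetchedAt time.Time) {
	w.Header().Set("X-Diagnostics-Cached-At", fetchedAt.UTC().Format(time.RFC3339))
	w.Header().Set("Content-Type", "application/json")
	_, _ = w.Write(payload)
}
```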

**Debug Log Performance**
- Gate high-frequency debug logs behind IsLevelEnabled() checks
- Reduces CPU waste in production when debug logging is disabled
- Covers scheduler loops, poll cycles, and API handlers (pattern shown below)
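
The gating pattern itself is simple and appears throughout monitor.go below; a schematic example (the function name and fields are made up for illustration):

```go
package monitoring

import (
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/logging"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

// logPollCycleDetails is a made-up example: the level check skips building the
// event (and its fields) entirely when debug output is disabled.
func logPollCycleDetails(instance string, guests int, elapsed time.Duration) {
	if logging.IsLevelEnabled(zerolog.DebugLevel) {
		log.Debug().
			Str("instance", instance).
			Int("guests", guests).
			Dur("elapsed", elapsed).
			Msg("poll cycle details")
	}
}
```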

**Persistent Logging**
- File logging with automatic rotation
- LOG_FILE, LOG_MAX_SIZE, LOG_MAX_AGE, LOG_COMPRESS env vars
- MultiWriter sends logs to both stderr and file
- Gzip compression support for rotated logs (see the sketch below)
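
A sketch of how the environment-driven file logging could be assembled; using lumberjack for rotation is an assumption of this sketch, and the real internal/logging/logging.go may parse the variables differently:

```go
package logging

import (
	"io"
	"os"
	"strconv"

	"github.com/rs/zerolog"
	"gopkg.in/natefinch/lumberjack.v2"
)

// newFileLogger builds a logger that writes to stderr and, when LOG_FILE is set,
// to a rotating file as well.
func newFileLogger() zerolog.Logger {
	writers := []io.Writer{os.Stderr}
	if path := os.Getenv("LOG_FILE"); path != "" {
		maxSize, _ := strconv.Atoi(os.Getenv("LOG_MAX_SIZE")) // MiB per file
		maxAge, _ := strconv.Atoi(os.Getenv("LOG_MAX_AGE"))   // days to keep
		writers = append(writers, &lumberjack.Logger{
			Filename: path,
			MaxSize:  maxSize,
			MaxAge:   maxAge,
			Compress: os.Getenv("LOG_COMPRESS") == "true",
		})
	}
	return zerolog.New(io.MultiWriter(writers...)).With().Timestamp().Logger()
}
```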

Files modified:
- internal/api/diagnostics.go (caching layer)
- internal/api/middleware.go (request IDs, HTTP metrics)
- internal/api/http_metrics.go (NEW - HTTP metric definitions)
- internal/logging/logging.go (file logging with rotation)
- internal/monitoring/metrics.go (node + scheduler metrics)
- internal/monitoring/monitor.go (instrumentation, debug gating)

Impact: Dramatically improved production troubleshooting through per-node
visibility, scheduler health metrics, persistent logs, and cached
diagnostics, enabling faster incident response for multi-node deployments.
2025-10-21 12:37:39 +00:00

6583 lines
200 KiB
Go

package monitoring
import (
"context"
"encoding/json"
stderrors "errors"
"fmt"
"math"
"math/rand"
"net"
"net/url"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/discovery"
"github.com/rcourtman/pulse-go-rewrite/internal/errors"
"github.com/rcourtman/pulse-go-rewrite/internal/logging"
"github.com/rcourtman/pulse-go-rewrite/internal/mock"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/notifications"
"github.com/rcourtman/pulse-go-rewrite/internal/websocket"
agentsdocker "github.com/rcourtman/pulse-go-rewrite/pkg/agents/docker"
"github.com/rcourtman/pulse-go-rewrite/pkg/pbs"
"github.com/rcourtman/pulse-go-rewrite/pkg/pmg"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
)
// PVEClientInterface defines the interface for PVE clients (both regular and cluster)
type PVEClientInterface interface {
GetNodes(ctx context.Context) ([]proxmox.Node, error)
GetNodeStatus(ctx context.Context, node string) (*proxmox.NodeStatus, error)
GetNodeRRDData(ctx context.Context, node string, timeframe string, cf string, ds []string) ([]proxmox.NodeRRDPoint, error)
GetVMs(ctx context.Context, node string) ([]proxmox.VM, error)
GetContainers(ctx context.Context, node string) ([]proxmox.Container, error)
GetStorage(ctx context.Context, node string) ([]proxmox.Storage, error)
GetAllStorage(ctx context.Context) ([]proxmox.Storage, error)
GetBackupTasks(ctx context.Context) ([]proxmox.Task, error)
GetStorageContent(ctx context.Context, node, storage string) ([]proxmox.StorageContent, error)
GetVMSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error)
GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error)
GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error)
GetContainerStatus(ctx context.Context, node string, vmid int) (*proxmox.Container, error)
GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error)
IsClusterMember(ctx context.Context) (bool, error)
GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error)
GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error)
GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error)
GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error)
GetZFSPoolsWithDetails(ctx context.Context, node string) ([]proxmox.ZFSPoolInfo, error)
GetDisks(ctx context.Context, node string) ([]proxmox.Disk, error)
GetCephStatus(ctx context.Context) (*proxmox.CephStatus, error)
GetCephDF(ctx context.Context) (*proxmox.CephDF, error)
}
func getNodeDisplayName(instance *config.PVEInstance, nodeName string) string {
baseName := strings.TrimSpace(nodeName)
if baseName == "" {
baseName = "unknown-node"
}
if instance == nil {
return baseName
}
friendly := strings.TrimSpace(instance.Name)
if instance.IsCluster {
if endpointLabel := lookupClusterEndpointLabel(instance, nodeName); endpointLabel != "" {
return endpointLabel
}
if baseName != "" && baseName != "unknown-node" {
return baseName
}
if friendly != "" {
return friendly
}
return baseName
}
if friendly != "" {
return friendly
}
if baseName != "" && baseName != "unknown-node" {
return baseName
}
if label := normalizeEndpointHost(instance.Host); label != "" && !isLikelyIPAddress(label) {
return label
}
return baseName
}
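// mergeNVMeTempsIntoDisks copies node-level NVMe temperature readings onto the matching
// NVMe physical disks. Readings and disks are paired positionally after sorting each side
// (temps by device name, disks by device path); disks without a reading are reset to 0.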
func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) []models.PhysicalDisk {
if len(disks) == 0 || len(nodes) == 0 {
return disks
}
nvmeTempsByNode := make(map[string][]models.NVMeTemp)
for _, node := range nodes {
if node.Temperature == nil || !node.Temperature.Available || len(node.Temperature.NVMe) == 0 {
continue
}
temps := make([]models.NVMeTemp, len(node.Temperature.NVMe))
copy(temps, node.Temperature.NVMe)
sort.Slice(temps, func(i, j int) bool {
return temps[i].Device < temps[j].Device
})
nvmeTempsByNode[node.Name] = temps
}
if len(nvmeTempsByNode) == 0 {
return disks
}
updated := make([]models.PhysicalDisk, len(disks))
copy(updated, disks)
disksByNode := make(map[string][]int)
for i := range updated {
if strings.EqualFold(updated[i].Type, "nvme") {
disksByNode[updated[i].Node] = append(disksByNode[updated[i].Node], i)
}
}
for nodeName, diskIndexes := range disksByNode {
temps, ok := nvmeTempsByNode[nodeName]
if !ok || len(temps) == 0 {
for _, idx := range diskIndexes {
updated[idx].Temperature = 0
}
continue
}
sort.Slice(diskIndexes, func(i, j int) bool {
return updated[diskIndexes[i]].DevPath < updated[diskIndexes[j]].DevPath
})
for _, idx := range diskIndexes {
updated[idx].Temperature = 0
}
for idx, diskIdx := range diskIndexes {
if idx >= len(temps) {
break
}
tempVal := temps[idx].Temp
if tempVal <= 0 || math.IsNaN(tempVal) {
continue
}
updated[diskIdx].Temperature = int(math.Round(tempVal))
}
}
return updated
}
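// lookupClusterEndpointLabel returns a display label for a cluster node by matching it to a
// configured cluster endpoint, preferring a non-IP hostname, then the endpoint node name, then its IP.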
func lookupClusterEndpointLabel(instance *config.PVEInstance, nodeName string) string {
if instance == nil {
return ""
}
for _, endpoint := range instance.ClusterEndpoints {
if !strings.EqualFold(endpoint.NodeName, nodeName) {
continue
}
if host := strings.TrimSpace(endpoint.Host); host != "" {
if label := normalizeEndpointHost(host); label != "" && !isLikelyIPAddress(label) {
return label
}
}
if nodeNameLabel := strings.TrimSpace(endpoint.NodeName); nodeNameLabel != "" {
return nodeNameLabel
}
if ip := strings.TrimSpace(endpoint.IP); ip != "" {
return ip
}
}
return ""
}
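// normalizeEndpointHost extracts the bare hostname from a URL or host:port string,
// stripping any scheme, path, and port.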
func normalizeEndpointHost(raw string) string {
value := strings.TrimSpace(raw)
if value == "" {
return ""
}
if parsed, err := url.Parse(value); err == nil && parsed.Host != "" {
host := parsed.Hostname()
if host != "" {
return host
}
return parsed.Host
}
value = strings.TrimPrefix(value, "https://")
value = strings.TrimPrefix(value, "http://")
value = strings.TrimSpace(value)
if value == "" {
return ""
}
if idx := strings.Index(value, "/"); idx >= 0 {
value = strings.TrimSpace(value[:idx])
}
if idx := strings.Index(value, ":"); idx >= 0 {
value = strings.TrimSpace(value[:idx])
}
return value
}
func isLikelyIPAddress(value string) bool {
if value == "" {
return false
}
if ip := net.ParseIP(value); ip != nil {
return true
}
// Handle IPv6 with zone identifier (fe80::1%eth0)
if i := strings.Index(value, "%"); i > 0 {
if ip := net.ParseIP(value[:i]); ip != nil {
return true
}
}
return false
}
// PollExecutor defines the contract for executing polling tasks.
type PollExecutor interface {
Execute(ctx context.Context, task PollTask)
}
type realExecutor struct {
monitor *Monitor
}
func newRealExecutor(m *Monitor) PollExecutor {
return &realExecutor{monitor: m}
}
func (r *realExecutor) Execute(ctx context.Context, task PollTask) {
if r == nil || r.monitor == nil {
return
}
switch strings.ToLower(task.InstanceType) {
case "pve":
if task.PVEClient == nil {
log.Warn().
Str("instance", task.InstanceName).
Msg("PollExecutor received nil PVE client")
return
}
r.monitor.pollPVEInstance(ctx, task.InstanceName, task.PVEClient)
case "pbs":
if task.PBSClient == nil {
log.Warn().
Str("instance", task.InstanceName).
Msg("PollExecutor received nil PBS client")
return
}
r.monitor.pollPBSInstance(ctx, task.InstanceName, task.PBSClient)
case "pmg":
if task.PMGClient == nil {
log.Warn().
Str("instance", task.InstanceName).
Msg("PollExecutor received nil PMG client")
return
}
r.monitor.pollPMGInstance(ctx, task.InstanceName, task.PMGClient)
default:
if logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().
Str("instance", task.InstanceName).
Str("type", task.InstanceType).
Msg("PollExecutor received unsupported task type")
}
}
}
type instanceInfo struct {
Key string
Type InstanceType
DisplayName string
Connection string
Metadata map[string]string
}
type pollStatus struct {
LastSuccess time.Time
LastErrorAt time.Time
LastErrorMessage string
LastErrorCategory string
ConsecutiveFailures int
FirstFailureAt time.Time
}
type dlqInsight struct {
Reason string
FirstAttempt time.Time
LastAttempt time.Time
RetryCount int
NextRetry time.Time
}
type ErrorDetail struct {
At time.Time `json:"at"`
Message string `json:"message"`
Category string `json:"category"`
}
type InstancePollStatus struct {
LastSuccess *time.Time `json:"lastSuccess,omitempty"`
LastError *ErrorDetail `json:"lastError,omitempty"`
ConsecutiveFailures int `json:"consecutiveFailures"`
FirstFailureAt *time.Time `json:"firstFailureAt,omitempty"`
}
type InstanceBreaker struct {
State string `json:"state"`
Since *time.Time `json:"since,omitempty"`
LastTransition *time.Time `json:"lastTransition,omitempty"`
RetryAt *time.Time `json:"retryAt,omitempty"`
FailureCount int `json:"failureCount"`
}
type InstanceDLQ struct {
Present bool `json:"present"`
Reason string `json:"reason,omitempty"`
FirstAttempt *time.Time `json:"firstAttempt,omitempty"`
LastAttempt *time.Time `json:"lastAttempt,omitempty"`
RetryCount int `json:"retryCount,omitempty"`
NextRetry *time.Time `json:"nextRetry,omitempty"`
}
type InstanceHealth struct {
Key string `json:"key"`
Type string `json:"type"`
DisplayName string `json:"displayName"`
Instance string `json:"instance"`
Connection string `json:"connection"`
PollStatus InstancePollStatus `json:"pollStatus"`
Breaker InstanceBreaker `json:"breaker"`
DeadLetter InstanceDLQ `json:"deadLetter"`
}
func schedulerKey(instanceType InstanceType, name string) string {
return string(instanceType) + "::" + name
}
func timePtr(t time.Time) *time.Time {
if t.IsZero() {
return nil
}
return &t
}
// Monitor handles all monitoring operations
type Monitor struct {
config *config.Config
state *models.State
pveClients map[string]PVEClientInterface
pbsClients map[string]*pbs.Client
pmgClients map[string]*pmg.Client
pollMetrics *PollMetrics
scheduler *AdaptiveScheduler
stalenessTracker *StalenessTracker
taskQueue *TaskQueue
circuitBreakers map[string]*circuitBreaker
deadLetterQueue *TaskQueue
failureCounts map[string]int
lastOutcome map[string]taskOutcome
backoffCfg backoffConfig
rng *rand.Rand
maxRetryAttempts int
tempCollector *TemperatureCollector // SSH-based temperature collector
mu sync.RWMutex
startTime time.Time
rateTracker *RateTracker
metricsHistory *MetricsHistory
alertManager *alerts.Manager
notificationMgr *notifications.NotificationManager
configPersist *config.ConfigPersistence
discoveryService *discovery.Service // Background discovery service
activePollCount int32 // Number of active polling operations
pollCounter int64 // Counter for polling cycles
authFailures map[string]int // Track consecutive auth failures per node
lastAuthAttempt map[string]time.Time // Track last auth attempt time
lastClusterCheck map[string]time.Time // Track last cluster check for standalone nodes
lastPhysicalDiskPoll map[string]time.Time // Track last physical disk poll time per instance
lastPVEBackupPoll map[string]time.Time // Track last PVE backup poll per instance
lastPBSBackupPoll map[string]time.Time // Track last PBS backup poll per instance
persistence *config.ConfigPersistence // Add persistence for saving updated configs
pbsBackupPollers map[string]bool // Track PBS backup polling goroutines per instance
runtimeCtx context.Context // Context used while monitor is running
wsHub *websocket.Hub // Hub used for broadcasting state
diagMu sync.RWMutex // Protects diagnostic snapshot maps
nodeSnapshots map[string]NodeMemorySnapshot
guestSnapshots map[string]GuestMemorySnapshot
rrdCacheMu sync.RWMutex // Protects RRD memavailable cache
nodeRRDMemCache map[string]rrdMemCacheEntry
removedDockerHosts map[string]time.Time // Track deliberately removed Docker hosts (ID -> removal time)
dockerCommands map[string]*dockerHostCommand
dockerCommandIndex map[string]string
guestMetadataMu sync.RWMutex
guestMetadataCache map[string]guestMetadataCacheEntry
executor PollExecutor
breakerBaseRetry time.Duration
breakerMaxDelay time.Duration
breakerHalfOpenWindow time.Duration
instanceInfoCache map[string]*instanceInfo
pollStatusMap map[string]*pollStatus
dlqInsightMap map[string]*dlqInsight
}
type rrdMemCacheEntry struct {
value uint64
fetchedAt time.Time
}
// safePercentage calculates percentage safely, returning 0 if divisor is 0
func safePercentage(used, total float64) float64 {
if total == 0 {
return 0
}
result := used / total * 100
if math.IsNaN(result) || math.IsInf(result, 0) {
return 0
}
return result
}
// maxInt64 returns the maximum of two int64 values
func maxInt64(a, b int64) int64 {
if a > b {
return a
}
return b
}
// safeFloat ensures a float value is not NaN or Inf
func safeFloat(val float64) float64 {
if math.IsNaN(val) || math.IsInf(val, 0) {
return 0
}
return val
}
// shouldRunBackupPoll determines whether a backup polling cycle should execute.
// Returns whether polling should run, a human-readable skip reason, and the timestamp to record.
func (m *Monitor) shouldRunBackupPoll(last time.Time, now time.Time) (bool, string, time.Time) {
if m == nil || m.config == nil {
return false, "configuration unavailable", last
}
if !m.config.EnableBackupPolling {
return false, "backup polling globally disabled", last
}
interval := m.config.BackupPollingInterval
if interval > 0 {
if !last.IsZero() && now.Sub(last) < interval {
next := last.Add(interval)
return false, fmt.Sprintf("next run scheduled for %s", next.Format(time.RFC3339)), last
}
return true, "", now
}
backupCycles := m.config.BackupPollingCycles
if backupCycles <= 0 {
backupCycles = 10
}
if m.pollCounter%int64(backupCycles) == 0 || m.pollCounter == 1 {
return true, "", now
}
remaining := int64(backupCycles) - (m.pollCounter % int64(backupCycles))
if remaining <= 0 {
remaining = int64(backupCycles)
}
return false, fmt.Sprintf("next run in %d polling cycles", remaining), last
}
const (
dockerConnectionPrefix = "docker-"
dockerOfflineGraceMultiplier = 4
dockerMinimumHealthWindow = 30 * time.Second
dockerMaximumHealthWindow = 10 * time.Minute
nodeRRDCacheTTL = 30 * time.Second
nodeRRDRequestTimeout = 2 * time.Second
guestMetadataCacheTTL = 5 * time.Minute
)
type guestMetadataCacheEntry struct {
ipAddresses []string
networkInterfaces []models.GuestNetworkInterface
osName string
osVersion string
fetchedAt time.Time
}
type taskOutcome struct {
success bool
transient bool
err error
recordedAt time.Time
}
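// getNodeRRDMemAvailable returns the most recent RRD "memavailable" sample for a node,
// clamped to the node's memory total and cached for nodeRRDCacheTTL to limit RRD requests.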
func (m *Monitor) getNodeRRDMemAvailable(ctx context.Context, client PVEClientInterface, nodeName string) (uint64, error) {
if client == nil || nodeName == "" {
return 0, fmt.Errorf("invalid arguments for RRD lookup")
}
now := time.Now()
m.rrdCacheMu.RLock()
if entry, ok := m.nodeRRDMemCache[nodeName]; ok && now.Sub(entry.fetchedAt) < nodeRRDCacheTTL {
m.rrdCacheMu.RUnlock()
return entry.value, nil
}
m.rrdCacheMu.RUnlock()
requestCtx, cancel := context.WithTimeout(ctx, nodeRRDRequestTimeout)
defer cancel()
points, err := client.GetNodeRRDData(requestCtx, nodeName, "hour", "AVERAGE", []string{"memavailable", "memtotal"})
if err != nil {
return 0, err
}
var memAvailable uint64
var memTotal uint64
for i := len(points) - 1; i >= 0; i-- {
point := points[i]
if point.MemTotal != nil && !math.IsNaN(*point.MemTotal) && *point.MemTotal > 0 {
memTotal = uint64(math.Round(*point.MemTotal))
}
if point.MemAvailable == nil || math.IsNaN(*point.MemAvailable) || *point.MemAvailable <= 0 {
continue
}
memAvailable = uint64(math.Round(*point.MemAvailable))
break
}
if memAvailable == 0 {
return 0, fmt.Errorf("rrd memavailable not present")
}
if memTotal > 0 && memAvailable > memTotal {
memAvailable = memTotal
}
m.rrdCacheMu.Lock()
m.nodeRRDMemCache[nodeName] = rrdMemCacheEntry{
value: memAvailable,
fetchedAt: now,
}
m.rrdCacheMu.Unlock()
return memAvailable, nil
}
// RemoveDockerHost removes a docker host from the shared state and clears related alerts.
func (m *Monitor) RemoveDockerHost(hostID string) (models.DockerHost, error) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.DockerHost{}, fmt.Errorf("docker host id is required")
}
host, removed := m.state.RemoveDockerHost(hostID)
if !removed {
if logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().Str("dockerHostID", hostID).Msg("Docker host not present in state during removal; proceeding to clear alerts")
}
host = models.DockerHost{
ID: hostID,
Hostname: hostID,
DisplayName: hostID,
}
}
// Track removal to prevent resurrection from cached reports
m.mu.Lock()
m.removedDockerHosts[hostID] = time.Now()
if cmd, ok := m.dockerCommands[hostID]; ok {
delete(m.dockerCommandIndex, cmd.status.ID)
}
delete(m.dockerCommands, hostID)
m.mu.Unlock()
m.state.RemoveConnectionHealth(dockerConnectionPrefix + hostID)
if m.alertManager != nil {
m.alertManager.HandleDockerHostRemoved(host)
m.SyncAlertState()
}
log.Info().
Str("dockerHost", host.Hostname).
Str("dockerHostID", hostID).
Bool("removed", removed).
Msg("Docker host removed and alerts cleared")
return host, nil
}
// HideDockerHost marks a docker host as hidden without removing it from state.
// Hidden hosts will not be shown in the frontend but will continue to accept updates.
func (m *Monitor) HideDockerHost(hostID string) (models.DockerHost, error) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.DockerHost{}, fmt.Errorf("docker host id is required")
}
host, ok := m.state.SetDockerHostHidden(hostID, true)
if !ok {
return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
}
log.Info().
Str("dockerHost", host.Hostname).
Str("dockerHostID", hostID).
Msg("Docker host hidden from view")
return host, nil
}
// UnhideDockerHost marks a docker host as visible again.
func (m *Monitor) UnhideDockerHost(hostID string) (models.DockerHost, error) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.DockerHost{}, fmt.Errorf("docker host id is required")
}
host, ok := m.state.SetDockerHostHidden(hostID, false)
if !ok {
return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
}
// Clear removal tracking if it was marked as removed
m.mu.Lock()
delete(m.removedDockerHosts, hostID)
m.mu.Unlock()
log.Info().
Str("dockerHost", host.Hostname).
Str("dockerHostID", hostID).
Msg("Docker host unhidden")
return host, nil
}
// MarkDockerHostPendingUninstall marks a docker host as pending uninstall.
// This is used when the user has run the uninstall command and is waiting for the host to go offline.
func (m *Monitor) MarkDockerHostPendingUninstall(hostID string) (models.DockerHost, error) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.DockerHost{}, fmt.Errorf("docker host id is required")
}
host, ok := m.state.SetDockerHostPendingUninstall(hostID, true)
if !ok {
return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
}
log.Info().
Str("dockerHost", host.Hostname).
Str("dockerHostID", hostID).
Msg("Docker host marked as pending uninstall")
return host, nil
}
// AllowDockerHostReenroll removes a host ID from the removal blocklist so it can report again.
func (m *Monitor) AllowDockerHostReenroll(hostID string) error {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return fmt.Errorf("docker host id is required")
}
m.mu.Lock()
defer m.mu.Unlock()
if _, exists := m.removedDockerHosts[hostID]; !exists {
log.Debug().
Str("dockerHostID", hostID).
Msg("Allow re-enroll requested for docker host that was not blocked")
return nil
}
delete(m.removedDockerHosts, hostID)
if cmd, exists := m.dockerCommands[hostID]; exists {
delete(m.dockerCommandIndex, cmd.status.ID)
delete(m.dockerCommands, hostID)
}
m.state.SetDockerHostCommand(hostID, nil)
log.Info().
Str("dockerHostID", hostID).
Msg("Docker host removal block cleared; host may report again")
return nil
}
// GetDockerHost retrieves a docker host by identifier if present in state.
func (m *Monitor) GetDockerHost(hostID string) (models.DockerHost, bool) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.DockerHost{}, false
}
hosts := m.state.GetDockerHosts()
for _, host := range hosts {
if host.ID == hostID {
return host, true
}
}
return models.DockerHost{}, false
}
// GetDockerHosts returns a point-in-time snapshot of all Docker hosts Pulse knows about.
func (m *Monitor) GetDockerHosts() []models.DockerHost {
if m == nil || m.state == nil {
return nil
}
return m.state.GetDockerHosts()
}
// QueueDockerHostStop queues a stop command for the specified docker host.
func (m *Monitor) QueueDockerHostStop(hostID string) (models.DockerHostCommandStatus, error) {
return m.queueDockerStopCommand(hostID)
}
// FetchDockerCommandForHost retrieves the next command payload (if any) for the host.
func (m *Monitor) FetchDockerCommandForHost(hostID string) (map[string]any, *models.DockerHostCommandStatus) {
return m.getDockerCommandPayload(hostID)
}
// AcknowledgeDockerHostCommand updates the lifecycle status for a docker host command.
func (m *Monitor) AcknowledgeDockerHostCommand(commandID, hostID, status, message string) (models.DockerHostCommandStatus, string, bool, error) {
return m.acknowledgeDockerCommand(commandID, hostID, status, message)
}
func tokenHintFromRecord(record *config.APITokenRecord) string {
if record == nil {
return ""
}
switch {
case record.Prefix != "" && record.Suffix != "":
return fmt.Sprintf("%s…%s", record.Prefix, record.Suffix)
case record.Prefix != "":
return record.Prefix + "…"
case record.Suffix != "":
return "…" + record.Suffix
default:
return ""
}
}
// ApplyDockerReport ingests a docker agent report into the shared state.
func (m *Monitor) ApplyDockerReport(report agentsdocker.Report, tokenRecord *config.APITokenRecord) (models.DockerHost, error) {
identifier := strings.TrimSpace(report.AgentKey())
if identifier == "" {
return models.DockerHost{}, fmt.Errorf("docker report missing agent identifier")
}
// Check if this host was deliberately removed - reject report to prevent resurrection
m.mu.RLock()
removedAt, wasRemoved := m.removedDockerHosts[identifier]
m.mu.RUnlock()
if wasRemoved {
log.Info().
Str("dockerHostID", identifier).
Time("removedAt", removedAt).
Msg("Rejecting report from deliberately removed Docker host")
return models.DockerHost{}, fmt.Errorf("docker host %q was removed at %v and cannot report again", identifier, removedAt.Format(time.RFC3339))
}
hostname := strings.TrimSpace(report.Host.Hostname)
if hostname == "" {
return models.DockerHost{}, fmt.Errorf("docker report missing hostname")
}
timestamp := report.Timestamp
if timestamp.IsZero() {
timestamp = time.Now()
}
agentID := strings.TrimSpace(report.Agent.ID)
if agentID == "" {
agentID = identifier
}
displayName := strings.TrimSpace(report.Host.Name)
if displayName == "" {
displayName = hostname
}
var previous models.DockerHost
var hasPrevious bool
for _, existing := range m.state.GetDockerHosts() {
if existing.ID == identifier {
previous = existing
hasPrevious = true
break
}
}
containers := make([]models.DockerContainer, 0, len(report.Containers))
for _, payload := range report.Containers {
container := models.DockerContainer{
ID: payload.ID,
Name: payload.Name,
Image: payload.Image,
State: payload.State,
Status: payload.Status,
Health: payload.Health,
CPUPercent: safeFloat(payload.CPUPercent),
MemoryUsage: payload.MemoryUsageBytes,
MemoryLimit: payload.MemoryLimitBytes,
MemoryPercent: safeFloat(payload.MemoryPercent),
UptimeSeconds: payload.UptimeSeconds,
RestartCount: payload.RestartCount,
ExitCode: payload.ExitCode,
CreatedAt: payload.CreatedAt,
StartedAt: payload.StartedAt,
FinishedAt: payload.FinishedAt,
}
if len(payload.Ports) > 0 {
ports := make([]models.DockerContainerPort, len(payload.Ports))
for i, port := range payload.Ports {
ports[i] = models.DockerContainerPort{
PrivatePort: port.PrivatePort,
PublicPort: port.PublicPort,
Protocol: port.Protocol,
IP: port.IP,
}
}
container.Ports = ports
}
if len(payload.Labels) > 0 {
labels := make(map[string]string, len(payload.Labels))
for k, v := range payload.Labels {
labels[k] = v
}
container.Labels = labels
}
if len(payload.Networks) > 0 {
networks := make([]models.DockerContainerNetworkLink, len(payload.Networks))
for i, nw := range payload.Networks {
networks[i] = models.DockerContainerNetworkLink{
Name: nw.Name,
IPv4: nw.IPv4,
IPv6: nw.IPv6,
}
}
container.Networks = networks
}
containers = append(containers, container)
}
host := models.DockerHost{
ID: identifier,
AgentID: agentID,
Hostname: hostname,
DisplayName: displayName,
MachineID: strings.TrimSpace(report.Host.MachineID),
OS: report.Host.OS,
KernelVersion: report.Host.KernelVersion,
Architecture: report.Host.Architecture,
DockerVersion: report.Host.DockerVersion,
CPUs: report.Host.TotalCPU,
TotalMemoryBytes: report.Host.TotalMemoryBytes,
UptimeSeconds: report.Host.UptimeSeconds,
Status: "online",
LastSeen: timestamp,
IntervalSeconds: report.Agent.IntervalSeconds,
AgentVersion: report.Agent.Version,
Containers: containers,
}
if tokenRecord != nil {
host.TokenID = tokenRecord.ID
host.TokenName = tokenRecord.Name
host.TokenHint = tokenHintFromRecord(tokenRecord)
if tokenRecord.LastUsedAt != nil {
t := tokenRecord.LastUsedAt.UTC()
host.TokenLastUsedAt = &t
} else {
t := time.Now().UTC()
host.TokenLastUsedAt = &t
}
} else if hasPrevious {
host.TokenID = previous.TokenID
host.TokenName = previous.TokenName
host.TokenHint = previous.TokenHint
host.TokenLastUsedAt = previous.TokenLastUsedAt
}
m.state.UpsertDockerHost(host)
m.state.SetConnectionHealth(dockerConnectionPrefix+host.ID, true)
// Check if the host was previously hidden and is now visible again
if hasPrevious && previous.Hidden && !host.Hidden {
log.Info().
Str("dockerHost", host.Hostname).
Str("dockerHostID", host.ID).
Msg("Docker host auto-unhidden after receiving report")
}
// Check if the host was pending uninstall - if so, log a warning that uninstall failed and clear the flag
if hasPrevious && previous.PendingUninstall {
log.Warn().
Str("dockerHost", host.Hostname).
Str("dockerHostID", host.ID).
Msg("Docker host reporting again after pending uninstall - uninstall may have failed")
// Clear the pending uninstall flag since the host is clearly still active
m.state.SetDockerHostPendingUninstall(host.ID, false)
}
if m.alertManager != nil {
m.alertManager.CheckDockerHost(host)
}
log.Debug().
Str("dockerHost", host.Hostname).
Int("containers", len(containers)).
Msg("Docker host report processed")
return host, nil
}
const (
removedDockerHostsTTL = 24 * time.Hour // Clean up removed hosts tracking after 24 hours
)
// cleanupRemovedDockerHosts removes entries from the removed hosts map that are older than 24 hours.
func (m *Monitor) cleanupRemovedDockerHosts(now time.Time) {
m.mu.Lock()
defer m.mu.Unlock()
for hostID, removedAt := range m.removedDockerHosts {
if now.Sub(removedAt) > removedDockerHostsTTL {
delete(m.removedDockerHosts, hostID)
log.Debug().
Str("dockerHostID", hostID).
Time("removedAt", removedAt).
Msg("Cleaned up old removed Docker host entry")
}
}
}
// evaluateDockerAgents updates health for Docker hosts based on last report time.
func (m *Monitor) evaluateDockerAgents(now time.Time) {
hosts := m.state.GetDockerHosts()
for _, host := range hosts {
interval := host.IntervalSeconds
if interval <= 0 {
interval = int(dockerMinimumHealthWindow / time.Second)
}
window := time.Duration(interval) * time.Second * dockerOfflineGraceMultiplier
if window < dockerMinimumHealthWindow {
window = dockerMinimumHealthWindow
} else if window > dockerMaximumHealthWindow {
window = dockerMaximumHealthWindow
}
healthy := !host.LastSeen.IsZero() && now.Sub(host.LastSeen) <= window
key := dockerConnectionPrefix + host.ID
m.state.SetConnectionHealth(key, healthy)
hostCopy := host
if healthy {
hostCopy.Status = "online"
m.state.SetDockerHostStatus(host.ID, "online")
if m.alertManager != nil {
m.alertManager.HandleDockerHostOnline(hostCopy)
}
} else {
hostCopy.Status = "offline"
m.state.SetDockerHostStatus(host.ID, "offline")
if m.alertManager != nil {
m.alertManager.HandleDockerHostOffline(hostCopy)
}
}
}
}
// sortContent sorts comma-separated content values for consistent display
func sortContent(content string) string {
if content == "" {
return ""
}
parts := strings.Split(content, ",")
sort.Strings(parts)
return strings.Join(parts, ",")
}
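// fetchGuestAgentMetadata queries the QEMU guest agent for IP addresses, network interfaces,
// and OS details, caching results for guestMetadataCacheTTL and falling back to previously
// cached values when individual agent calls fail.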
func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientInterface, instanceName, nodeName, vmName string, vmid int, vmStatus *proxmox.VMStatus) ([]string, []models.GuestNetworkInterface, string, string) {
if vmStatus == nil || client == nil {
m.clearGuestMetadataCache(instanceName, nodeName, vmid)
return nil, nil, "", ""
}
if vmStatus.Agent <= 0 {
m.clearGuestMetadataCache(instanceName, nodeName, vmid)
return nil, nil, "", ""
}
key := guestMetadataCacheKey(instanceName, nodeName, vmid)
now := time.Now()
m.guestMetadataMu.RLock()
cached, ok := m.guestMetadataCache[key]
m.guestMetadataMu.RUnlock()
if ok && now.Sub(cached.fetchedAt) < guestMetadataCacheTTL {
return cloneStringSlice(cached.ipAddresses), cloneGuestNetworkInterfaces(cached.networkInterfaces), cached.osName, cached.osVersion
}
// Start with cached values as fallback in case new calls fail
ipAddresses := cloneStringSlice(cached.ipAddresses)
networkIfaces := cloneGuestNetworkInterfaces(cached.networkInterfaces)
osName := cached.osName
osVersion := cached.osVersion
ifaceCtx, cancelIface := context.WithTimeout(ctx, 5*time.Second)
interfaces, err := client.GetVMNetworkInterfaces(ifaceCtx, nodeName, vmid)
cancelIface()
if err != nil {
log.Debug().
Str("instance", instanceName).
Str("vm", vmName).
Int("vmid", vmid).
Err(err).
Msg("Guest agent network interfaces unavailable")
} else if len(interfaces) > 0 {
ipAddresses, networkIfaces = processGuestNetworkInterfaces(interfaces)
} else {
ipAddresses = nil
networkIfaces = nil
}
osCtx, cancelOS := context.WithTimeout(ctx, 3*time.Second)
agentInfo, err := client.GetVMAgentInfo(osCtx, nodeName, vmid)
cancelOS()
if err != nil {
log.Debug().
Str("instance", instanceName).
Str("vm", vmName).
Int("vmid", vmid).
Err(err).
Msg("Guest agent OS info unavailable")
} else if len(agentInfo) > 0 {
osName, osVersion = extractGuestOSInfo(agentInfo)
} else {
osName = ""
osVersion = ""
}
entry := guestMetadataCacheEntry{
ipAddresses: cloneStringSlice(ipAddresses),
networkInterfaces: cloneGuestNetworkInterfaces(networkIfaces),
osName: osName,
osVersion: osVersion,
fetchedAt: time.Now(),
}
m.guestMetadataMu.Lock()
if m.guestMetadataCache == nil {
m.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
}
m.guestMetadataCache[key] = entry
m.guestMetadataMu.Unlock()
return ipAddresses, networkIfaces, osName, osVersion
}
func guestMetadataCacheKey(instanceName, nodeName string, vmid int) string {
return fmt.Sprintf("%s|%s|%d", instanceName, nodeName, vmid)
}
func (m *Monitor) clearGuestMetadataCache(instanceName, nodeName string, vmid int) {
if m == nil {
return
}
key := guestMetadataCacheKey(instanceName, nodeName, vmid)
m.guestMetadataMu.Lock()
if m.guestMetadataCache != nil {
delete(m.guestMetadataCache, key)
}
m.guestMetadataMu.Unlock()
}
func cloneStringSlice(src []string) []string {
if len(src) == 0 {
return nil
}
dst := make([]string, len(src))
copy(dst, src)
return dst
}
func cloneGuestNetworkInterfaces(src []models.GuestNetworkInterface) []models.GuestNetworkInterface {
if len(src) == 0 {
return nil
}
dst := make([]models.GuestNetworkInterface, len(src))
for i, iface := range src {
dst[i] = iface
if len(iface.Addresses) > 0 {
dst[i].Addresses = cloneStringSlice(iface.Addresses)
}
}
return dst
}
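// processGuestNetworkInterfaces flattens guest agent interface data into a deduplicated,
// sorted list of IP addresses plus per-interface summaries, skipping loopback and link-local
// addresses and dropping interfaces with no addresses and no traffic.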
func processGuestNetworkInterfaces(raw []proxmox.VMNetworkInterface) ([]string, []models.GuestNetworkInterface) {
ipSet := make(map[string]struct{})
ipAddresses := make([]string, 0)
guestIfaces := make([]models.GuestNetworkInterface, 0, len(raw))
for _, iface := range raw {
ifaceName := strings.TrimSpace(iface.Name)
mac := strings.TrimSpace(iface.HardwareAddr)
addrSet := make(map[string]struct{})
addresses := make([]string, 0, len(iface.IPAddresses))
for _, addr := range iface.IPAddresses {
ip := strings.TrimSpace(addr.Address)
if ip == "" {
continue
}
lower := strings.ToLower(ip)
if strings.HasPrefix(ip, "127.") || strings.HasPrefix(lower, "fe80") || ip == "::1" {
continue
}
if _, exists := addrSet[ip]; !exists {
addrSet[ip] = struct{}{}
addresses = append(addresses, ip)
}
if _, exists := ipSet[ip]; !exists {
ipSet[ip] = struct{}{}
ipAddresses = append(ipAddresses, ip)
}
}
if len(addresses) > 1 {
sort.Strings(addresses)
}
rxBytes := parseInterfaceStat(iface.Statistics, "rx-bytes")
txBytes := parseInterfaceStat(iface.Statistics, "tx-bytes")
if len(addresses) == 0 && rxBytes == 0 && txBytes == 0 {
continue
}
guestIfaces = append(guestIfaces, models.GuestNetworkInterface{
Name: ifaceName,
MAC: mac,
Addresses: addresses,
RXBytes: rxBytes,
TXBytes: txBytes,
})
}
if len(ipAddresses) > 1 {
sort.Strings(ipAddresses)
}
if len(guestIfaces) > 1 {
sort.SliceStable(guestIfaces, func(i, j int) bool {
return guestIfaces[i].Name < guestIfaces[j].Name
})
}
return ipAddresses, guestIfaces
}
func parseInterfaceStat(stats interface{}, key string) int64 {
if stats == nil {
return 0
}
statsMap, ok := stats.(map[string]interface{})
if !ok {
return 0
}
val, ok := statsMap[key]
if !ok {
return 0
}
return anyToInt64(val)
}
func extractGuestOSInfo(data map[string]interface{}) (string, string) {
if data == nil {
return "", ""
}
if result, ok := data["result"]; ok {
if resultMap, ok := result.(map[string]interface{}); ok {
data = resultMap
}
}
name := stringValue(data["name"])
prettyName := stringValue(data["pretty-name"])
version := stringValue(data["version"])
versionID := stringValue(data["version-id"])
osName := name
if osName == "" {
osName = prettyName
}
if osName == "" {
osName = stringValue(data["id"])
}
osVersion := version
if osVersion == "" && versionID != "" {
osVersion = versionID
}
if osVersion == "" && prettyName != "" && prettyName != osName {
osVersion = prettyName
}
if osVersion == "" {
osVersion = stringValue(data["kernel-release"])
}
if osVersion == osName {
osVersion = ""
}
return osName, osVersion
}
func stringValue(val interface{}) string {
switch v := val.(type) {
case string:
return strings.TrimSpace(v)
case json.Number:
return strings.TrimSpace(v.String())
case fmt.Stringer:
return strings.TrimSpace(v.String())
case float64:
return strings.TrimSpace(strconv.FormatFloat(v, 'f', -1, 64))
case float32:
return strings.TrimSpace(strconv.FormatFloat(float64(v), 'f', -1, 32))
case int:
return strconv.Itoa(v)
case int32:
return strconv.FormatInt(int64(v), 10)
case int64:
return strconv.FormatInt(v, 10)
case uint32:
return strconv.FormatUint(uint64(v), 10)
case uint64:
return strconv.FormatUint(v, 10)
default:
return ""
}
}
func anyToInt64(val interface{}) int64 {
switch v := val.(type) {
case int:
return int64(v)
case int32:
return int64(v)
case int64:
return v
case uint32:
return int64(v)
case uint64:
if v > math.MaxInt64 {
return math.MaxInt64
}
return int64(v)
case float32:
return int64(v)
case float64:
return int64(v)
case string:
if v == "" {
return 0
}
if parsed, err := strconv.ParseInt(v, 10, 64); err == nil {
return parsed
}
if parsedFloat, err := strconv.ParseFloat(v, 64); err == nil {
return int64(parsedFloat)
}
case json.Number:
if parsed, err := v.Int64(); err == nil {
return parsed
}
if parsedFloat, err := v.Float64(); err == nil {
return int64(parsedFloat)
}
}
return 0
}
// GetConnectionStatuses returns the current connection status for all nodes
func (m *Monitor) GetConnectionStatuses() map[string]bool {
if mock.IsMockEnabled() {
statuses := make(map[string]bool)
state := mock.GetMockState()
for _, node := range state.Nodes {
key := "pve-" + node.Name
statuses[key] = strings.ToLower(node.Status) == "online"
if node.Host != "" {
statuses[node.Host] = strings.ToLower(node.Status) == "online"
}
}
for _, pbsInst := range state.PBSInstances {
key := "pbs-" + pbsInst.Name
statuses[key] = strings.ToLower(pbsInst.Status) != "offline"
if pbsInst.Host != "" {
statuses[pbsInst.Host] = strings.ToLower(pbsInst.Status) != "offline"
}
}
for _, dockerHost := range state.DockerHosts {
key := dockerConnectionPrefix + dockerHost.ID
statuses[key] = strings.ToLower(dockerHost.Status) == "online"
}
return statuses
}
m.mu.RLock()
defer m.mu.RUnlock()
statuses := make(map[string]bool)
// Check all configured PVE nodes (not just ones with clients)
for _, pve := range m.config.PVEInstances {
key := "pve-" + pve.Name
// Check if we have a client for this node
if client, exists := m.pveClients[pve.Name]; exists && client != nil {
// We have a client, check actual connection health from state
if m.state != nil && m.state.ConnectionHealth != nil {
statuses[key] = m.state.ConnectionHealth[pve.Name]
} else {
statuses[key] = true // Assume connected if we have a client
}
} else {
// No client means disconnected
statuses[key] = false
}
}
// Check all configured PBS nodes (not just ones with clients)
for _, pbsInst := range m.config.PBSInstances {
key := "pbs-" + pbsInst.Name
// Check if we have a client for this node
if client, exists := m.pbsClients[pbsInst.Name]; exists && client != nil {
// We have a client, check actual connection health from state
if m.state != nil && m.state.ConnectionHealth != nil {
statuses[key] = m.state.ConnectionHealth["pbs-"+pbsInst.Name]
} else {
statuses[key] = true // Assume connected if we have a client
}
} else {
// No client means disconnected
statuses[key] = false
}
}
return statuses
}
// checkContainerizedTempMonitoring logs a security warning if Pulse is running
// in a container with SSH-based temperature monitoring enabled
func checkContainerizedTempMonitoring() {
// Check if running in container
isContainer := os.Getenv("PULSE_DOCKER") == "true" || isRunningInContainer()
if !isContainer {
return
}
// Check if SSH keys exist (indicates temperature monitoring is configured)
homeDir := os.Getenv("HOME")
if homeDir == "" {
homeDir = "/home/pulse"
}
sshKeyPath := homeDir + "/.ssh/id_ed25519"
if _, err := os.Stat(sshKeyPath); err != nil {
// No SSH key found, temperature monitoring not configured
return
}
// Log warning
log.Warn().
Msg("🔐 SECURITY NOTICE: Pulse is running in a container with SSH-based temperature monitoring enabled. " +
"SSH private keys are stored inside the container, which could be a security risk if the container is compromised. " +
"Future versions will use agent-based architecture for better security. " +
"See documentation for hardening recommendations.")
}
// isRunningInContainer detects if running inside a container
func isRunningInContainer() bool {
// Check for /.dockerenv
if _, err := os.Stat("/.dockerenv"); err == nil {
return true
}
// Check cgroup for container indicators
data, err := os.ReadFile("/proc/1/cgroup")
if err == nil {
content := string(data)
if strings.Contains(content, "docker") ||
strings.Contains(content, "lxc") ||
strings.Contains(content, "containerd") {
return true
}
}
return false
}
// New creates a new Monitor instance
func New(cfg *config.Config) (*Monitor, error) {
// Initialize temperature collector with sensors SSH key
// Will use root user for now - can be made configurable later
homeDir := os.Getenv("HOME")
if homeDir == "" {
homeDir = "/home/pulse"
}
sshKeyPath := filepath.Join(homeDir, ".ssh/id_ed25519_sensors")
tempCollector := NewTemperatureCollector("root", sshKeyPath)
// Security warning if running in container with SSH temperature monitoring
checkContainerizedTempMonitoring()
stalenessTracker := NewStalenessTracker(getPollMetrics())
stalenessTracker.SetBounds(cfg.AdaptivePollingBaseInterval, cfg.AdaptivePollingMaxInterval)
taskQueue := NewTaskQueue()
deadLetterQueue := NewTaskQueue()
breakers := make(map[string]*circuitBreaker)
failureCounts := make(map[string]int)
lastOutcome := make(map[string]taskOutcome)
backoff := backoffConfig{
Initial: 5 * time.Second,
Multiplier: 2,
Jitter: 0.2,
Max: 5 * time.Minute,
}
if cfg.AdaptivePollingEnabled && cfg.AdaptivePollingMaxInterval > 0 && cfg.AdaptivePollingMaxInterval <= 15*time.Second {
backoff.Initial = 750 * time.Millisecond
backoff.Max = 6 * time.Second
}
var scheduler *AdaptiveScheduler
if cfg.AdaptivePollingEnabled {
scheduler = NewAdaptiveScheduler(SchedulerConfig{
BaseInterval: cfg.AdaptivePollingBaseInterval,
MinInterval: cfg.AdaptivePollingMinInterval,
MaxInterval: cfg.AdaptivePollingMaxInterval,
}, stalenessTracker, nil, nil)
}
m := &Monitor{
config: cfg,
state: models.NewState(),
pveClients: make(map[string]PVEClientInterface),
pbsClients: make(map[string]*pbs.Client),
pmgClients: make(map[string]*pmg.Client),
pollMetrics: getPollMetrics(),
scheduler: scheduler,
stalenessTracker: stalenessTracker,
taskQueue: taskQueue,
deadLetterQueue: deadLetterQueue,
circuitBreakers: breakers,
failureCounts: failureCounts,
lastOutcome: lastOutcome,
backoffCfg: backoff,
rng: rand.New(rand.NewSource(time.Now().UnixNano())),
maxRetryAttempts: 5,
tempCollector: tempCollector,
startTime: time.Now(),
rateTracker: NewRateTracker(),
metricsHistory: NewMetricsHistory(1000, 24*time.Hour), // Keep up to 1000 points or 24 hours
alertManager: alerts.NewManager(),
notificationMgr: notifications.NewNotificationManager(cfg.PublicURL),
configPersist: config.NewConfigPersistence(cfg.DataPath),
discoveryService: nil, // Will be initialized in Start()
authFailures: make(map[string]int),
lastAuthAttempt: make(map[string]time.Time),
lastClusterCheck: make(map[string]time.Time),
lastPhysicalDiskPoll: make(map[string]time.Time),
lastPVEBackupPoll: make(map[string]time.Time),
lastPBSBackupPoll: make(map[string]time.Time),
persistence: config.NewConfigPersistence(cfg.DataPath),
pbsBackupPollers: make(map[string]bool),
nodeSnapshots: make(map[string]NodeMemorySnapshot),
guestSnapshots: make(map[string]GuestMemorySnapshot),
nodeRRDMemCache: make(map[string]rrdMemCacheEntry),
removedDockerHosts: make(map[string]time.Time),
dockerCommands: make(map[string]*dockerHostCommand),
dockerCommandIndex: make(map[string]string),
guestMetadataCache: make(map[string]guestMetadataCacheEntry),
instanceInfoCache: make(map[string]*instanceInfo),
pollStatusMap: make(map[string]*pollStatus),
dlqInsightMap: make(map[string]*dlqInsight),
}
m.breakerBaseRetry = 5 * time.Second
m.breakerMaxDelay = 5 * time.Minute
m.breakerHalfOpenWindow = 30 * time.Second
if cfg.AdaptivePollingEnabled && cfg.AdaptivePollingMaxInterval > 0 && cfg.AdaptivePollingMaxInterval <= 15*time.Second {
m.breakerBaseRetry = 2 * time.Second
m.breakerMaxDelay = 10 * time.Second
m.breakerHalfOpenWindow = 2 * time.Second
}
m.executor = newRealExecutor(m)
m.buildInstanceInfoCache(cfg)
if m.pollMetrics != nil {
m.pollMetrics.ResetQueueDepth(0)
}
// Load saved configurations
if alertConfig, err := m.configPersist.LoadAlertConfig(); err == nil {
m.alertManager.UpdateConfig(*alertConfig)
// Apply schedule settings to notification manager
if alertConfig.Schedule.Cooldown > 0 {
m.notificationMgr.SetCooldown(alertConfig.Schedule.Cooldown)
}
if alertConfig.Schedule.GroupingWindow > 0 {
m.notificationMgr.SetGroupingWindow(alertConfig.Schedule.GroupingWindow)
} else if alertConfig.Schedule.Grouping.Window > 0 {
m.notificationMgr.SetGroupingWindow(alertConfig.Schedule.Grouping.Window)
}
m.notificationMgr.SetGroupingOptions(
alertConfig.Schedule.Grouping.ByNode,
alertConfig.Schedule.Grouping.ByGuest,
)
} else {
log.Warn().Err(err).Msg("Failed to load alert configuration")
}
if emailConfig, err := m.configPersist.LoadEmailConfig(); err == nil {
m.notificationMgr.SetEmailConfig(*emailConfig)
} else {
log.Warn().Err(err).Msg("Failed to load email configuration")
}
if appriseConfig, err := m.configPersist.LoadAppriseConfig(); err == nil {
m.notificationMgr.SetAppriseConfig(*appriseConfig)
} else {
log.Warn().Err(err).Msg("Failed to load Apprise configuration")
}
// Migrate webhooks if needed (from unencrypted to encrypted)
if err := m.configPersist.MigrateWebhooksIfNeeded(); err != nil {
log.Warn().Err(err).Msg("Failed to migrate webhooks")
}
if webhooks, err := m.configPersist.LoadWebhooks(); err == nil {
for _, webhook := range webhooks {
m.notificationMgr.AddWebhook(webhook)
}
} else {
log.Warn().Err(err).Msg("Failed to load webhook configuration")
}
// Check if mock mode is enabled before initializing clients
mockEnabled := mock.IsMockEnabled()
if mockEnabled {
log.Info().Msg("Mock mode enabled - skipping PVE/PBS client initialization")
} else {
// Initialize PVE clients
log.Info().Int("count", len(cfg.PVEInstances)).Msg("Initializing PVE clients")
for _, pve := range cfg.PVEInstances {
log.Info().
Str("name", pve.Name).
Str("host", pve.Host).
Str("user", pve.User).
Bool("hasToken", pve.TokenName != "").
Msg("Configuring PVE instance")
// Check if this is a cluster
if pve.IsCluster && len(pve.ClusterEndpoints) > 0 {
// For clusters, check if endpoints have IPs/resolvable hosts
// If not, use the main host for all connections (Proxmox will route cluster API calls)
hasValidEndpoints := false
endpoints := make([]string, 0, len(pve.ClusterEndpoints))
for _, ep := range pve.ClusterEndpoints {
// Use IP if available, otherwise use host
host := ep.IP
if host == "" {
host = ep.Host
}
// Skip if no host information
if host == "" {
log.Warn().
Str("node", ep.NodeName).
Msg("Skipping cluster endpoint with no host/IP")
continue
}
// Check if we have a valid IP or a fully qualified hostname (with dots)
if strings.Contains(host, ".") || net.ParseIP(host) != nil {
hasValidEndpoints = true
}
// Ensure we have the full URL (the API is reached over HTTPS on port 8006;
// VerifySSL only controls certificate verification)
if !strings.HasPrefix(host, "http") {
host = fmt.Sprintf("https://%s:8006", host)
}
endpoints = append(endpoints, host)
}
// If endpoints are just node names (not FQDNs or IPs), use main host only
// This is common when cluster nodes are discovered but not directly reachable
if !hasValidEndpoints || len(endpoints) == 0 {
log.Info().
Str("instance", pve.Name).
Str("mainHost", pve.Host).
Msg("Cluster endpoints are not resolvable, using main host for all cluster operations")
endpoints = []string{pve.Host}
if !strings.HasPrefix(endpoints[0], "http") {
endpoints[0] = fmt.Sprintf("https://%s:8006", endpoints[0])
}
}
log.Info().
Str("cluster", pve.ClusterName).
Strs("endpoints", endpoints).
Msg("Creating cluster-aware client")
clientConfig := config.CreateProxmoxConfig(&pve)
clientConfig.Timeout = cfg.ConnectionTimeout
clusterClient := proxmox.NewClusterClient(
pve.Name,
clientConfig,
endpoints,
)
m.pveClients[pve.Name] = clusterClient
log.Info().
Str("instance", pve.Name).
Str("cluster", pve.ClusterName).
Int("endpoints", len(endpoints)).
Msg("Cluster client created successfully")
// Set initial connection health to true for cluster
m.state.SetConnectionHealth(pve.Name, true)
} else {
// Create regular client
clientConfig := config.CreateProxmoxConfig(&pve)
clientConfig.Timeout = cfg.ConnectionTimeout
client, err := proxmox.NewClient(clientConfig)
if err != nil {
monErr := errors.WrapConnectionError("create_pve_client", pve.Name, err)
log.Error().
Err(monErr).
Str("instance", pve.Name).
Str("host", pve.Host).
Str("user", pve.User).
Bool("hasPassword", pve.Password != "").
Bool("hasToken", pve.TokenValue != "").
Msg("Failed to create PVE client - node will show as disconnected")
// Set initial connection health to false for this node
m.state.SetConnectionHealth(pve.Name, false)
continue
}
m.pveClients[pve.Name] = client
log.Info().Str("instance", pve.Name).Msg("PVE client created successfully")
// Set initial connection health to true
m.state.SetConnectionHealth(pve.Name, true)
}
}
// Initialize PBS clients
log.Info().Int("count", len(cfg.PBSInstances)).Msg("Initializing PBS clients")
for _, pbsInst := range cfg.PBSInstances {
log.Info().
Str("name", pbsInst.Name).
Str("host", pbsInst.Host).
Str("user", pbsInst.User).
Bool("hasToken", pbsInst.TokenName != "").
Msg("Configuring PBS instance")
clientConfig := config.CreatePBSConfig(&pbsInst)
clientConfig.Timeout = 60 * time.Second // Very generous timeout for slow PBS servers
client, err := pbs.NewClient(clientConfig)
if err != nil {
monErr := errors.WrapConnectionError("create_pbs_client", pbsInst.Name, err)
log.Error().
Err(monErr).
Str("instance", pbsInst.Name).
Str("host", pbsInst.Host).
Str("user", pbsInst.User).
Bool("hasPassword", pbsInst.Password != "").
Bool("hasToken", pbsInst.TokenValue != "").
Msg("Failed to create PBS client - node will show as disconnected")
// Set initial connection health to false for this node
m.state.SetConnectionHealth("pbs-"+pbsInst.Name, false)
continue
}
m.pbsClients[pbsInst.Name] = client
log.Info().Str("instance", pbsInst.Name).Msg("PBS client created successfully")
// Set initial connection health to true
m.state.SetConnectionHealth("pbs-"+pbsInst.Name, true)
}
// Initialize PMG clients
log.Info().Int("count", len(cfg.PMGInstances)).Msg("Initializing PMG clients")
for _, pmgInst := range cfg.PMGInstances {
log.Info().
Str("name", pmgInst.Name).
Str("host", pmgInst.Host).
Str("user", pmgInst.User).
Bool("hasToken", pmgInst.TokenName != "").
Msg("Configuring PMG instance")
clientConfig := config.CreatePMGConfig(&pmgInst)
if clientConfig.Timeout <= 0 {
clientConfig.Timeout = 45 * time.Second
}
client, err := pmg.NewClient(clientConfig)
if err != nil {
monErr := errors.WrapConnectionError("create_pmg_client", pmgInst.Name, err)
log.Error().
Err(monErr).
Str("instance", pmgInst.Name).
Str("host", pmgInst.Host).
Str("user", pmgInst.User).
Bool("hasPassword", pmgInst.Password != "").
Bool("hasToken", pmgInst.TokenValue != "").
Msg("Failed to create PMG client - gateway will show as disconnected")
m.state.SetConnectionHealth("pmg-"+pmgInst.Name, false)
continue
}
m.pmgClients[pmgInst.Name] = client
log.Info().Str("instance", pmgInst.Name).Msg("PMG client created successfully")
m.state.SetConnectionHealth("pmg-"+pmgInst.Name, true)
}
} // End of else block for mock mode check
// Initialize state stats
m.state.Stats = models.Stats{
StartTime: m.startTime,
Version: "2.0.0-go",
}
return m, nil
}
// SetExecutor allows tests to override the poll executor; passing nil restores the default executor.
func (m *Monitor) SetExecutor(exec PollExecutor) {
if m == nil {
return
}
m.mu.Lock()
defer m.mu.Unlock()
if exec == nil {
m.executor = newRealExecutor(m)
return
}
m.executor = exec
}
func (m *Monitor) buildInstanceInfoCache(cfg *config.Config) {
if m == nil || cfg == nil {
return
}
m.mu.Lock()
defer m.mu.Unlock()
if m.instanceInfoCache == nil {
m.instanceInfoCache = make(map[string]*instanceInfo)
}
add := func(instType InstanceType, name string, displayName string, connection string, metadata map[string]string) {
key := schedulerKey(instType, name)
m.instanceInfoCache[key] = &instanceInfo{
Key: key,
Type: instType,
DisplayName: displayName,
Connection: connection,
Metadata: metadata,
}
}
// PVE instances
for _, inst := range cfg.PVEInstances {
name := strings.TrimSpace(inst.Name)
if name == "" {
name = strings.TrimSpace(inst.Host)
}
if name == "" {
name = "pve-instance"
}
display := name
if display == "" {
display = strings.TrimSpace(inst.Host)
}
connection := strings.TrimSpace(inst.Host)
add(InstanceTypePVE, name, display, connection, nil)
}
// PBS instances
for _, inst := range cfg.PBSInstances {
name := strings.TrimSpace(inst.Name)
if name == "" {
name = strings.TrimSpace(inst.Host)
}
if name == "" {
name = "pbs-instance"
}
display := name
if display == "" {
display = strings.TrimSpace(inst.Host)
}
connection := strings.TrimSpace(inst.Host)
add(InstanceTypePBS, name, display, connection, nil)
}
// PMG instances
for _, inst := range cfg.PMGInstances {
name := strings.TrimSpace(inst.Name)
if name == "" {
name = strings.TrimSpace(inst.Host)
}
if name == "" {
name = "pmg-instance"
}
display := name
if display == "" {
display = strings.TrimSpace(inst.Host)
}
connection := strings.TrimSpace(inst.Host)
add(InstanceTypePMG, name, display, connection, nil)
}
}
func (m *Monitor) getExecutor() PollExecutor {
m.mu.RLock()
exec := m.executor
m.mu.RUnlock()
return exec
}
// Start begins the monitoring loop
func (m *Monitor) Start(ctx context.Context, wsHub *websocket.Hub) {
log.Info().
Dur("pollingInterval", 10*time.Second).
Msg("Starting monitoring loop")
m.mu.Lock()
m.runtimeCtx = ctx
m.wsHub = wsHub
m.mu.Unlock()
// Initialize and start discovery service if enabled
if mock.IsMockEnabled() {
log.Info().Msg("Mock mode enabled - skipping discovery service")
m.discoveryService = nil
} else if m.config.DiscoveryEnabled {
discoverySubnet := m.config.DiscoverySubnet
if discoverySubnet == "" {
discoverySubnet = "auto"
}
cfgProvider := func() config.DiscoveryConfig {
m.mu.RLock()
defer m.mu.RUnlock()
if m.config == nil {
return config.DefaultDiscoveryConfig()
}
return config.CloneDiscoveryConfig(m.config.Discovery)
}
m.discoveryService = discovery.NewService(wsHub, 5*time.Minute, discoverySubnet, cfgProvider)
if m.discoveryService != nil {
m.discoveryService.Start(ctx)
log.Info().Msg("Discovery service initialized and started")
} else {
log.Error().Msg("Failed to initialize discovery service")
}
} else {
log.Info().Msg("Discovery service disabled by configuration")
m.discoveryService = nil
}
// Set up alert callbacks
m.alertManager.SetAlertCallback(func(alert *alerts.Alert) {
wsHub.BroadcastAlert(alert)
// Send notifications
log.Debug().
Str("alertID", alert.ID).
Str("level", string(alert.Level)).
Msg("Alert raised, sending to notification manager")
go m.notificationMgr.SendAlert(alert)
})
m.alertManager.SetResolvedCallback(func(alertID string) {
wsHub.BroadcastAlertResolved(alertID)
m.notificationMgr.CancelAlert(alertID)
// Don't broadcast full state here - it causes a cascade with many guests
// The frontend will get the updated alerts through the regular broadcast ticker
// state := m.GetState()
// wsHub.BroadcastState(state)
})
m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {
log.Info().
Str("alertID", alert.ID).
Int("level", level).
Msg("Alert escalated - sending notifications")
// Get escalation config (avoid shadowing the imported config package)
alertCfg := m.alertManager.GetConfig()
if level <= 0 || level > len(alertCfg.Schedule.Escalation.Levels) {
return
}
escalationLevel := alertCfg.Schedule.Escalation.Levels[level-1]
// Send notifications based on escalation level
switch escalationLevel.Notify {
case "email":
// Only send email
if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled {
m.notificationMgr.SendAlert(alert)
}
case "webhook":
// Only send webhooks
for _, webhook := range m.notificationMgr.GetWebhooks() {
if webhook.Enabled {
m.notificationMgr.SendAlert(alert)
break
}
}
case "all":
// Send all notifications
m.notificationMgr.SendAlert(alert)
}
// Update WebSocket with escalation
wsHub.BroadcastAlert(alert)
})
// Create separate tickers for polling and broadcasting
// Hardcoded to 10 seconds since Proxmox updates cluster/resources every 10 seconds
const pollingInterval = 10 * time.Second
workerCount := len(m.pveClients) + len(m.pbsClients) + len(m.pmgClients)
m.startTaskWorkers(ctx, workerCount)
pollTicker := time.NewTicker(pollingInterval)
defer pollTicker.Stop()
broadcastTicker := time.NewTicker(pollingInterval)
defer broadcastTicker.Stop()
// Start connection retry mechanism for failed clients
// This handles cases where network/Proxmox isn't ready on initial startup
if !mock.IsMockEnabled() {
go m.retryFailedConnections(ctx)
}
// Do an immediate poll on start (only if not in mock mode)
if mock.IsMockEnabled() {
log.Info().Msg("Mock mode enabled - skipping real node polling")
go m.checkMockAlerts()
} else {
go m.poll(ctx, wsHub)
}
for {
select {
case <-pollTicker.C:
now := time.Now()
m.evaluateDockerAgents(now)
m.cleanupRemovedDockerHosts(now)
if mock.IsMockEnabled() {
// In mock mode, keep synthetic alerts fresh
go m.checkMockAlerts()
} else {
// Poll real infrastructure
go m.poll(ctx, wsHub)
}
case <-broadcastTicker.C:
// Broadcast current state regardless of polling status
// Use GetState() instead of m.state.GetSnapshot() to respect mock mode
state := m.GetState()
log.Info().
Int("nodes", len(state.Nodes)).
Int("vms", len(state.VMs)).
Int("containers", len(state.Containers)).
Int("pbs", len(state.PBSInstances)).
Int("pbsBackups", len(state.Backups.PBS)).
Int("physicalDisks", len(state.PhysicalDisks)).
Msg("Broadcasting state update (ticker)")
// Convert to frontend format before broadcasting (converts time.Time to int64, etc.)
wsHub.BroadcastState(state.ToFrontend())
case <-ctx.Done():
log.Info().Msg("Monitoring loop stopped")
return
}
}
}
// retryFailedConnections attempts to recreate clients that failed during initialization
// This handles cases where Proxmox/network isn't ready when Pulse starts
func (m *Monitor) retryFailedConnections(ctx context.Context) {
// Retry schedule: 5s, 10s, 20s, 40s, 60s, then every 60s for up to 5 minutes total
retryDelays := []time.Duration{
5 * time.Second,
10 * time.Second,
20 * time.Second,
40 * time.Second,
60 * time.Second,
}
maxRetryDuration := 5 * time.Minute
startTime := time.Now()
retryIndex := 0
for {
// Stop retrying after max duration or if context is cancelled
select {
case <-ctx.Done():
return
default:
}
if time.Since(startTime) > maxRetryDuration {
log.Info().Msg("Connection retry period expired")
return
}
// Calculate next retry delay
var delay time.Duration
if retryIndex < len(retryDelays) {
delay = retryDelays[retryIndex]
retryIndex++
} else {
delay = 60 * time.Second // Continue retrying every 60s
}
// Wait before retry
select {
case <-time.After(delay):
case <-ctx.Done():
return
}
// Check for missing clients and try to recreate them
m.mu.Lock()
missingPVE := []config.PVEInstance{}
missingPBS := []config.PBSInstance{}
// Find PVE instances without clients
for _, pve := range m.config.PVEInstances {
if _, exists := m.pveClients[pve.Name]; !exists {
missingPVE = append(missingPVE, pve)
}
}
// Find PBS instances without clients
for _, pbs := range m.config.PBSInstances {
if _, exists := m.pbsClients[pbs.Name]; !exists {
missingPBS = append(missingPBS, pbs)
}
}
m.mu.Unlock()
// If no missing clients, we're done
if len(missingPVE) == 0 && len(missingPBS) == 0 {
log.Info().Msg("All client connections established successfully")
return
}
log.Info().
Int("missingPVE", len(missingPVE)).
Int("missingPBS", len(missingPBS)).
Dur("nextRetry", delay).
Msg("Attempting to reconnect failed clients")
// Try to recreate PVE clients
for _, pve := range missingPVE {
if pve.IsCluster && len(pve.ClusterEndpoints) > 0 {
// Create cluster client
hasValidEndpoints := false
endpoints := make([]string, 0, len(pve.ClusterEndpoints))
for _, ep := range pve.ClusterEndpoints {
host := ep.IP
if host == "" {
host = ep.Host
}
if host == "" {
continue
}
if strings.Contains(host, ".") || net.ParseIP(host) != nil {
hasValidEndpoints = true
}
if !strings.HasPrefix(host, "http") {
host = fmt.Sprintf("https://%s:8006", host)
}
endpoints = append(endpoints, host)
}
if !hasValidEndpoints || len(endpoints) == 0 {
endpoints = []string{pve.Host}
if !strings.HasPrefix(endpoints[0], "http") {
endpoints[0] = fmt.Sprintf("https://%s:8006", endpoints[0])
}
}
clientConfig := config.CreateProxmoxConfig(&pve)
clientConfig.Timeout = m.config.ConnectionTimeout
clusterClient := proxmox.NewClusterClient(pve.Name, clientConfig, endpoints)
m.mu.Lock()
m.pveClients[pve.Name] = clusterClient
m.state.SetConnectionHealth(pve.Name, true)
m.mu.Unlock()
log.Info().
Str("instance", pve.Name).
Str("cluster", pve.ClusterName).
Msg("Successfully reconnected cluster client")
} else {
// Create regular client
clientConfig := config.CreateProxmoxConfig(&pve)
clientConfig.Timeout = m.config.ConnectionTimeout
client, err := proxmox.NewClient(clientConfig)
if err != nil {
log.Warn().
Err(err).
Str("instance", pve.Name).
Msg("Failed to reconnect PVE client, will retry")
continue
}
m.mu.Lock()
m.pveClients[pve.Name] = client
m.state.SetConnectionHealth(pve.Name, true)
m.mu.Unlock()
log.Info().
Str("instance", pve.Name).
Msg("Successfully reconnected PVE client")
}
}
// Try to recreate PBS clients
for _, pbsInst := range missingPBS {
clientConfig := config.CreatePBSConfig(&pbsInst)
clientConfig.Timeout = 60 * time.Second
client, err := pbs.NewClient(clientConfig)
if err != nil {
log.Warn().
Err(err).
Str("instance", pbsInst.Name).
Msg("Failed to reconnect PBS client, will retry")
continue
}
m.mu.Lock()
m.pbsClients[pbsInst.Name] = client
m.state.SetConnectionHealth("pbs-"+pbsInst.Name, true)
m.mu.Unlock()
log.Info().
Str("instance", pbsInst.Name).
Msg("Successfully reconnected PBS client")
}
}
}
// poll builds the scheduled task plan for all configured instances, enqueues it for the task
// workers, and refreshes poll statistics and alert state; the actual data fetching happens in
// the workers via the poll executor.
func (m *Monitor) poll(ctx context.Context, wsHub *websocket.Hub) {
// Limit concurrent polls to 2 to prevent resource exhaustion
currentCount := atomic.AddInt32(&m.activePollCount, 1)
if currentCount > 2 {
atomic.AddInt32(&m.activePollCount, -1)
if logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().Int32("activePolls", currentCount-1).Msg("Too many concurrent polls, skipping")
}
return
}
defer atomic.AddInt32(&m.activePollCount, -1)
if logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().Msg("Starting polling cycle")
}
startTime := time.Now()
now := startTime
plannedTasks := m.buildScheduledTasks(now)
for _, task := range plannedTasks {
m.taskQueue.Upsert(task)
}
m.updateQueueDepthMetric()
// Update performance metrics
m.state.Performance.LastPollDuration = time.Since(startTime).Seconds()
m.state.Stats.PollingCycles++
m.state.Stats.Uptime = int64(time.Since(m.startTime).Seconds())
m.state.Stats.WebSocketClients = wsHub.GetClientCount()
// Sync alert state so broadcasts include the latest acknowledgement data
m.syncAlertsToState()
// Increment poll counter
m.mu.Lock()
m.pollCounter++
m.mu.Unlock()
if logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().Dur("duration", time.Since(startTime)).Msg("Polling cycle completed")
}
// Broadcasting is now handled by the timer in Start()
}
// syncAlertsToState copies the latest alert manager data into the shared state snapshot.
// This keeps WebSocket broadcasts aligned with in-memory acknowledgement updates.
func (m *Monitor) syncAlertsToState() {
if m.pruneStaleDockerAlerts() {
if logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().Msg("Pruned stale docker alerts during sync")
}
}
activeAlerts := m.alertManager.GetActiveAlerts()
modelAlerts := make([]models.Alert, 0, len(activeAlerts))
for _, alert := range activeAlerts {
modelAlerts = append(modelAlerts, models.Alert{
ID: alert.ID,
Type: alert.Type,
Level: string(alert.Level),
ResourceID: alert.ResourceID,
ResourceName: alert.ResourceName,
Node: alert.Node,
Instance: alert.Instance,
Message: alert.Message,
Value: alert.Value,
Threshold: alert.Threshold,
StartTime: alert.StartTime,
Acknowledged: alert.Acknowledged,
AckTime: alert.AckTime,
AckUser: alert.AckUser,
})
if alert.Acknowledged && logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().Str("alertID", alert.ID).Interface("ackTime", alert.AckTime).Msg("Syncing acknowledged alert")
}
}
m.state.UpdateActiveAlerts(modelAlerts)
recentlyResolved := m.alertManager.GetRecentlyResolved()
if len(recentlyResolved) > 0 {
log.Info().Int("count", len(recentlyResolved)).Msg("Syncing recently resolved alerts")
}
m.state.UpdateRecentlyResolved(recentlyResolved)
}
// SyncAlertState is the exported wrapper used by APIs that mutate alerts outside the poll loop.
func (m *Monitor) SyncAlertState() {
m.syncAlertsToState()
}
// pruneStaleDockerAlerts removes docker alerts that reference hosts no longer present in state.
func (m *Monitor) pruneStaleDockerAlerts() bool {
if m.alertManager == nil {
return false
}
hosts := m.state.GetDockerHosts()
knownHosts := make(map[string]struct{}, len(hosts))
for _, host := range hosts {
id := strings.TrimSpace(host.ID)
if id != "" {
knownHosts[id] = struct{}{}
}
}
// Even when no known hosts remain, keep going so stale docker alerts are still cleared below.
active := m.alertManager.GetActiveAlerts()
processed := make(map[string]struct{})
cleared := false
for _, alert := range active {
var hostID string
switch {
case alert.Type == "docker-host-offline":
hostID = strings.TrimPrefix(alert.ID, "docker-host-offline-")
case strings.HasPrefix(alert.ResourceID, "docker:"):
resource := strings.TrimPrefix(alert.ResourceID, "docker:")
if idx := strings.Index(resource, "/"); idx >= 0 {
hostID = resource[:idx]
} else {
hostID = resource
}
default:
continue
}
hostID = strings.TrimSpace(hostID)
if hostID == "" {
continue
}
if _, known := knownHosts[hostID]; known {
continue
}
if _, alreadyCleared := processed[hostID]; alreadyCleared {
continue
}
host := models.DockerHost{
ID: hostID,
DisplayName: alert.ResourceName,
Hostname: alert.Node,
}
if host.DisplayName == "" {
host.DisplayName = hostID
}
if host.Hostname == "" {
host.Hostname = hostID
}
m.alertManager.HandleDockerHostRemoved(host)
processed[hostID] = struct{}{}
cleared = true
}
return cleared
}
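// startTaskWorkers launches worker goroutines (clamped to 1-10) that drain the scheduled task queue.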
func (m *Monitor) startTaskWorkers(ctx context.Context, workers int) {
if m.taskQueue == nil {
return
}
if workers < 1 {
workers = 1
}
if workers > 10 {
workers = 10
}
for i := 0; i < workers; i++ {
go m.taskWorker(ctx, i)
}
}
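// taskWorker pulls due tasks from the queue, executes them, reschedules their follow-ups,
// and refreshes the queue depth metric until the context is cancelled.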
func (m *Monitor) taskWorker(ctx context.Context, id int) {
if logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().Int("worker", id).Msg("Task worker started")
}
for {
task, ok := m.taskQueue.WaitNext(ctx)
if !ok {
if logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().Int("worker", id).Msg("Task worker stopping")
}
return
}
m.executeScheduledTask(ctx, task)
m.rescheduleTask(task)
m.updateQueueDepthMetric()
}
}
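// executeScheduledTask runs a single scheduled poll: it honors the circuit breaker, records
// queue wait time, resolves the client for the instance type, and dispatches to the executor.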
func (m *Monitor) executeScheduledTask(ctx context.Context, task ScheduledTask) {
if !m.allowExecution(task) {
if logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().
Str("instance", task.InstanceName).
Str("type", string(task.InstanceType)).
Msg("Task blocked by circuit breaker")
}
return
}
if m.pollMetrics != nil {
wait := time.Duration(0)
if !task.NextRun.IsZero() {
wait = time.Since(task.NextRun)
if wait < 0 {
wait = 0
}
}
instanceType := string(task.InstanceType)
if strings.TrimSpace(instanceType) == "" {
instanceType = "unknown"
}
m.pollMetrics.RecordQueueWait(instanceType, wait)
}
executor := m.getExecutor()
if executor == nil {
log.Error().
Str("instance", task.InstanceName).
Str("type", string(task.InstanceType)).
Msg("No poll executor configured; skipping task")
return
}
pollTask := PollTask{
InstanceName: task.InstanceName,
InstanceType: string(task.InstanceType),
}
switch task.InstanceType {
case InstanceTypePVE:
client, ok := m.pveClients[task.InstanceName]
if !ok || client == nil {
log.Warn().Str("instance", task.InstanceName).Msg("PVE client missing for scheduled task")
return
}
pollTask.PVEClient = client
case InstanceTypePBS:
client, ok := m.pbsClients[task.InstanceName]
if !ok || client == nil {
log.Warn().Str("instance", task.InstanceName).Msg("PBS client missing for scheduled task")
return
}
pollTask.PBSClient = client
case InstanceTypePMG:
client, ok := m.pmgClients[task.InstanceName]
if !ok || client == nil {
log.Warn().Str("instance", task.InstanceName).Msg("PMG client missing for scheduled task")
return
}
pollTask.PMGClient = client
default:
log.Debug().
Str("instance", task.InstanceName).
Str("type", string(task.InstanceType)).
Msg("Skipping unsupported task type")
return
}
executor.Execute(ctx, pollTask)
}
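// rescheduleTask queues the follow-up run for a completed task: failures are retried with
// backoff or routed to the dead-letter queue, while successes are re-planned by the adaptive
// scheduler (or at a fixed interval when no scheduler is configured).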
func (m *Monitor) rescheduleTask(task ScheduledTask) {
if m.taskQueue == nil {
return
}
key := schedulerKey(task.InstanceType, task.InstanceName)
m.mu.Lock()
outcome, hasOutcome := m.lastOutcome[key]
failureCount := m.failureCounts[key]
m.mu.Unlock()
if hasOutcome && !outcome.success {
if !outcome.transient || failureCount >= m.maxRetryAttempts {
m.sendToDeadLetter(task, outcome.err)
return
}
delay := m.backoffCfg.nextDelay(failureCount-1, m.randomFloat())
if delay <= 0 {
delay = 5 * time.Second
}
if m.config != nil && m.config.AdaptivePollingEnabled && m.config.AdaptivePollingMaxInterval > 0 && m.config.AdaptivePollingMaxInterval <= 15*time.Second {
maxDelay := 4 * time.Second
if delay > maxDelay {
delay = maxDelay
}
}
next := task
next.Interval = delay
next.NextRun = time.Now().Add(delay)
m.taskQueue.Upsert(next)
return
}
if m.scheduler == nil {
nextInterval := task.Interval
if nextInterval <= 0 && m.config != nil {
nextInterval = m.config.AdaptivePollingBaseInterval
}
if nextInterval <= 0 {
nextInterval = DefaultSchedulerConfig().BaseInterval
}
next := task
next.NextRun = time.Now().Add(nextInterval)
next.Interval = nextInterval
m.taskQueue.Upsert(next)
return
}
desc := InstanceDescriptor{
Name: task.InstanceName,
Type: task.InstanceType,
LastInterval: task.Interval,
LastScheduled: task.NextRun,
}
if m.stalenessTracker != nil {
if snap, ok := m.stalenessTracker.snapshot(task.InstanceType, task.InstanceName); ok {
desc.LastSuccess = snap.LastSuccess
desc.LastFailure = snap.LastError
if snap.ChangeHash != "" {
desc.Metadata = map[string]any{"changeHash": snap.ChangeHash}
}
}
}
tasks := m.scheduler.BuildPlan(time.Now(), []InstanceDescriptor{desc}, m.taskQueue.Size())
if len(tasks) == 0 {
next := task
nextInterval := task.Interval
if nextInterval <= 0 && m.config != nil {
nextInterval = m.config.AdaptivePollingBaseInterval
}
if nextInterval <= 0 {
nextInterval = DefaultSchedulerConfig().BaseInterval
}
next.Interval = nextInterval
next.NextRun = time.Now().Add(nextInterval)
m.taskQueue.Upsert(next)
return
}
for _, next := range tasks {
m.taskQueue.Upsert(next)
}
}
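// sendToDeadLetter parks a repeatedly failing task on the dead-letter queue with a
// 30-minute retry interval and records diagnostic metadata about the failure.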
func (m *Monitor) sendToDeadLetter(task ScheduledTask, err error) {
if m.deadLetterQueue == nil {
log.Error().
Str("instance", task.InstanceName).
Str("type", string(task.InstanceType)).
Err(err).
Msg("Dead-letter queue unavailable; dropping task")
return
}
log.Error().
Str("instance", task.InstanceName).
Str("type", string(task.InstanceType)).
Err(err).
Msg("Routing task to dead-letter queue after repeated failures")
next := task
next.Interval = 30 * time.Minute
next.NextRun = time.Now().Add(next.Interval)
m.deadLetterQueue.Upsert(next)
m.updateDeadLetterMetrics()
key := schedulerKey(task.InstanceType, task.InstanceName)
now := time.Now()
m.mu.Lock()
if m.dlqInsightMap == nil {
m.dlqInsightMap = make(map[string]*dlqInsight)
}
info, ok := m.dlqInsightMap[key]
if !ok {
info = &dlqInsight{}
m.dlqInsightMap[key] = info
}
if info.FirstAttempt.IsZero() {
info.FirstAttempt = now
}
info.LastAttempt = now
info.RetryCount++
info.NextRetry = next.NextRun
if err != nil {
info.Reason = classifyDLQReason(err)
}
m.mu.Unlock()
}
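// classifyDLQReason maps a terminal error to a dead-letter reason label.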
func classifyDLQReason(err error) string {
if err == nil {
return ""
}
if errors.IsRetryableError(err) {
return "max_retry_attempts"
}
return "permanent_failure"
}
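// updateDeadLetterMetrics publishes the current dead-letter queue contents to the poll metrics.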
func (m *Monitor) updateDeadLetterMetrics() {
if m.pollMetrics == nil || m.deadLetterQueue == nil {
return
}
size := m.deadLetterQueue.Size()
if size <= 0 {
m.pollMetrics.UpdateDeadLetterCounts(nil)
return
}
tasks := m.deadLetterQueue.PeekAll(size)
m.pollMetrics.UpdateDeadLetterCounts(tasks)
}
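// updateBreakerMetric exports the circuit breaker state, failure count, and retry time for an instance.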
func (m *Monitor) updateBreakerMetric(instanceType InstanceType, instance string, breaker *circuitBreaker) {
if m.pollMetrics == nil || breaker == nil {
return
}
state, failures, retryAt, _, _ := breaker.stateDetails()
m.pollMetrics.SetBreakerState(string(instanceType), instance, state, failures, retryAt)
}
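// randomFloat returns a pseudo-random value in [0, 1) used for backoff jitter, lazily seeding
// the generator on first use. The lazy initialization is not lock-protected, so callers are
// expected to serialize access.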
func (m *Monitor) randomFloat() float64 {
if m.rng == nil {
m.rng = rand.New(rand.NewSource(time.Now().UnixNano()))
}
return m.rng.Float64()
}
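// updateQueueDepthMetric refreshes the queue depth gauge and queue snapshot metrics.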
func (m *Monitor) updateQueueDepthMetric() {
if m.pollMetrics == nil || m.taskQueue == nil {
return
}
snapshot := m.taskQueue.Snapshot()
m.pollMetrics.SetQueueDepth(snapshot.Depth)
m.pollMetrics.UpdateQueueSnapshot(snapshot)
}
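// allowExecution reports whether the instance's circuit breaker currently permits the task to run.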
func (m *Monitor) allowExecution(task ScheduledTask) bool {
if m.circuitBreakers == nil {
return true
}
key := schedulerKey(task.InstanceType, task.InstanceName)
breaker := m.ensureBreaker(key)
allowed := breaker.allow(time.Now())
m.updateBreakerMetric(task.InstanceType, task.InstanceName, breaker)
return allowed
}
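// ensureBreaker returns the circuit breaker for a scheduler key, creating one with the
// configured (or default) retry windows when none exists yet.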
func (m *Monitor) ensureBreaker(key string) *circuitBreaker {
m.mu.Lock()
defer m.mu.Unlock()
if m.circuitBreakers == nil {
m.circuitBreakers = make(map[string]*circuitBreaker)
}
if breaker, ok := m.circuitBreakers[key]; ok {
return breaker
}
baseRetry := m.breakerBaseRetry
if baseRetry <= 0 {
baseRetry = 5 * time.Second
}
maxDelay := m.breakerMaxDelay
if maxDelay <= 0 {
maxDelay = 5 * time.Minute
}
halfOpen := m.breakerHalfOpenWindow
if halfOpen <= 0 {
halfOpen = 30 * time.Second
}
breaker := newCircuitBreaker(3, baseRetry, maxDelay, halfOpen)
m.circuitBreakers[key] = breaker
return breaker
}
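// recordTaskResult updates failure counters, last-outcome tracking, poll status, and the
// circuit breaker after a poll attempt against the given instance.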
func (m *Monitor) recordTaskResult(instanceType InstanceType, instance string, pollErr error) {
if m == nil {
return
}
key := schedulerKey(instanceType, instance)
now := time.Now()
breaker := m.ensureBreaker(key)
m.mu.Lock()
status, ok := m.pollStatusMap[key]
if !ok {
status = &pollStatus{}
m.pollStatusMap[key] = status
}
if pollErr == nil {
if m.failureCounts != nil {
m.failureCounts[key] = 0
}
if m.lastOutcome != nil {
m.lastOutcome[key] = taskOutcome{
success: true,
transient: true,
err: nil,
recordedAt: now,
}
}
status.LastSuccess = now
status.ConsecutiveFailures = 0
status.FirstFailureAt = time.Time{}
m.mu.Unlock()
if breaker != nil {
breaker.recordSuccess()
m.updateBreakerMetric(instanceType, instance, breaker)
}
return
}
transient := isTransientError(pollErr)
category := "permanent"
if transient {
category = "transient"
}
if m.failureCounts != nil {
m.failureCounts[key] = m.failureCounts[key] + 1
}
if m.lastOutcome != nil {
m.lastOutcome[key] = taskOutcome{
success: false,
transient: transient,
err: pollErr,
recordedAt: now,
}
}
status.LastErrorAt = now
status.LastErrorMessage = pollErr.Error()
status.LastErrorCategory = category
status.ConsecutiveFailures++
if status.ConsecutiveFailures == 1 {
status.FirstFailureAt = now
}
m.mu.Unlock()
if breaker != nil {
breaker.recordFailure(now)
m.updateBreakerMetric(instanceType, instance, breaker)
}
}
// SchedulerHealthResponse contains complete scheduler health data for API exposure.
type SchedulerHealthResponse struct {
UpdatedAt time.Time `json:"updatedAt"`
Enabled bool `json:"enabled"`
Queue QueueSnapshot `json:"queue"`
DeadLetter DeadLetterSnapshot `json:"deadLetter"`
Breakers []BreakerSnapshot `json:"breakers,omitempty"`
Staleness []StalenessSnapshot `json:"staleness,omitempty"`
Instances []InstanceHealth `json:"instances"`
}
// DeadLetterSnapshot contains dead-letter queue data.
type DeadLetterSnapshot struct {
Count int `json:"count"`
Tasks []DeadLetterTask `json:"tasks"`
}
// SchedulerHealth returns a complete snapshot of scheduler health for API exposure.
func (m *Monitor) SchedulerHealth() SchedulerHealthResponse {
response := SchedulerHealthResponse{
UpdatedAt: time.Now(),
Enabled: m.config != nil && m.config.AdaptivePollingEnabled,
}
// Queue snapshot
if m.taskQueue != nil {
response.Queue = m.taskQueue.Snapshot()
if m.pollMetrics != nil {
m.pollMetrics.UpdateQueueSnapshot(response.Queue)
}
}
// Dead-letter queue snapshot
if m.deadLetterQueue != nil {
deadLetterTasks := m.deadLetterQueue.PeekAll(25) // limit to top 25
m.mu.RLock()
for i := range deadLetterTasks {
key := schedulerKey(InstanceType(deadLetterTasks[i].Type), deadLetterTasks[i].Instance)
if outcome, ok := m.lastOutcome[key]; ok && outcome.err != nil {
deadLetterTasks[i].LastError = outcome.err.Error()
}
if count, ok := m.failureCounts[key]; ok {
deadLetterTasks[i].Failures = count
}
}
m.mu.RUnlock()
response.DeadLetter = DeadLetterSnapshot{
Count: m.deadLetterQueue.Size(),
Tasks: deadLetterTasks,
}
m.updateDeadLetterMetrics()
}
// Circuit breaker snapshots
m.mu.RLock()
breakerSnapshots := make([]BreakerSnapshot, 0, len(m.circuitBreakers))
for key, breaker := range m.circuitBreakers {
state, failures, retryAt := breaker.State()
// Only include breakers that are not in default closed state with 0 failures
if state != "closed" || failures > 0 {
// Parse instance type and name from key
parts := strings.SplitN(key, "::", 2)
instanceType, instanceName := "unknown", key
if len(parts) == 2 {
instanceType, instanceName = parts[0], parts[1]
}
breakerSnapshots = append(breakerSnapshots, BreakerSnapshot{
Instance: instanceName,
Type: instanceType,
State: state,
Failures: failures,
RetryAt: retryAt,
})
}
}
m.mu.RUnlock()
response.Breakers = breakerSnapshots
// Staleness snapshots
if m.stalenessTracker != nil {
response.Staleness = m.stalenessTracker.Snapshot()
}
instanceInfos := make(map[string]*instanceInfo)
pollStatuses := make(map[string]pollStatus)
dlqInsights := make(map[string]dlqInsight)
breakerRefs := make(map[string]*circuitBreaker)
m.mu.RLock()
for k, v := range m.instanceInfoCache {
if v == nil {
continue
}
copyVal := *v
instanceInfos[k] = &copyVal
}
for k, v := range m.pollStatusMap {
if v == nil {
continue
}
pollStatuses[k] = *v
}
for k, v := range m.dlqInsightMap {
if v == nil {
continue
}
dlqInsights[k] = *v
}
for k, v := range m.circuitBreakers {
if v != nil {
breakerRefs[k] = v
}
}
m.mu.RUnlock()
for key, breaker := range breakerRefs {
instanceType := InstanceType("unknown")
instanceName := key
if parts := strings.SplitN(key, "::", 2); len(parts) == 2 {
if parts[0] != "" {
instanceType = InstanceType(parts[0])
}
if parts[1] != "" {
instanceName = parts[1]
}
}
m.updateBreakerMetric(instanceType, instanceName, breaker)
}
keySet := make(map[string]struct{})
for k := range instanceInfos {
if k != "" {
keySet[k] = struct{}{}
}
}
for k := range pollStatuses {
if k != "" {
keySet[k] = struct{}{}
}
}
for k := range dlqInsights {
if k != "" {
keySet[k] = struct{}{}
}
}
for k := range breakerRefs {
if k != "" {
keySet[k] = struct{}{}
}
}
for _, task := range response.DeadLetter.Tasks {
if task.Instance == "" {
continue
}
keySet[schedulerKey(InstanceType(task.Type), task.Instance)] = struct{}{}
}
for _, snap := range response.Staleness {
if snap.Instance == "" {
continue
}
keySet[schedulerKey(InstanceType(snap.Type), snap.Instance)] = struct{}{}
}
if len(keySet) > 0 {
keys := make([]string, 0, len(keySet))
for k := range keySet {
keys = append(keys, k)
}
sort.Strings(keys)
instances := make([]InstanceHealth, 0, len(keys))
for _, key := range keys {
instType := "unknown"
instName := key
if parts := strings.SplitN(key, "::", 2); len(parts) == 2 {
if parts[0] != "" {
instType = parts[0]
}
if parts[1] != "" {
instName = parts[1]
}
}
instType = strings.TrimSpace(instType)
instName = strings.TrimSpace(instName)
info := instanceInfos[key]
display := instName
connection := ""
if info != nil {
if instType == "unknown" || instType == "" {
if info.Type != "" {
instType = string(info.Type)
}
}
if strings.Contains(info.Key, "::") {
if parts := strings.SplitN(info.Key, "::", 2); len(parts) == 2 {
if instName == key {
instName = parts[1]
}
if (instType == "" || instType == "unknown") && parts[0] != "" {
instType = parts[0]
}
}
}
if info.DisplayName != "" {
display = info.DisplayName
}
if info.Connection != "" {
connection = info.Connection
}
}
display = strings.TrimSpace(display)
connection = strings.TrimSpace(connection)
if display == "" {
display = instName
}
if display == "" {
display = connection
}
if instType == "" {
instType = "unknown"
}
if instName == "" {
instName = key
}
status, hasStatus := pollStatuses[key]
instanceStatus := InstancePollStatus{}
if hasStatus {
instanceStatus.ConsecutiveFailures = status.ConsecutiveFailures
instanceStatus.LastSuccess = timePtr(status.LastSuccess)
if !status.FirstFailureAt.IsZero() {
instanceStatus.FirstFailureAt = timePtr(status.FirstFailureAt)
}
if !status.LastErrorAt.IsZero() && status.LastErrorMessage != "" {
instanceStatus.LastError = &ErrorDetail{
At: status.LastErrorAt,
Message: status.LastErrorMessage,
Category: status.LastErrorCategory,
}
}
}
breakerInfo := InstanceBreaker{
State: "closed",
FailureCount: 0,
}
if br, ok := breakerRefs[key]; ok && br != nil {
state, failures, retryAt, since, lastTransition := br.stateDetails()
if state != "" {
breakerInfo.State = state
}
breakerInfo.FailureCount = failures
breakerInfo.RetryAt = timePtr(retryAt)
breakerInfo.Since = timePtr(since)
breakerInfo.LastTransition = timePtr(lastTransition)
}
dlqInfo := InstanceDLQ{Present: false}
if dlq, ok := dlqInsights[key]; ok {
dlqInfo.Present = true
dlqInfo.Reason = dlq.Reason
dlqInfo.FirstAttempt = timePtr(dlq.FirstAttempt)
dlqInfo.LastAttempt = timePtr(dlq.LastAttempt)
dlqInfo.RetryCount = dlq.RetryCount
dlqInfo.NextRetry = timePtr(dlq.NextRetry)
}
instances = append(instances, InstanceHealth{
Key: key,
Type: instType,
DisplayName: display,
Instance: instName,
Connection: connection,
PollStatus: instanceStatus,
Breaker: breakerInfo,
DeadLetter: dlqInfo,
})
}
response.Instances = instances
} else {
response.Instances = []InstanceHealth{}
}
return response
}
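// isTransientError reports whether an error should be treated as transient: retryable errors
// and context cancellation/deadline both qualify.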
func isTransientError(err error) bool {
if err == nil {
return true
}
if errors.IsRetryableError(err) {
return true
}
if stderrors.Is(err, context.Canceled) || stderrors.Is(err, context.DeadlineExceeded) {
return true
}
return false
}
// pollPVEInstance polls a single PVE instance
func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, client PVEClientInterface) {
start := time.Now()
debugEnabled := logging.IsLevelEnabled(zerolog.DebugLevel)
var pollErr error
if m.pollMetrics != nil {
m.pollMetrics.IncInFlight("pve")
defer m.pollMetrics.DecInFlight("pve")
defer func() {
m.pollMetrics.RecordResult(PollResult{
InstanceName: instanceName,
InstanceType: "pve",
Success: pollErr == nil,
Error: pollErr,
StartTime: start,
EndTime: time.Now(),
})
}()
}
if m.stalenessTracker != nil {
defer func() {
if pollErr == nil {
m.stalenessTracker.UpdateSuccess(InstanceTypePVE, instanceName, nil)
} else {
m.stalenessTracker.UpdateError(InstanceTypePVE, instanceName)
}
}()
}
defer m.recordTaskResult(InstanceTypePVE, instanceName, pollErr)
// Check if context is cancelled
select {
case <-ctx.Done():
pollErr = ctx.Err()
if debugEnabled {
log.Debug().Str("instance", instanceName).Msg("Polling cancelled")
}
return
default:
}
if debugEnabled {
log.Debug().Str("instance", instanceName).Msg("Polling PVE instance")
}
// Get instance config
var instanceCfg *config.PVEInstance
for _, cfg := range m.config.PVEInstances {
if cfg.Name == instanceName {
instanceCfg = &cfg
break
}
}
if instanceCfg == nil {
pollErr = fmt.Errorf("pve instance config not found for %s", instanceName)
return
}
// Poll nodes
nodes, err := client.GetNodes(ctx)
if err != nil {
monErr := errors.WrapConnectionError("poll_nodes", instanceName, err)
pollErr = monErr
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to get nodes")
m.state.SetConnectionHealth(instanceName, false)
// Track auth failure if it's an authentication error
if errors.IsAuthError(err) {
m.recordAuthFailure(instanceName, "pve")
}
return
}
// Reset auth failures on successful connection
m.resetAuthFailures(instanceName, "pve")
// Check if client is a ClusterClient to determine health status
connectionHealthStr := "healthy"
if clusterClient, ok := client.(*proxmox.ClusterClient); ok {
// For cluster clients, check if all endpoints are healthy
healthStatus := clusterClient.GetHealthStatus()
healthyCount := 0
totalCount := len(healthStatus)
for _, isHealthy := range healthStatus {
if isHealthy {
healthyCount++
}
}
if healthyCount == 0 {
// All endpoints are down
connectionHealthStr = "error"
m.state.SetConnectionHealth(instanceName, false)
} else if healthyCount < totalCount {
// Some endpoints are down - degraded state
connectionHealthStr = "degraded"
m.state.SetConnectionHealth(instanceName, true) // Still functional but degraded
log.Warn().
Str("instance", instanceName).
Int("healthy", healthyCount).
Int("total", totalCount).
Msg("Cluster is in degraded state - some nodes are unreachable")
} else {
// All endpoints are healthy
connectionHealthStr = "healthy"
m.state.SetConnectionHealth(instanceName, true)
}
} else {
// Regular client - simple healthy/unhealthy
m.state.SetConnectionHealth(instanceName, true)
}
// Capture previous memory metrics so we can preserve them if detailed status fails
prevState := m.GetState()
prevNodeMemory := make(map[string]models.Memory)
prevInstanceNodes := make([]models.Node, 0)
for _, existingNode := range prevState.Nodes {
if existingNode.Instance != instanceName {
continue
}
prevNodeMemory[existingNode.ID] = existingNode.Memory
prevInstanceNodes = append(prevInstanceNodes, existingNode)
}
// Convert to models
var modelNodes []models.Node
for _, node := range nodes {
nodeStart := time.Now()
displayName := getNodeDisplayName(instanceCfg, node.Node)
modelNode := models.Node{
ID: instanceName + "-" + node.Node,
Name: node.Node,
DisplayName: displayName,
Instance: instanceName,
Host: instanceCfg.Host, // Add the actual host URL
Status: node.Status,
Type: "node",
CPU: safeFloat(node.CPU), // Proxmox reports CPU as a fraction of total (0-1)
Memory: models.Memory{
Total: int64(node.MaxMem),
Used: int64(node.Mem),
Free: int64(node.MaxMem - node.Mem),
Usage: safePercentage(float64(node.Mem), float64(node.MaxMem)),
},
Disk: models.Disk{
Total: int64(node.MaxDisk),
Used: int64(node.Disk),
Free: int64(node.MaxDisk - node.Disk),
Usage: safePercentage(float64(node.Disk), float64(node.MaxDisk)),
},
Uptime: int64(node.Uptime),
LoadAverage: []float64{},
LastSeen: time.Now(),
ConnectionHealth: connectionHealthStr, // Use the determined health status
IsClusterMember: instanceCfg.IsCluster,
ClusterName: instanceCfg.ClusterName,
}
nodeSnapshotRaw := NodeMemoryRaw{
Total: node.MaxMem,
Used: node.Mem,
Free: node.MaxMem - node.Mem,
FallbackTotal: node.MaxMem,
FallbackUsed: node.Mem,
FallbackFree: node.MaxMem - node.Mem,
FallbackCalculated: true,
ProxmoxMemorySource: "nodes-endpoint",
}
nodeMemorySource := "nodes-endpoint"
var nodeFallbackReason string
// Debug logging for disk metrics - note that these values can fluctuate
// due to thin provisioning and dynamic allocation
if node.Disk > 0 && node.MaxDisk > 0 {
log.Debug().
Str("node", node.Node).
Uint64("disk", node.Disk).
Uint64("maxDisk", node.MaxDisk).
Float64("diskUsage", safePercentage(float64(node.Disk), float64(node.MaxDisk))).
Msg("Node disk metrics from /nodes endpoint")
}
// Track whether we successfully replaced memory metrics with detailed status data
memoryUpdated := false
// Get detailed node info if available (skip for offline nodes)
if node.Status == "online" {
nodeInfo, nodeErr := client.GetNodeStatus(ctx, node.Node)
if nodeErr != nil {
nodeFallbackReason = "node-status-unavailable"
// If we can't get node status, log but continue with data from /nodes endpoint
if node.Disk > 0 && node.MaxDisk > 0 {
log.Warn().
Str("instance", instanceName).
Str("node", node.Node).
Err(nodeErr).
Uint64("usingDisk", node.Disk).
Uint64("usingMaxDisk", node.MaxDisk).
Msg("Could not get node status - using fallback metrics (memory will include cache/buffers)")
} else {
log.Warn().
Str("instance", instanceName).
Str("node", node.Node).
Err(nodeErr).
Uint64("disk", node.Disk).
Uint64("maxDisk", node.MaxDisk).
Msg("Could not get node status - no fallback metrics available (memory will include cache/buffers)")
}
} else if nodeInfo != nil {
if nodeInfo.Memory != nil {
nodeSnapshotRaw.Total = nodeInfo.Memory.Total
nodeSnapshotRaw.Used = nodeInfo.Memory.Used
nodeSnapshotRaw.Free = nodeInfo.Memory.Free
nodeSnapshotRaw.Available = nodeInfo.Memory.Available
nodeSnapshotRaw.Avail = nodeInfo.Memory.Avail
nodeSnapshotRaw.Buffers = nodeInfo.Memory.Buffers
nodeSnapshotRaw.Cached = nodeInfo.Memory.Cached
nodeSnapshotRaw.Shared = nodeInfo.Memory.Shared
nodeSnapshotRaw.EffectiveAvailable = nodeInfo.Memory.EffectiveAvailable()
nodeSnapshotRaw.ProxmoxMemorySource = "node-status"
nodeSnapshotRaw.FallbackCalculated = false
}
// Convert LoadAvg from interface{} to float64
loadAvg := make([]float64, 0, len(nodeInfo.LoadAvg))
for _, val := range nodeInfo.LoadAvg {
switch v := val.(type) {
case float64:
loadAvg = append(loadAvg, v)
case string:
if f, err := strconv.ParseFloat(v, 64); err == nil {
loadAvg = append(loadAvg, f)
}
}
}
modelNode.LoadAverage = loadAvg
modelNode.KernelVersion = nodeInfo.KernelVersion
modelNode.PVEVersion = nodeInfo.PVEVersion
// Prefer rootfs data for more accurate disk metrics, but ensure we have valid fallback
if nodeInfo.RootFS != nil && nodeInfo.RootFS.Total > 0 {
modelNode.Disk = models.Disk{
Total: int64(nodeInfo.RootFS.Total),
Used: int64(nodeInfo.RootFS.Used),
Free: int64(nodeInfo.RootFS.Free),
Usage: safePercentage(float64(nodeInfo.RootFS.Used), float64(nodeInfo.RootFS.Total)),
}
log.Debug().
Str("node", node.Node).
Uint64("rootfsUsed", nodeInfo.RootFS.Used).
Uint64("rootfsTotal", nodeInfo.RootFS.Total).
Float64("rootfsUsage", modelNode.Disk.Usage).
Msg("Using rootfs for disk metrics")
} else if node.Disk > 0 && node.MaxDisk > 0 {
// RootFS unavailable but we have valid disk data from /nodes endpoint
// Keep the values we already set from the nodes list
log.Debug().
Str("node", node.Node).
Bool("rootfsNil", nodeInfo.RootFS == nil).
Uint64("fallbackDisk", node.Disk).
Uint64("fallbackMaxDisk", node.MaxDisk).
Msg("RootFS data unavailable - using /nodes endpoint disk metrics")
} else {
// Neither rootfs nor valid node disk data available
log.Warn().
Str("node", node.Node).
Bool("rootfsNil", nodeInfo.RootFS == nil).
Uint64("nodeDisk", node.Disk).
Uint64("nodeMaxDisk", node.MaxDisk).
Msg("No valid disk metrics available for node")
}
// Update memory metrics to use Available field for more accurate usage
if nodeInfo.Memory != nil && nodeInfo.Memory.Total > 0 {
var actualUsed uint64
effectiveAvailable := nodeInfo.Memory.EffectiveAvailable()
usedRRDFallback := false
componentAvailable := nodeInfo.Memory.Free
if nodeInfo.Memory.Buffers > 0 {
if math.MaxUint64-componentAvailable < nodeInfo.Memory.Buffers {
componentAvailable = math.MaxUint64
} else {
componentAvailable += nodeInfo.Memory.Buffers
}
}
if nodeInfo.Memory.Cached > 0 {
if math.MaxUint64-componentAvailable < nodeInfo.Memory.Cached {
componentAvailable = math.MaxUint64
} else {
componentAvailable += nodeInfo.Memory.Cached
}
}
if nodeInfo.Memory.Total > 0 && componentAvailable > nodeInfo.Memory.Total {
componentAvailable = nodeInfo.Memory.Total
}
availableFromUsed := uint64(0)
if nodeInfo.Memory.Total > 0 && nodeInfo.Memory.Used > 0 && nodeInfo.Memory.Total >= nodeInfo.Memory.Used {
availableFromUsed = nodeInfo.Memory.Total - nodeInfo.Memory.Used
}
nodeSnapshotRaw.TotalMinusUsed = availableFromUsed
missingCacheMetrics := nodeInfo.Memory.Available == 0 &&
nodeInfo.Memory.Avail == 0 &&
nodeInfo.Memory.Buffers == 0 &&
nodeInfo.Memory.Cached == 0
if effectiveAvailable == 0 && missingCacheMetrics {
if memAvail, err := m.getNodeRRDMemAvailable(ctx, client, node.Node); err == nil && memAvail > 0 {
effectiveAvailable = memAvail
usedRRDFallback = true
} else if err != nil {
log.Debug().
Err(err).
Str("instance", instanceName).
Str("node", node.Node).
Msg("RRD memavailable fallback unavailable")
}
}
const totalMinusUsedGapTolerance uint64 = 16 * 1024 * 1024
gapGreaterThanComponents := false
if availableFromUsed > componentAvailable {
gap := availableFromUsed - componentAvailable
if componentAvailable == 0 || gap >= totalMinusUsedGapTolerance {
gapGreaterThanComponents = true
}
}
derivedFromTotalMinusUsed := !usedRRDFallback &&
missingCacheMetrics &&
availableFromUsed > 0 &&
gapGreaterThanComponents &&
effectiveAvailable == availableFromUsed
switch {
case effectiveAvailable > 0 && effectiveAvailable <= nodeInfo.Memory.Total:
// Prefer available/avail fields or derived buffers+cache values when present.
actualUsed = nodeInfo.Memory.Total - effectiveAvailable
if actualUsed > nodeInfo.Memory.Total {
actualUsed = nodeInfo.Memory.Total
}
logCtx := log.Debug().
Str("node", node.Node).
Uint64("total", nodeInfo.Memory.Total).
Uint64("effectiveAvailable", effectiveAvailable).
Uint64("actualUsed", actualUsed).
Float64("usage", safePercentage(float64(actualUsed), float64(nodeInfo.Memory.Total)))
if usedRRDFallback {
logCtx.Msg("Node memory: using RRD memavailable fallback (excludes reclaimable cache)")
nodeMemorySource = "rrd-memavailable"
nodeFallbackReason = "rrd-memavailable"
nodeSnapshotRaw.FallbackCalculated = true
nodeSnapshotRaw.ProxmoxMemorySource = "rrd-memavailable"
} else if nodeInfo.Memory.Available > 0 {
logCtx.Msg("Node memory: using available field (excludes reclaimable cache)")
nodeMemorySource = "available-field"
} else if nodeInfo.Memory.Avail > 0 {
logCtx.Msg("Node memory: using avail field (excludes reclaimable cache)")
nodeMemorySource = "avail-field"
} else if derivedFromTotalMinusUsed {
logCtx.
Uint64("availableFromUsed", availableFromUsed).
Uint64("reportedFree", nodeInfo.Memory.Free).
Msg("Node memory: derived available from total-used gap (cache fields missing)")
nodeMemorySource = "derived-total-minus-used"
if nodeFallbackReason == "" {
nodeFallbackReason = "node-status-total-minus-used"
}
nodeSnapshotRaw.FallbackCalculated = true
nodeSnapshotRaw.ProxmoxMemorySource = "node-status-total-minus-used"
} else {
logCtx.
Uint64("free", nodeInfo.Memory.Free).
Uint64("buffers", nodeInfo.Memory.Buffers).
Uint64("cached", nodeInfo.Memory.Cached).
Msg("Node memory: derived available from free+buffers+cached (excludes reclaimable cache)")
nodeMemorySource = "derived-free-buffers-cached"
}
default:
// Fallback to traditional used memory if no cache-aware data is exposed
actualUsed = nodeInfo.Memory.Used
if actualUsed > nodeInfo.Memory.Total {
actualUsed = nodeInfo.Memory.Total
}
log.Debug().
Str("node", node.Node).
Uint64("total", nodeInfo.Memory.Total).
Uint64("used", actualUsed).
Msg("Node memory: no cache-aware metrics - using traditional calculation (includes cache)")
nodeMemorySource = "node-status-used"
}
nodeSnapshotRaw.EffectiveAvailable = effectiveAvailable
free := int64(nodeInfo.Memory.Total - actualUsed)
if free < 0 {
free = 0
}
modelNode.Memory = models.Memory{
Total: int64(nodeInfo.Memory.Total),
Used: int64(actualUsed),
Free: free,
Usage: safePercentage(float64(actualUsed), float64(nodeInfo.Memory.Total)),
}
memoryUpdated = true
}
if nodeInfo.CPUInfo != nil {
// Use MaxCPU from node data for logical CPU count (includes hyperthreading)
// If MaxCPU is not available or 0, fall back to physical cores
logicalCores := node.MaxCPU
if logicalCores == 0 {
logicalCores = nodeInfo.CPUInfo.Cores
}
mhzStr := nodeInfo.CPUInfo.GetMHzString()
log.Debug().
Str("node", node.Node).
Str("model", nodeInfo.CPUInfo.Model).
Int("cores", nodeInfo.CPUInfo.Cores).
Int("logicalCores", logicalCores).
Int("sockets", nodeInfo.CPUInfo.Sockets).
Str("mhz", mhzStr).
Msg("Node CPU info from Proxmox")
modelNode.CPUInfo = models.CPUInfo{
Model: nodeInfo.CPUInfo.Model,
Cores: logicalCores, // Use logical cores for display
Sockets: nodeInfo.CPUInfo.Sockets,
MHz: mhzStr,
}
}
}
}
// If we couldn't update memory metrics using detailed status, preserve previous accurate values if available
if !memoryUpdated && node.Status == "online" {
if prevMem, exists := prevNodeMemory[modelNode.ID]; exists && prevMem.Total > 0 {
total := int64(node.MaxMem)
if total == 0 {
total = prevMem.Total
}
used := prevMem.Used
if total > 0 && used > total {
used = total
}
free := total - used
if free < 0 {
free = 0
}
preserved := prevMem
preserved.Total = total
preserved.Used = used
preserved.Free = free
preserved.Usage = safePercentage(float64(used), float64(total))
modelNode.Memory = preserved
log.Debug().
Str("instance", instanceName).
Str("node", node.Node).
Msg("Preserving previous memory metrics - node status unavailable this cycle")
if nodeFallbackReason == "" {
nodeFallbackReason = "preserved-previous-snapshot"
}
nodeMemorySource = "previous-snapshot"
if nodeSnapshotRaw.ProxmoxMemorySource == "node-status" && nodeSnapshotRaw.Total == 0 {
nodeSnapshotRaw.ProxmoxMemorySource = "previous-snapshot"
}
}
}
m.recordNodeSnapshot(instanceName, node.Node, NodeMemorySnapshot{
RetrievedAt: time.Now(),
MemorySource: nodeMemorySource,
FallbackReason: nodeFallbackReason,
Memory: modelNode.Memory,
Raw: nodeSnapshotRaw,
})
// Collect temperature data via SSH (non-blocking, best effort)
// Only attempt for online nodes
if node.Status == "online" && m.tempCollector != nil {
tempCtx, tempCancel := context.WithTimeout(ctx, 30*time.Second) // Increased to accommodate SSH operations via proxy
// Determine SSH hostname to use (most robust approach):
// 1. For cluster nodes: Try to find the node's specific IP or host from ClusterEndpoints
// 2. For standalone nodes: Use the Host URL from config
// 3. Fallback: Use node name (works for simple DNS/hosts setups)
sshHost := node.Node // Default fallback
if modelNode.IsClusterMember && instanceCfg.IsCluster {
// Look up this specific node in cluster endpoints to get its individual address
for _, ep := range instanceCfg.ClusterEndpoints {
if ep.NodeName == node.Node {
// Prefer IP address for reliability
if ep.IP != "" {
sshHost = ep.IP
} else if ep.Host != "" {
sshHost = ep.Host
}
break
}
}
} else if !modelNode.IsClusterMember {
// Standalone node: use the Host URL from config
sshHost = modelNode.Host
}
temp, err := m.tempCollector.CollectTemperature(tempCtx, sshHost, node.Node)
tempCancel()
if err == nil && temp != nil && temp.Available {
// Get the current CPU temperature (prefer package, fall back to max)
currentTemp := temp.CPUPackage
if currentTemp == 0 && temp.CPUMax > 0 {
currentTemp = temp.CPUMax
}
// Find previous temperature data for this node to preserve min/max
var prevTemp *models.Temperature
for _, prevNode := range prevInstanceNodes {
if prevNode.ID == modelNode.ID && prevNode.Temperature != nil {
prevTemp = prevNode.Temperature
break
}
}
// Initialize or update min/max tracking
if prevTemp != nil && prevTemp.CPUMin > 0 {
// Preserve existing min/max and update if necessary
temp.CPUMin = prevTemp.CPUMin
temp.CPUMaxRecord = prevTemp.CPUMaxRecord
temp.MinRecorded = prevTemp.MinRecorded
temp.MaxRecorded = prevTemp.MaxRecorded
// Update min if current is lower
if currentTemp > 0 && currentTemp < temp.CPUMin {
temp.CPUMin = currentTemp
temp.MinRecorded = time.Now()
}
// Update max if current is higher
if currentTemp > temp.CPUMaxRecord {
temp.CPUMaxRecord = currentTemp
temp.MaxRecorded = time.Now()
}
} else if currentTemp > 0 {
// First reading - initialize min/max to current value
temp.CPUMin = currentTemp
temp.CPUMaxRecord = currentTemp
temp.MinRecorded = time.Now()
temp.MaxRecorded = time.Now()
}
modelNode.Temperature = temp
log.Debug().
Str("node", node.Node).
Str("sshHost", sshHost).
Float64("cpuPackage", temp.CPUPackage).
Float64("cpuMax", temp.CPUMax).
Float64("cpuMin", temp.CPUMin).
Float64("cpuMaxRecord", temp.CPUMaxRecord).
Int("nvmeCount", len(temp.NVMe)).
Msg("Collected temperature data")
} else if err != nil {
log.Debug().
Err(err).
Str("node", node.Node).
Str("sshHost", sshHost).
Bool("isCluster", modelNode.IsClusterMember).
Int("endpointCount", len(instanceCfg.ClusterEndpoints)).
Msg("Temperature collection failed - check SSH access")
}
}
if m.pollMetrics != nil {
nodeNameLabel := strings.TrimSpace(node.Node)
if nodeNameLabel == "" {
nodeNameLabel = strings.TrimSpace(modelNode.DisplayName)
}
if nodeNameLabel == "" {
nodeNameLabel = "unknown-node"
}
success := true
nodeErrReason := ""
health := strings.ToLower(strings.TrimSpace(modelNode.ConnectionHealth))
if health != "" && health != "healthy" {
success = false
nodeErrReason = fmt.Sprintf("connection health %s", health)
}
status := strings.ToLower(strings.TrimSpace(modelNode.Status))
if success && status != "" && status != "online" {
success = false
nodeErrReason = fmt.Sprintf("status %s", status)
}
var nodeErr error
if !success {
if nodeErrReason == "" {
nodeErrReason = "unknown node error"
}
nodeErr = stderrors.New(nodeErrReason)
}
m.pollMetrics.RecordNodeResult(NodePollResult{
InstanceName: instanceName,
InstanceType: "pve",
NodeName: nodeNameLabel,
Success: success,
Error: nodeErr,
StartTime: nodeStart,
EndTime: time.Now(),
})
}
modelNodes = append(modelNodes, modelNode)
}
if len(modelNodes) == 0 && len(prevInstanceNodes) > 0 {
log.Warn().
Str("instance", instanceName).
Int("previousCount", len(prevInstanceNodes)).
Msg("No Proxmox nodes returned this cycle - preserving previous state")
// Mark connection health as degraded to reflect polling failure
m.state.SetConnectionHealth(instanceName, false)
preserved := make([]models.Node, 0, len(prevInstanceNodes))
for _, prevNode := range prevInstanceNodes {
nodeCopy := prevNode
nodeCopy.Status = "offline"
nodeCopy.ConnectionHealth = "error"
nodeCopy.Uptime = 0
nodeCopy.CPU = 0
preserved = append(preserved, nodeCopy)
}
modelNodes = preserved
}
// Update state first so we have nodes available
m.state.UpdateNodesForInstance(instanceName, modelNodes)
// Now get storage data to use as fallback for disk metrics if needed
storageByNode := make(map[string]models.Disk)
if instanceCfg.MonitorStorage {
_, err := client.GetAllStorage(ctx)
if err == nil {
for _, node := range nodes {
// Skip offline nodes to avoid 595 errors
if node.Status != "online" {
continue
}
nodeStorages, err := client.GetStorage(ctx, node.Node)
if err == nil {
// Look for local or local-lvm storage as most stable disk metric
for _, storage := range nodeStorages {
if reason, skip := readOnlyFilesystemReason(storage.Type, storage.Total, storage.Used); skip {
log.Debug().
Str("node", node.Node).
Str("storage", storage.Storage).
Str("type", storage.Type).
Str("skipReason", reason).
Uint64("total", storage.Total).
Uint64("used", storage.Used).
Msg("Skipping read-only storage while building disk fallback")
continue
}
if storage.Storage == "local" || storage.Storage == "local-lvm" {
disk := models.Disk{
Total: int64(storage.Total),
Used: int64(storage.Used),
Free: int64(storage.Available),
Usage: safePercentage(float64(storage.Used), float64(storage.Total)),
}
// Prefer "local" over "local-lvm"
if _, exists := storageByNode[node.Node]; !exists || storage.Storage == "local" {
storageByNode[node.Node] = disk
log.Debug().
Str("node", node.Node).
Str("storage", storage.Storage).
Float64("usage", disk.Usage).
Msg("Using storage for disk metrics fallback")
}
}
}
}
}
}
}
// Poll physical disks for health monitoring (enabled by default unless explicitly disabled)
// Skip if MonitorPhysicalDisks is explicitly set to false
if instanceCfg.MonitorPhysicalDisks != nil && !*instanceCfg.MonitorPhysicalDisks {
log.Debug().Str("instance", instanceName).Msg("Physical disk monitoring explicitly disabled")
// Keep any existing disk data visible (don't clear it)
} else {
// Enabled by default (when nil or true)
// Determine polling interval (default 5 minutes to avoid spinning up HDDs too frequently)
pollingInterval := 5 * time.Minute
if instanceCfg.PhysicalDiskPollingMinutes > 0 {
pollingInterval = time.Duration(instanceCfg.PhysicalDiskPollingMinutes) * time.Minute
}
// Check if enough time has elapsed since last poll
m.mu.Lock()
lastPoll, exists := m.lastPhysicalDiskPoll[instanceName]
shouldPoll := !exists || time.Since(lastPoll) >= pollingInterval
if shouldPoll {
m.lastPhysicalDiskPoll[instanceName] = time.Now()
}
m.mu.Unlock()
if !shouldPoll {
log.Debug().
Str("instance", instanceName).
Dur("sinceLastPoll", time.Since(lastPoll)).
Dur("interval", pollingInterval).
Msg("Skipping physical disk poll - interval not elapsed")
// Refresh NVMe temperatures using the latest sensor data even when we skip the disk poll
currentState := m.state.GetSnapshot()
existing := make([]models.PhysicalDisk, 0)
for _, disk := range currentState.PhysicalDisks {
if disk.Instance == instanceName {
existing = append(existing, disk)
}
}
if len(existing) > 0 {
updated := mergeNVMeTempsIntoDisks(existing, modelNodes)
m.state.UpdatePhysicalDisks(instanceName, updated)
}
} else {
log.Debug().
Int("nodeCount", len(nodes)).
Dur("interval", pollingInterval).
Msg("Starting disk health polling")
// Get existing disks from state to preserve data for offline nodes
currentState := m.state.GetSnapshot()
existingDisksMap := make(map[string]models.PhysicalDisk)
for _, disk := range currentState.PhysicalDisks {
if disk.Instance == instanceName {
existingDisksMap[disk.ID] = disk
}
}
var allDisks []models.PhysicalDisk
polledNodes := make(map[string]bool) // Track which nodes we successfully polled
for _, node := range nodes {
// Skip offline nodes but preserve their existing disk data
if node.Status != "online" {
log.Debug().Str("node", node.Node).Msg("Skipping disk poll for offline node - preserving existing data")
continue
}
// Get disk list for this node
log.Debug().Str("node", node.Node).Msg("Getting disk list for node")
disks, err := client.GetDisks(ctx, node.Node)
if err != nil {
// Check if it's a permission error or if the endpoint doesn't exist
if strings.Contains(err.Error(), "401") || strings.Contains(err.Error(), "403") {
log.Warn().
Str("node", node.Node).
Err(err).
Msg("Insufficient permissions to access disk information - check API token permissions")
} else if strings.Contains(err.Error(), "404") || strings.Contains(err.Error(), "501") {
log.Info().
Str("node", node.Node).
Msg("Disk monitoring not available on this node (may be using non-standard storage)")
} else {
log.Warn().
Str("node", node.Node).
Err(err).
Msg("Failed to get disk list")
}
continue
}
log.Debug().
Str("node", node.Node).
Int("diskCount", len(disks)).
Msg("Got disk list for node")
// Mark this node as successfully polled
polledNodes[node.Node] = true
// Check each disk for health issues and add to state
for _, disk := range disks {
// Create PhysicalDisk model
diskID := fmt.Sprintf("%s-%s-%s", instanceName, node.Node, strings.ReplaceAll(disk.DevPath, "/", "-"))
physicalDisk := models.PhysicalDisk{
ID: diskID,
Node: node.Node,
Instance: instanceName,
DevPath: disk.DevPath,
Model: disk.Model,
Serial: disk.Serial,
Type: disk.Type,
Size: disk.Size,
Health: disk.Health,
Wearout: disk.Wearout,
RPM: disk.RPM,
Used: disk.Used,
LastChecked: time.Now(),
}
allDisks = append(allDisks, physicalDisk)
log.Debug().
Str("node", node.Node).
Str("disk", disk.DevPath).
Str("model", disk.Model).
Str("health", disk.Health).
Int("wearout", disk.Wearout).
Msg("Checking disk health")
normalizedHealth := strings.ToUpper(strings.TrimSpace(disk.Health))
if normalizedHealth != "" && normalizedHealth != "UNKNOWN" && normalizedHealth != "PASSED" && normalizedHealth != "OK" {
// Disk has failed or is failing - alert manager will handle this
log.Warn().
Str("node", node.Node).
Str("disk", disk.DevPath).
Str("model", disk.Model).
Str("health", disk.Health).
Int("wearout", disk.Wearout).
Msg("Disk health issue detected")
// Pass disk info to alert manager
m.alertManager.CheckDiskHealth(instanceName, node.Node, disk)
} else if disk.Wearout > 0 && disk.Wearout < 10 {
// Low wearout warning (less than 10% life remaining)
log.Warn().
Str("node", node.Node).
Str("disk", disk.DevPath).
Str("model", disk.Model).
Int("wearout", disk.Wearout).
Msg("SSD wearout critical - less than 10% life remaining")
// Pass to alert manager for wearout alert
m.alertManager.CheckDiskHealth(instanceName, node.Node, disk)
}
}
}
// Preserve existing disk data for nodes that weren't polled (offline or error)
preservedCount := 0
for _, existingDisk := range existingDisksMap {
// Only preserve if we didn't poll this node
if !polledNodes[existingDisk.Node] {
// Keep the existing disk data; LastChecked retains its old timestamp so the entry reads as stale
allDisks = append(allDisks, existingDisk)
preservedCount++
log.Debug().
Str("node", existingDisk.Node).
Str("disk", existingDisk.DevPath).
Msg("Preserving existing disk data for unpolled node")
}
}
allDisks = mergeNVMeTempsIntoDisks(allDisks, modelNodes)
// Update physical disks in state
log.Debug().
Str("instance", instanceName).
Int("diskCount", len(allDisks)).
Int("preservedCount", preservedCount).
Msg("Updating physical disks in state")
m.state.UpdatePhysicalDisks(instanceName, allDisks)
}
}
// Note: Physical disk monitoring is now enabled by default with a 5-minute polling interval.
// Users can explicitly disable it in node settings. Disk data is preserved between polls.
// Update nodes with storage fallback if rootfs was not available
for i := range modelNodes {
if modelNodes[i].Disk.Total == 0 {
if disk, exists := storageByNode[modelNodes[i].Name]; exists {
modelNodes[i].Disk = disk
log.Debug().
Str("node", modelNodes[i].Name).
Float64("usage", disk.Usage).
Msg("Applied storage fallback for disk metrics")
}
}
if modelNodes[i].Status == "online" {
// Record node metrics history only for online nodes
now := time.Now()
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "cpu", modelNodes[i].CPU*100, now)
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "memory", modelNodes[i].Memory.Usage, now)
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "disk", modelNodes[i].Disk.Usage, now)
}
// Check thresholds for alerts
m.alertManager.CheckNode(modelNodes[i])
}
// Update state again with corrected disk metrics
m.state.UpdateNodesForInstance(instanceName, modelNodes)
// Clean up alerts for nodes that no longer exist
// Get all nodes from the global state (includes all instances)
existingNodes := make(map[string]bool)
allState := m.state.GetSnapshot()
for _, node := range allState.Nodes {
existingNodes[node.Name] = true
}
m.alertManager.CleanupAlertsForNodes(existingNodes)
// Periodically re-check cluster status for nodes marked as standalone
// This addresses issue #437 where clusters aren't detected on first attempt
if !instanceCfg.IsCluster {
// Check every 5 minutes if this is actually a cluster
if time.Since(m.lastClusterCheck[instanceName]) > 5*time.Minute {
m.lastClusterCheck[instanceName] = time.Now()
// Try to detect if this is actually a cluster
isActuallyCluster, checkErr := client.IsClusterMember(ctx)
if checkErr == nil && isActuallyCluster {
// This node is actually part of a cluster!
log.Info().
Str("instance", instanceName).
Msg("Detected that standalone node is actually part of a cluster - updating configuration")
// Update the configuration
for i := range m.config.PVEInstances {
if m.config.PVEInstances[i].Name == instanceName {
m.config.PVEInstances[i].IsCluster = true
// Note: We can't get the cluster name here without direct client access
// It will be detected on the next configuration update
log.Info().
Str("instance", instanceName).
Msg("Marked node as cluster member - cluster name will be detected on next update")
// Save the updated configuration
if m.persistence != nil {
if err := m.persistence.SaveNodesConfig(m.config.PVEInstances, m.config.PBSInstances, m.config.PMGInstances); err != nil {
log.Warn().Err(err).Msg("Failed to persist updated node configuration")
}
}
break
}
}
}
}
}
// Update cluster endpoint online status if this is a cluster
if instanceCfg.IsCluster && len(instanceCfg.ClusterEndpoints) > 0 {
// Create a map of online nodes from our polling results
onlineNodes := make(map[string]bool)
for _, node := range modelNodes {
// Node is online if we successfully got its data
onlineNodes[node.Name] = node.Status == "online"
}
// Update the online status for each cluster endpoint
for i := range instanceCfg.ClusterEndpoints {
if online, exists := onlineNodes[instanceCfg.ClusterEndpoints[i].NodeName]; exists {
instanceCfg.ClusterEndpoints[i].Online = online
if online {
instanceCfg.ClusterEndpoints[i].LastSeen = time.Now()
}
}
}
// Update the config with the new online status
// This is needed so the UI can reflect the current status
for idx, cfg := range m.config.PVEInstances {
if cfg.Name == instanceName {
m.config.PVEInstances[idx].ClusterEndpoints = instanceCfg.ClusterEndpoints
break
}
}
}
// Poll VMs and containers together using cluster/resources for efficiency
if instanceCfg.MonitorVMs || instanceCfg.MonitorContainers {
select {
case <-ctx.Done():
pollErr = ctx.Err()
return
default:
// Always try the efficient cluster/resources endpoint first
// This endpoint works on both clustered and standalone nodes
// Testing confirmed it works on standalone nodes like pimox
useClusterEndpoint := m.pollVMsAndContainersEfficient(ctx, instanceName, client)
if !useClusterEndpoint {
// Fall back to traditional polling only if cluster/resources not available
// This should be rare - only for very old Proxmox versions
log.Debug().
Str("instance", instanceName).
Msg("cluster/resources endpoint not available, using traditional polling")
// Check if configuration needs updating
if instanceCfg.IsCluster {
isActuallyCluster, checkErr := client.IsClusterMember(ctx)
if checkErr == nil && !isActuallyCluster {
log.Warn().
Str("instance", instanceName).
Msg("Instance marked as cluster but is actually standalone - consider updating configuration")
instanceCfg.IsCluster = false
}
}
// Use optimized parallel polling for better performance
if instanceCfg.MonitorVMs {
m.pollVMsWithNodes(ctx, instanceName, client, nodes)
}
if instanceCfg.MonitorContainers {
m.pollContainersWithNodes(ctx, instanceName, client, nodes)
}
}
}
}
// Poll storage if enabled
if instanceCfg.MonitorStorage {
select {
case <-ctx.Done():
pollErr = ctx.Err()
return
default:
m.pollStorageWithNodes(ctx, instanceName, client, nodes)
}
}
// Poll backups if enabled - respect configured interval or cycle gating
if instanceCfg.MonitorBackups {
if !m.config.EnableBackupPolling {
log.Debug().
Str("instance", instanceName).
Msg("Skipping backup polling - globally disabled")
} else {
now := time.Now()
m.mu.RLock()
lastPoll := m.lastPVEBackupPoll[instanceName]
m.mu.RUnlock()
shouldPoll, reason, newLast := m.shouldRunBackupPoll(lastPoll, now)
if !shouldPoll {
if reason != "" {
log.Debug().
Str("instance", instanceName).
Str("reason", reason).
Msg("Skipping PVE backup polling this cycle")
}
} else {
select {
case <-ctx.Done():
pollErr = ctx.Err()
return
default:
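// Record the poll timestamp up front (before launching the goroutine) so overlapping
// cycles don't start a second backup poll; it is refreshed again once the background
// work completes.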
m.mu.Lock()
m.lastPVEBackupPoll[instanceName] = newLast
m.mu.Unlock()
// Run backup polling in a separate goroutine to avoid blocking real-time stats
go func(startTime time.Time, inst string, pveClient PVEClientInterface) {
timeout := m.calculateBackupOperationTimeout(inst)
log.Info().
Str("instance", inst).
Dur("timeout", timeout).
Msg("Starting background backup/snapshot polling")
// Create a separate context with longer timeout for backup operations
backupCtx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
// Poll backup tasks
m.pollBackupTasks(backupCtx, inst, pveClient)
// Poll storage backups - pass nodes to avoid duplicate API calls
m.pollStorageBackupsWithNodes(backupCtx, inst, pveClient, nodes)
// Poll guest snapshots
m.pollGuestSnapshots(backupCtx, inst, pveClient)
duration := time.Since(startTime)
log.Info().
Str("instance", inst).
Dur("duration", duration).
Msg("Completed background backup/snapshot polling")
// Record actual completion time for interval scheduling
m.mu.Lock()
m.lastPVEBackupPoll[inst] = time.Now()
m.mu.Unlock()
}(now, instanceName, client)
}
}
}
}
}
// pollVMsAndContainersEfficient uses the cluster/resources endpoint to get all VMs and containers in one call
// This works on both clustered and standalone nodes for efficient polling
func (m *Monitor) pollVMsAndContainersEfficient(ctx context.Context, instanceName string, client PVEClientInterface) bool {
log.Info().Str("instance", instanceName).Msg("Polling VMs and containers using efficient cluster/resources endpoint")
// Get all resources in a single API call
resources, err := client.GetClusterResources(ctx, "vm")
if err != nil {
log.Debug().Err(err).Str("instance", instanceName).Msg("cluster/resources not available, falling back to traditional polling")
return false
}
var allVMs []models.VM
var allContainers []models.Container
for _, res := range resources {
// Avoid duplicating node name in ID when instance name equals node name
var guestID string
if instanceName == res.Node {
guestID = fmt.Sprintf("%s-%d", res.Node, res.VMID)
} else {
guestID = fmt.Sprintf("%s-%s-%d", instanceName, res.Node, res.VMID)
}
// Debug log the resource type
log.Debug().
Str("instance", instanceName).
Str("name", res.Name).
Int("vmid", res.VMID).
Str("type", res.Type).
Msg("Processing cluster resource")
// Initialize I/O metrics from cluster resources (may be 0 for VMs)
diskReadBytes := int64(res.DiskRead)
diskWriteBytes := int64(res.DiskWrite)
networkInBytes := int64(res.NetIn)
networkOutBytes := int64(res.NetOut)
var individualDisks []models.Disk // Store individual filesystems for multi-disk monitoring
var ipAddresses []string
var networkInterfaces []models.GuestNetworkInterface
var osName, osVersion string
if res.Type == "qemu" {
// Skip VM templates (not monitored as running guests)
if res.Template == 1 {
continue
}
memTotal := res.MaxMem
memUsed := res.Mem
memorySource := "cluster-resources"
guestRaw := VMMemoryRaw{
ListingMem: res.Mem,
ListingMaxMem: res.MaxMem,
}
var detailedStatus *proxmox.VMStatus
// Try to get actual disk usage from guest agent if VM is running
diskUsed := res.Disk
diskTotal := res.MaxDisk
diskFree := diskTotal - diskUsed
diskUsage := safePercentage(float64(diskUsed), float64(diskTotal))
// If a running VM reports 0 disk usage despite having an allocated disk, the guest agent is likely unavailable.
// Set usage to -1 to indicate "unknown" rather than showing a misleading 0%.
if res.Type == "qemu" && diskUsed == 0 && diskTotal > 0 && res.Status == "running" {
diskUsage = -1
}
// For running VMs, always try to get filesystem info from guest agent
// The cluster/resources endpoint often returns 0 or incorrect values for disk usage
// We should prefer guest agent data when available for accurate metrics
if res.Status == "running" && res.Type == "qemu" {
// First check if agent is enabled by getting VM status
status, err := client.GetVMStatus(ctx, res.Node, res.VMID)
if err != nil {
log.Debug().
Err(err).
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Msg("Could not get VM status to check guest agent availability")
} else if status != nil {
detailedStatus = status
guestRaw.StatusMaxMem = detailedStatus.MaxMem
guestRaw.StatusMem = detailedStatus.Mem
guestRaw.StatusFreeMem = detailedStatus.FreeMem
guestRaw.Balloon = detailedStatus.Balloon
guestRaw.BalloonMin = detailedStatus.BalloonMin
guestRaw.Agent = detailedStatus.Agent
memAvailable := uint64(0)
if detailedStatus.MemInfo != nil {
guestRaw.MemInfoUsed = detailedStatus.MemInfo.Used
guestRaw.MemInfoFree = detailedStatus.MemInfo.Free
guestRaw.MemInfoTotal = detailedStatus.MemInfo.Total
guestRaw.MemInfoAvailable = detailedStatus.MemInfo.Available
guestRaw.MemInfoBuffers = detailedStatus.MemInfo.Buffers
guestRaw.MemInfoCached = detailedStatus.MemInfo.Cached
guestRaw.MemInfoShared = detailedStatus.MemInfo.Shared
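// Prefer MemInfo.Available when the guest agent reports it; otherwise approximate it
// as Free+Buffers+Cached so page cache is not counted as used memory.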
switch {
case detailedStatus.MemInfo.Available > 0:
memAvailable = detailedStatus.MemInfo.Available
memorySource = "meminfo-available"
case detailedStatus.MemInfo.Free > 0 ||
detailedStatus.MemInfo.Buffers > 0 ||
detailedStatus.MemInfo.Cached > 0:
memAvailable = detailedStatus.MemInfo.Free +
detailedStatus.MemInfo.Buffers +
detailedStatus.MemInfo.Cached
memorySource = "meminfo-derived"
}
}
// Use actual disk I/O values from detailed status
diskReadBytes = int64(detailedStatus.DiskRead)
diskWriteBytes = int64(detailedStatus.DiskWrite)
networkInBytes = int64(detailedStatus.NetIn)
networkOutBytes = int64(detailedStatus.NetOut)
if detailedStatus.Balloon > 0 && detailedStatus.Balloon < detailedStatus.MaxMem {
memTotal = detailedStatus.Balloon
guestRaw.DerivedFromBall = true
} else if detailedStatus.MaxMem > 0 {
memTotal = detailedStatus.MaxMem
guestRaw.DerivedFromBall = false
}
switch {
case memAvailable > 0:
if memAvailable > memTotal {
memAvailable = memTotal
}
memUsed = memTotal - memAvailable
case detailedStatus.FreeMem > 0 && memTotal >= detailedStatus.FreeMem:
memUsed = memTotal - detailedStatus.FreeMem
memorySource = "status-freemem"
case detailedStatus.Mem > 0:
memUsed = detailedStatus.Mem
memorySource = "status-mem"
}
if memUsed > memTotal {
memUsed = memTotal
}
// Gather guest metadata from the agent when available
guestIPs, guestIfaces, guestOSName, guestOSVersion := m.fetchGuestAgentMetadata(ctx, client, instanceName, res.Node, res.Name, res.VMID, detailedStatus)
if len(guestIPs) > 0 {
ipAddresses = guestIPs
}
if len(guestIfaces) > 0 {
networkInterfaces = guestIfaces
}
if guestOSName != "" {
osName = guestOSName
}
if guestOSVersion != "" {
osVersion = guestOSVersion
}
// Always try to get filesystem info if agent is enabled
// Prefer guest agent data over cluster/resources data for accuracy
if detailedStatus.Agent > 0 {
log.Debug().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Int("agent", detailedStatus.Agent).
Uint64("current_disk", diskUsed).
Uint64("current_maxdisk", diskTotal).
Msg("Guest agent enabled, querying filesystem info for accurate disk usage")
fsInfo, err := client.GetVMFSInfo(ctx, res.Node, res.VMID)
if err != nil {
// Log more helpful error messages based on the error type
errMsg := err.Error()
if strings.Contains(errMsg, "500") || strings.Contains(errMsg, "QEMU guest agent is not running") {
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Msg("Guest agent enabled in VM config but not running inside guest OS. Install and start qemu-guest-agent in the VM")
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Msg("To verify: ssh into VM and run 'systemctl status qemu-guest-agent' or 'ps aux | grep qemu-ga'")
} else if strings.Contains(errMsg, "timeout") {
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Msg("Guest agent timeout - agent may be installed but not responding")
} else if strings.Contains(errMsg, "403") || strings.Contains(errMsg, "401") || strings.Contains(errMsg, "authentication error") {
// Permission error - user/token lacks required permissions
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Msg("VM disk monitoring permission denied. Check permissions:")
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Msg("• Proxmox 9: Ensure token/user has VM.GuestAgent.Audit privilege (Pulse setup adds this via PulseMonitor role)")
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Msg("• Proxmox 8: Ensure token/user has VM.Monitor privilege (Pulse setup adds this via PulseMonitor role)")
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Msg("• All versions: Sys.Audit is recommended for Ceph metrics and applied when available")
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Msg("• Re-run Pulse setup script if node was added before v4.7")
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Msg("• Verify guest agent is installed and running inside the VM")
} else {
log.Debug().
Err(err).
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Msg("Failed to get filesystem info from guest agent")
}
} else if len(fsInfo) == 0 {
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Msg("Guest agent returned no filesystem info - agent may need restart or VM may have no mounted filesystems")
} else {
log.Debug().
Str("instance", instanceName).
Str("vm", res.Name).
Int("filesystems", len(fsInfo)).
Msg("Got filesystem info from guest agent")
// Aggregate disk usage from all filesystems AND preserve individual disk data
var totalBytes, usedBytes uint64
var skippedFS []string
var includedFS []string
// Log all filesystems received for debugging
log.Debug().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Int("filesystem_count", len(fsInfo)).
Msg("Processing filesystems from guest agent")
for _, fs := range fsInfo {
// Skip special filesystems and mounts
skipReasons := []string{}
reasonReadOnly := ""
shouldSkip := false
// Check filesystem type
fsTypeLower := strings.ToLower(fs.Type)
if reason, skip := readOnlyFilesystemReason(fs.Type, fs.TotalBytes, fs.UsedBytes); skip {
skipReasons = append(skipReasons, fmt.Sprintf("read-only-%s", reason))
reasonReadOnly = reason
shouldSkip = true
}
if fs.Type == "tmpfs" || fs.Type == "devtmpfs" ||
fs.Type == "cgroup" || fs.Type == "cgroup2" ||
fs.Type == "sysfs" || fs.Type == "proc" ||
fs.Type == "devpts" || fs.Type == "securityfs" ||
fs.Type == "debugfs" || fs.Type == "tracefs" ||
fs.Type == "fusectl" || fs.Type == "configfs" ||
fs.Type == "pstore" || fs.Type == "hugetlbfs" ||
fs.Type == "mqueue" || fs.Type == "bpf" ||
strings.Contains(fsTypeLower, "fuse") || // Skip FUSE mounts (often network/special)
strings.Contains(fsTypeLower, "9p") || // Skip 9p mounts (VM shared folders)
strings.Contains(fsTypeLower, "nfs") || // Skip NFS mounts
strings.Contains(fsTypeLower, "cifs") || // Skip CIFS/SMB mounts
strings.Contains(fsTypeLower, "smb") { // Skip SMB mounts
skipReasons = append(skipReasons, "special-fs-type")
shouldSkip = true
}
// Check mountpoint patterns
if strings.HasPrefix(fs.Mountpoint, "/dev") ||
strings.HasPrefix(fs.Mountpoint, "/proc") ||
strings.HasPrefix(fs.Mountpoint, "/sys") ||
strings.HasPrefix(fs.Mountpoint, "/run") ||
strings.HasPrefix(fs.Mountpoint, "/var/lib/docker") || // Skip Docker volumes
strings.HasPrefix(fs.Mountpoint, "/snap") || // Skip snap mounts
fs.Mountpoint == "/boot/efi" ||
fs.Mountpoint == "System Reserved" || // Windows System Reserved partition
strings.Contains(fs.Mountpoint, "System Reserved") { // Various Windows reserved formats
skipReasons = append(skipReasons, "special-mountpoint")
shouldSkip = true
}
if shouldSkip {
if reasonReadOnly != "" {
log.Debug().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Str("mountpoint", fs.Mountpoint).
Str("type", fs.Type).
Float64("total_gb", float64(fs.TotalBytes)/1073741824).
Float64("used_gb", float64(fs.UsedBytes)/1073741824).
Msg("Skipping read-only filesystem from disk aggregation")
}
skippedFS = append(skippedFS, fmt.Sprintf("%s(%s,%s)",
fs.Mountpoint, fs.Type, strings.Join(skipReasons, ",")))
continue
}
// Only count real filesystems with valid data
// Some filesystems report 0 bytes (like unformatted or system partitions)
if fs.TotalBytes > 0 {
totalBytes += fs.TotalBytes
usedBytes += fs.UsedBytes
includedFS = append(includedFS, fmt.Sprintf("%s(%s,%.1fGB)",
fs.Mountpoint, fs.Type, float64(fs.TotalBytes)/1073741824))
// Add to individual disks array
individualDisks = append(individualDisks, models.Disk{
Total: int64(fs.TotalBytes),
Used: int64(fs.UsedBytes),
Free: int64(fs.TotalBytes - fs.UsedBytes),
Usage: safePercentage(float64(fs.UsedBytes), float64(fs.TotalBytes)),
Mountpoint: fs.Mountpoint,
Type: fs.Type,
Device: fs.Disk,
})
log.Debug().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Str("mountpoint", fs.Mountpoint).
Str("type", fs.Type).
Uint64("total", fs.TotalBytes).
Uint64("used", fs.UsedBytes).
Float64("total_gb", float64(fs.TotalBytes)/1073741824).
Float64("used_gb", float64(fs.UsedBytes)/1073741824).
Msg("Including filesystem in disk usage calculation")
} else if fs.TotalBytes == 0 && len(fs.Mountpoint) > 0 {
skippedFS = append(skippedFS, fmt.Sprintf("%s(%s,0GB)", fs.Mountpoint, fs.Type))
log.Debug().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Str("mountpoint", fs.Mountpoint).
Str("type", fs.Type).
Msg("Skipping filesystem with zero total bytes")
}
}
if len(skippedFS) > 0 {
log.Debug().
Str("instance", instanceName).
Str("vm", res.Name).
Strs("skipped", skippedFS).
Msg("Skipped special filesystems")
}
if len(includedFS) > 0 {
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Strs("included", includedFS).
Msg("Filesystems included in disk calculation")
}
// If we got valid data from guest agent, use it
if totalBytes > 0 {
// Sanity check: if the reported disk is way larger than allocated disk,
// we might be getting host disk info somehow
allocatedDiskGB := float64(res.MaxDisk) / 1073741824
reportedDiskGB := float64(totalBytes) / 1073741824
// If reported disk is more than 2x the allocated disk, log a warning
// This could indicate we're getting host disk or network shares
if allocatedDiskGB > 0 && reportedDiskGB > allocatedDiskGB*2 {
log.Warn().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Float64("allocated_gb", allocatedDiskGB).
Float64("reported_gb", reportedDiskGB).
Float64("ratio", reportedDiskGB/allocatedDiskGB).
Strs("filesystems", includedFS).
Msg("VM reports disk usage significantly larger than allocated disk - possible issue with filesystem detection")
}
diskTotal = totalBytes
diskUsed = usedBytes
diskFree = totalBytes - usedBytes
diskUsage = safePercentage(float64(usedBytes), float64(totalBytes))
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Uint64("totalBytes", totalBytes).
Uint64("usedBytes", usedBytes).
Float64("total_gb", float64(totalBytes)/1073741824).
Float64("used_gb", float64(usedBytes)/1073741824).
Float64("allocated_gb", allocatedDiskGB).
Float64("usage", diskUsage).
Uint64("old_disk", res.Disk).
Uint64("old_maxdisk", res.MaxDisk).
Msg("Using guest agent data for accurate disk usage (replacing cluster/resources data)")
} else {
// Only special filesystems found - show allocated disk size instead
if diskTotal > 0 {
diskUsage = -1 // Show as allocated size
}
log.Info().
Str("instance", instanceName).
Str("vm", res.Name).
Int("filesystems_found", len(fsInfo)).
Msg("Guest agent provided filesystem info but no usable filesystems found (all were special mounts)")
}
}
} else {
// Agent disabled - show allocated disk size
if diskTotal > 0 {
diskUsage = -1 // Show as allocated size
}
log.Debug().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Int("agent", detailedStatus.Agent).
Msg("VM does not have guest agent enabled in config")
}
} else {
// No vmStatus available - keep cluster/resources data
log.Debug().
Str("instance", instanceName).
Str("vm", res.Name).
Int("vmid", res.VMID).
Msg("Could not get VM status, using cluster/resources disk data")
}
}
if res.Status != "running" {
memorySource = "powered-off"
memUsed = 0
}
memFree := uint64(0)
if memTotal >= memUsed {
memFree = memTotal - memUsed
}
sampleTime := time.Now()
currentMetrics := IOMetrics{
DiskRead: diskReadBytes,
DiskWrite: diskWriteBytes,
NetworkIn: networkInBytes,
NetworkOut: networkOutBytes,
Timestamp: sampleTime,
}
diskReadRate, diskWriteRate, netInRate, netOutRate := m.rateTracker.CalculateRates(guestID, currentMetrics)
memoryUsage := safePercentage(float64(memUsed), float64(memTotal))
memory := models.Memory{
Total: int64(memTotal),
Used: int64(memUsed),
Free: int64(memFree),
Usage: memoryUsage,
}
if memory.Free < 0 {
memory.Free = 0
}
if memory.Used > memory.Total {
memory.Used = memory.Total
}
if detailedStatus != nil && detailedStatus.Balloon > 0 {
memory.Balloon = int64(detailedStatus.Balloon)
}
vm := models.VM{
ID: guestID,
VMID: res.VMID,
Name: res.Name,
Node: res.Node,
Instance: instanceName,
Status: res.Status,
Type: "qemu",
CPU: safeFloat(res.CPU),
CPUs: res.MaxCPU,
Memory: memory,
Disk: models.Disk{
Total: int64(diskTotal),
Used: int64(diskUsed),
Free: int64(diskFree),
Usage: diskUsage,
},
Disks: individualDisks, // Individual filesystem data
IPAddresses: ipAddresses,
OSName: osName,
OSVersion: osVersion,
NetworkInterfaces: networkInterfaces,
NetworkIn: maxInt64(0, int64(netInRate)),
NetworkOut: maxInt64(0, int64(netOutRate)),
DiskRead: maxInt64(0, int64(diskReadRate)),
DiskWrite: maxInt64(0, int64(diskWriteRate)),
Uptime: int64(res.Uptime),
Template: res.Template == 1,
LastSeen: sampleTime,
}
// Parse tags
if res.Tags != "" {
vm.Tags = strings.Split(res.Tags, ";")
// Log if Pulse-specific tags are detected
for _, tag := range vm.Tags {
switch tag {
case "pulse-no-alerts", "pulse-monitor-only", "pulse-relaxed":
log.Info().
Str("vm", vm.Name).
Str("node", vm.Node).
Str("tag", tag).
Msg("Pulse control tag detected on VM")
}
}
}
allVMs = append(allVMs, vm)
m.recordGuestSnapshot(instanceName, vm.Type, res.Node, res.VMID, GuestMemorySnapshot{
Name: vm.Name,
Status: vm.Status,
RetrievedAt: sampleTime,
MemorySource: memorySource,
Memory: vm.Memory,
Raw: guestRaw,
})
// For non-running VMs, zero out resource usage metrics to prevent false alerts
// Proxmox may report stale or residual metrics for stopped VMs
if vm.Status != "running" {
log.Debug().
Str("vm", vm.Name).
Str("status", vm.Status).
Float64("originalCpu", vm.CPU).
Float64("originalMemUsage", vm.Memory.Usage).
Msg("Non-running VM detected - zeroing metrics")
// Zero out all usage metrics for stopped/paused/suspended VMs
vm.CPU = 0
vm.Memory.Usage = 0
vm.Disk.Usage = 0
vm.NetworkIn = 0
vm.NetworkOut = 0
vm.DiskRead = 0
vm.DiskWrite = 0
}
// Check thresholds for alerts
m.alertManager.CheckGuest(vm, instanceName)
} else if res.Type == "lxc" {
// Skip container templates (not monitored as running guests)
if res.Template == 1 {
continue
}
// Calculate I/O rates for container
currentMetrics := IOMetrics{
DiskRead: int64(res.DiskRead),
DiskWrite: int64(res.DiskWrite),
NetworkIn: int64(res.NetIn),
NetworkOut: int64(res.NetOut),
Timestamp: time.Now(),
}
diskReadRate, diskWriteRate, netInRate, netOutRate := m.rateTracker.CalculateRates(guestID, currentMetrics)
container := models.Container{
ID: guestID,
VMID: res.VMID,
Name: res.Name,
Node: res.Node,
Instance: instanceName,
Status: res.Status,
Type: "lxc",
CPU: safeFloat(res.CPU),
CPUs: int(res.MaxCPU),
Memory: models.Memory{
Total: int64(res.MaxMem),
Used: int64(res.Mem),
Free: int64(res.MaxMem - res.Mem),
Usage: safePercentage(float64(res.Mem), float64(res.MaxMem)),
},
Disk: models.Disk{
Total: int64(res.MaxDisk),
Used: int64(res.Disk),
Free: int64(res.MaxDisk - res.Disk),
Usage: safePercentage(float64(res.Disk), float64(res.MaxDisk)),
},
NetworkIn: maxInt64(0, int64(netInRate)),
NetworkOut: maxInt64(0, int64(netOutRate)),
DiskRead: maxInt64(0, int64(diskReadRate)),
DiskWrite: maxInt64(0, int64(diskWriteRate)),
Uptime: int64(res.Uptime),
Template: res.Template == 1,
LastSeen: time.Now(),
}
// Parse tags
if res.Tags != "" {
container.Tags = strings.Split(res.Tags, ";")
// Log if Pulse-specific tags are detected
for _, tag := range container.Tags {
switch tag {
case "pulse-no-alerts", "pulse-monitor-only", "pulse-relaxed":
log.Info().
Str("container", container.Name).
Str("node", container.Node).
Str("tag", tag).
Msg("Pulse control tag detected on container")
}
}
}
allContainers = append(allContainers, container)
// For non-running containers, zero out resource usage metrics to prevent false alerts
// Proxmox may report stale or residual metrics for stopped containers
if container.Status != "running" {
log.Debug().
Str("container", container.Name).
Str("status", container.Status).
Float64("originalCpu", container.CPU).
Float64("originalMemUsage", container.Memory.Usage).
Msg("Non-running container detected - zeroing metrics")
// Zero out all usage metrics for stopped/paused containers
container.CPU = 0
container.Memory.Usage = 0
container.Disk.Usage = 0
container.NetworkIn = 0
container.NetworkOut = 0
container.DiskRead = 0
container.DiskWrite = 0
}
// Check thresholds for alerts
m.alertManager.CheckGuest(container, instanceName)
}
}
// Update state
if len(allVMs) > 0 {
m.state.UpdateVMsForInstance(instanceName, allVMs)
}
if len(allContainers) > 0 {
m.state.UpdateContainersForInstance(instanceName, allContainers)
}
log.Info().
Str("instance", instanceName).
Int("vms", len(allVMs)).
Int("containers", len(allContainers)).
Msg("VMs and containers polled efficiently with cluster/resources")
return true
}
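// guestAgentDiskAggregationExample is an illustrative sketch (never called by the monitor)
// of the filesystem aggregation rule applied in pollVMsAndContainersEfficient above:
// pseudo/network filesystems and special mountpoints are excluded, and the remaining
// totals are summed before computing a usage percentage. The fixture values below are
// hypothetical.
func guestAgentDiskAggregationExample() {
type guestFS struct {
mountpoint string
fsType     string
total      uint64
used       uint64
}
sample := []guestFS{
{"/", "ext4", 64 << 30, 20 << 30},
{"/boot/efi", "vfat", 512 << 20, 6 << 20}, // skipped: special mountpoint
{"/mnt/media", "nfs4", 4 << 40, 3 << 40},  // skipped: network filesystem
}
var total, used uint64
for _, fs := range sample {
fsTypeLower := strings.ToLower(fs.fsType)
if fs.mountpoint == "/boot/efi" ||
strings.Contains(fsTypeLower, "nfs") ||
strings.Contains(fsTypeLower, "cifs") ||
fsTypeLower == "tmpfs" {
continue
}
if fs.total > 0 {
total += fs.total
used += fs.used
}
}
fmt.Printf("aggregated guest disk usage: %.1f%% of %d bytes\n",
safePercentage(float64(used), float64(total)), total)
}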
// pollBackupTasks polls backup tasks from a PVE instance
func (m *Monitor) pollBackupTasks(ctx context.Context, instanceName string, client PVEClientInterface) {
log.Debug().Str("instance", instanceName).Msg("Polling backup tasks")
tasks, err := client.GetBackupTasks(ctx)
if err != nil {
monErr := errors.WrapAPIError("get_backup_tasks", instanceName, err, 0)
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to get backup tasks")
return
}
var backupTasks []models.BackupTask
for _, task := range tasks {
// Extract the VMID from the task's ID field (for guest backup tasks this is the numeric VMID; non-numeric or empty IDs leave it at 0)
vmid := 0
if task.ID != "" {
if vmidInt, err := strconv.Atoi(task.ID); err == nil {
vmid = vmidInt
}
}
taskID := fmt.Sprintf("%s-%s", instanceName, task.UPID)
backupTask := models.BackupTask{
ID: taskID,
Node: task.Node,
Type: task.Type,
VMID: vmid,
Status: task.Status,
StartTime: time.Unix(task.StartTime, 0),
}
if task.EndTime > 0 {
backupTask.EndTime = time.Unix(task.EndTime, 0)
}
backupTasks = append(backupTasks, backupTask)
}
// Update state with new backup tasks for this instance
m.state.UpdateBackupTasksForInstance(instanceName, backupTasks)
}
// pollPBSInstance polls a single PBS instance
func (m *Monitor) pollPBSInstance(ctx context.Context, instanceName string, client *pbs.Client) {
start := time.Now()
debugEnabled := logging.IsLevelEnabled(zerolog.DebugLevel)
var pollErr error
if m.pollMetrics != nil {
m.pollMetrics.IncInFlight("pbs")
defer m.pollMetrics.DecInFlight("pbs")
defer func() {
m.pollMetrics.RecordResult(PollResult{
InstanceName: instanceName,
InstanceType: "pbs",
Success: pollErr == nil,
Error: pollErr,
StartTime: start,
EndTime: time.Now(),
})
}()
}
if m.stalenessTracker != nil {
defer func() {
if pollErr == nil {
m.stalenessTracker.UpdateSuccess(InstanceTypePBS, instanceName, nil)
} else {
m.stalenessTracker.UpdateError(InstanceTypePBS, instanceName)
}
}()
}
defer m.recordTaskResult(InstanceTypePBS, instanceName, pollErr)
// Check if context is cancelled
select {
case <-ctx.Done():
pollErr = ctx.Err()
if debugEnabled {
log.Debug().Str("instance", instanceName).Msg("Polling cancelled")
}
return
default:
}
if debugEnabled {
log.Debug().Str("instance", instanceName).Msg("Polling PBS instance")
}
// Get instance config
var instanceCfg *config.PBSInstance
for _, cfg := range m.config.PBSInstances {
if cfg.Name == instanceName {
instanceCfg = &cfg
if debugEnabled {
log.Debug().
Str("instance", instanceName).
Bool("monitorDatastores", cfg.MonitorDatastores).
Msg("Found PBS instance config")
}
break
}
}
if instanceCfg == nil {
log.Error().Str("instance", instanceName).Msg("PBS instance config not found")
return
}
// Initialize PBS instance with default values
pbsInst := models.PBSInstance{
ID: "pbs-" + instanceName,
Name: instanceName,
Host: instanceCfg.Host,
Status: "offline",
Version: "unknown",
ConnectionHealth: "unhealthy",
LastSeen: time.Now(),
}
// Try to get version first
version, versionErr := client.GetVersion(ctx)
if versionErr == nil {
pbsInst.Status = "online"
pbsInst.Version = version.Version
pbsInst.ConnectionHealth = "healthy"
m.resetAuthFailures(instanceName, "pbs")
m.state.SetConnectionHealth("pbs-"+instanceName, true)
if debugEnabled {
log.Debug().
Str("instance", instanceName).
Str("version", version.Version).
Bool("monitorDatastores", instanceCfg.MonitorDatastores).
Msg("PBS version retrieved successfully")
}
} else {
if debugEnabled {
log.Debug().Err(versionErr).Str("instance", instanceName).Msg("Failed to get PBS version, trying fallback")
}
ctx2, cancel2 := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel2()
_, datastoreErr := client.GetDatastores(ctx2)
if datastoreErr == nil {
pbsInst.Status = "online"
pbsInst.Version = "connected"
pbsInst.ConnectionHealth = "healthy"
m.resetAuthFailures(instanceName, "pbs")
m.state.SetConnectionHealth("pbs-"+instanceName, true)
log.Info().
Str("instance", instanceName).
Msg("PBS connected (version unavailable but datastores accessible)")
} else {
pbsInst.Status = "offline"
pbsInst.ConnectionHealth = "error"
monErr := errors.WrapConnectionError("get_pbs_version", instanceName, versionErr)
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to connect to PBS")
m.state.SetConnectionHealth("pbs-"+instanceName, false)
if errors.IsAuthError(versionErr) || errors.IsAuthError(datastoreErr) {
m.recordAuthFailure(instanceName, "pbs")
return
}
}
}
// Get node status (CPU, memory, etc.)
nodeStatus, err := client.GetNodeStatus(ctx)
if err != nil {
if debugEnabled {
log.Debug().Err(err).Str("instance", instanceName).Msg("Could not get PBS node status (may need Sys.Audit permission)")
}
} else if nodeStatus != nil {
pbsInst.CPU = nodeStatus.CPU
if nodeStatus.Memory.Total > 0 {
pbsInst.Memory = float64(nodeStatus.Memory.Used) / float64(nodeStatus.Memory.Total) * 100
pbsInst.MemoryUsed = nodeStatus.Memory.Used
pbsInst.MemoryTotal = nodeStatus.Memory.Total
}
pbsInst.Uptime = nodeStatus.Uptime
log.Debug().
Str("instance", instanceName).
Float64("cpu", pbsInst.CPU).
Float64("memory", pbsInst.Memory).
Int64("uptime", pbsInst.Uptime).
Msg("PBS node status retrieved")
}
// Poll datastores if enabled
if instanceCfg.MonitorDatastores {
datastores, err := client.GetDatastores(ctx)
if err != nil {
monErr := errors.WrapAPIError("get_datastores", instanceName, err, 0)
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to get datastores")
} else {
log.Info().
Str("instance", instanceName).
Int("count", len(datastores)).
Msg("Got PBS datastores")
for _, ds := range datastores {
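// PBS can report capacity under different field names depending on the API endpoint/version,
// so prefer Total/Used/Avail and fall back to the *Space variants (or used+avail) when zero.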
total := ds.Total
if total == 0 && ds.TotalSpace > 0 {
total = ds.TotalSpace
}
used := ds.Used
if used == 0 && ds.UsedSpace > 0 {
used = ds.UsedSpace
}
avail := ds.Avail
if avail == 0 && ds.AvailSpace > 0 {
avail = ds.AvailSpace
}
if total == 0 && used > 0 && avail > 0 {
total = used + avail
}
log.Debug().
Str("store", ds.Store).
Int64("total", total).
Int64("used", used).
Int64("avail", avail).
Int64("orig_total", ds.Total).
Int64("orig_total_space", ds.TotalSpace).
Msg("PBS datastore details")
modelDS := models.PBSDatastore{
Name: ds.Store,
Total: total,
Used: used,
Free: avail,
Usage: safePercentage(float64(used), float64(total)),
Status: "available",
DeduplicationFactor: ds.DeduplicationFactor,
}
namespaces, err := client.ListNamespaces(ctx, ds.Store, "", 0)
if err != nil {
log.Warn().Err(err).
Str("instance", instanceName).
Str("datastore", ds.Store).
Msg("Failed to list namespaces")
} else {
for _, ns := range namespaces {
nsPath := ns.NS
if nsPath == "" {
nsPath = ns.Path
}
if nsPath == "" {
nsPath = ns.Name
}
modelNS := models.PBSNamespace{
Path: nsPath,
Parent: ns.Parent,
Depth: strings.Count(nsPath, "/"),
}
modelDS.Namespaces = append(modelDS.Namespaces, modelNS)
}
hasRoot := false
for _, ns := range modelDS.Namespaces {
if ns.Path == "" {
hasRoot = true
break
}
}
if !hasRoot {
modelDS.Namespaces = append([]models.PBSNamespace{{Path: "", Depth: 0}}, modelDS.Namespaces...)
}
}
pbsInst.Datastores = append(pbsInst.Datastores, modelDS)
}
}
}
// Update state and run alerts
m.state.UpdatePBSInstance(pbsInst)
log.Info().
Str("instance", instanceName).
Str("id", pbsInst.ID).
Int("datastores", len(pbsInst.Datastores)).
Msg("PBS instance updated in state")
if m.alertManager != nil {
m.alertManager.CheckPBS(pbsInst)
}
// Poll backups if enabled
if instanceCfg.MonitorBackups {
if len(pbsInst.Datastores) == 0 {
log.Debug().
Str("instance", instanceName).
Msg("No PBS datastores available for backup polling")
} else if !m.config.EnableBackupPolling {
log.Debug().
Str("instance", instanceName).
Msg("Skipping PBS backup polling - globally disabled")
} else {
now := time.Now()
m.mu.RLock()
lastPoll := m.lastPBSBackupPoll[instanceName]
inProgress := m.pbsBackupPollers[instanceName]
m.mu.RUnlock()
shouldPoll, reason, newLast := m.shouldRunBackupPoll(lastPoll, now)
if !shouldPoll {
if reason != "" {
log.Debug().
Str("instance", instanceName).
Str("reason", reason).
Msg("Skipping PBS backup polling this cycle")
}
} else if inProgress {
log.Debug().
Str("instance", instanceName).
Msg("PBS backup polling already in progress")
} else {
datastoreSnapshot := make([]models.PBSDatastore, len(pbsInst.Datastores))
copy(datastoreSnapshot, pbsInst.Datastores)
m.mu.Lock()
if m.pbsBackupPollers == nil {
m.pbsBackupPollers = make(map[string]bool)
}
if m.pbsBackupPollers[instanceName] {
m.mu.Unlock()
} else {
m.pbsBackupPollers[instanceName] = true
m.lastPBSBackupPoll[instanceName] = newLast
m.mu.Unlock()
go func(ds []models.PBSDatastore, inst string, start time.Time, pbsClient *pbs.Client) {
defer func() {
m.mu.Lock()
delete(m.pbsBackupPollers, inst)
m.lastPBSBackupPoll[inst] = time.Now()
m.mu.Unlock()
}()
log.Info().
Str("instance", inst).
Int("datastores", len(ds)).
Msg("Starting background PBS backup polling")
backupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
m.pollPBSBackups(backupCtx, inst, pbsClient, ds)
log.Info().
Str("instance", inst).
Dur("duration", time.Since(start)).
Msg("Completed background PBS backup polling")
}(datastoreSnapshot, instanceName, now, client)
}
}
}
} else {
log.Debug().
Str("instance", instanceName).
Msg("PBS backup monitoring disabled")
}
}
// pollPMGInstance polls a single Proxmox Mail Gateway instance
func (m *Monitor) pollPMGInstance(ctx context.Context, instanceName string, client *pmg.Client) {
start := time.Now()
debugEnabled := logging.IsLevelEnabled(zerolog.DebugLevel)
var pollErr error
if m.pollMetrics != nil {
m.pollMetrics.IncInFlight("pmg")
defer m.pollMetrics.DecInFlight("pmg")
defer func() {
m.pollMetrics.RecordResult(PollResult{
InstanceName: instanceName,
InstanceType: "pmg",
Success: pollErr == nil,
Error: pollErr,
StartTime: start,
EndTime: time.Now(),
})
}()
}
if m.stalenessTracker != nil {
defer func() {
if pollErr == nil {
m.stalenessTracker.UpdateSuccess(InstanceTypePMG, instanceName, nil)
} else {
m.stalenessTracker.UpdateError(InstanceTypePMG, instanceName)
}
}()
}
defer m.recordTaskResult(InstanceTypePMG, instanceName, pollErr)
select {
case <-ctx.Done():
pollErr = ctx.Err()
if debugEnabled {
log.Debug().Str("instance", instanceName).Msg("PMG polling cancelled by context")
}
return
default:
}
if debugEnabled {
log.Debug().Str("instance", instanceName).Msg("Polling PMG instance")
}
var instanceCfg *config.PMGInstance
for idx := range m.config.PMGInstances {
if m.config.PMGInstances[idx].Name == instanceName {
instanceCfg = &m.config.PMGInstances[idx]
break
}
}
if instanceCfg == nil {
log.Error().Str("instance", instanceName).Msg("PMG instance config not found")
pollErr = fmt.Errorf("pmg instance config not found for %s", instanceName)
return
}
now := time.Now()
pmgInst := models.PMGInstance{
ID: "pmg-" + instanceName,
Name: instanceName,
Host: instanceCfg.Host,
Status: "offline",
ConnectionHealth: "unhealthy",
LastSeen: now,
LastUpdated: now,
}
version, err := client.GetVersion(ctx)
if err != nil {
monErr := errors.WrapConnectionError("pmg_get_version", instanceName, err)
pollErr = monErr
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to connect to PMG instance")
m.state.SetConnectionHealth("pmg-"+instanceName, false)
m.state.UpdatePMGInstance(pmgInst)
// Check PMG offline status against alert thresholds
if m.alertManager != nil {
m.alertManager.CheckPMG(pmgInst)
}
if errors.IsAuthError(err) {
m.recordAuthFailure(instanceName, "pmg")
}
return
}
pmgInst.Status = "online"
pmgInst.ConnectionHealth = "healthy"
if version != nil {
pmgInst.Version = strings.TrimSpace(version.Version)
}
m.state.SetConnectionHealth("pmg-"+instanceName, true)
m.resetAuthFailures(instanceName, "pmg")
cluster, err := client.GetClusterStatus(ctx, true)
if err != nil {
if debugEnabled {
log.Debug().Err(err).Str("instance", instanceName).Msg("Failed to retrieve PMG cluster status")
}
}
backupNodes := make(map[string]struct{})
if len(cluster) > 0 {
nodes := make([]models.PMGNodeStatus, 0, len(cluster))
for _, entry := range cluster {
status := strings.ToLower(strings.TrimSpace(entry.Type))
if status == "" {
status = "online"
}
node := models.PMGNodeStatus{
Name: entry.Name,
Status: status,
Role: entry.Type,
}
backupNodes[entry.Name] = struct{}{}
// Fetch queue status for this node
if queueData, qErr := client.GetQueueStatus(ctx, entry.Name); qErr != nil {
if debugEnabled {
log.Debug().Err(qErr).
Str("instance", instanceName).
Str("node", entry.Name).
Msg("Failed to fetch PMG queue status")
}
} else if queueData != nil {
total := queueData.Active + queueData.Deferred + queueData.Hold + queueData.Incoming
node.QueueStatus = &models.PMGQueueStatus{
Active: queueData.Active,
Deferred: queueData.Deferred,
Hold: queueData.Hold,
Incoming: queueData.Incoming,
Total: total,
OldestAge: queueData.OldestAge,
UpdatedAt: time.Now(),
}
}
nodes = append(nodes, node)
}
pmgInst.Nodes = nodes
}
if len(backupNodes) == 0 {
trimmed := strings.TrimSpace(instanceName)
if trimmed != "" {
backupNodes[trimmed] = struct{}{}
}
}
pmgBackups := make([]models.PMGBackup, 0)
seenBackupIDs := make(map[string]struct{})
for nodeName := range backupNodes {
if ctx.Err() != nil {
break
}
backups, backupErr := client.ListBackups(ctx, nodeName)
if backupErr != nil {
if debugEnabled {
log.Debug().Err(backupErr).
Str("instance", instanceName).
Str("node", nodeName).
Msg("Failed to list PMG configuration backups")
}
continue
}
for _, b := range backups {
backupTime := time.Unix(b.Timestamp, 0)
id := fmt.Sprintf("pmg-%s-%s-%d", instanceName, nodeName, b.Timestamp)
if _, exists := seenBackupIDs[id]; exists {
continue
}
seenBackupIDs[id] = struct{}{}
pmgBackups = append(pmgBackups, models.PMGBackup{
ID: id,
Instance: instanceName,
Node: nodeName,
Filename: b.Filename,
BackupTime: backupTime,
Size: b.Size,
})
}
}
if debugEnabled {
log.Debug().
Str("instance", instanceName).
Int("backupCount", len(pmgBackups)).
Msg("PMG backups polled")
}
if stats, err := client.GetMailStatistics(ctx, "day"); err != nil {
log.Warn().Err(err).Str("instance", instanceName).Msg("Failed to fetch PMG mail statistics")
} else if stats != nil {
pmgInst.MailStats = &models.PMGMailStats{
Timeframe: "day",
CountTotal: stats.Count,
CountIn: stats.CountIn,
CountOut: stats.CountOut,
SpamIn: stats.SpamIn,
SpamOut: stats.SpamOut,
VirusIn: stats.VirusIn,
VirusOut: stats.VirusOut,
BouncesIn: stats.BouncesIn,
BouncesOut: stats.BouncesOut,
BytesIn: stats.BytesIn,
BytesOut: stats.BytesOut,
GreylistCount: stats.GreylistCount,
JunkIn: stats.JunkIn,
AverageProcessTimeMs: stats.AvgProcessSec * 1000,
RBLRejects: stats.RBLRejects,
PregreetRejects: stats.Pregreet,
UpdatedAt: time.Now(),
}
}
if counts, err := client.GetMailCount(ctx, 24); err != nil {
if debugEnabled {
log.Debug().Err(err).Str("instance", instanceName).Msg("Failed to fetch PMG mail count data")
}
} else if len(counts) > 0 {
points := make([]models.PMGMailCountPoint, 0, len(counts))
for _, entry := range counts {
ts := time.Unix(entry.Time, 0)
points = append(points, models.PMGMailCountPoint{
Timestamp: ts,
Count: entry.Count,
CountIn: entry.CountIn,
CountOut: entry.CountOut,
SpamIn: entry.SpamIn,
SpamOut: entry.SpamOut,
VirusIn: entry.VirusIn,
VirusOut: entry.VirusOut,
RBLRejects: entry.RBLRejects,
Pregreet: entry.PregreetReject,
BouncesIn: entry.BouncesIn,
BouncesOut: entry.BouncesOut,
Greylist: entry.GreylistCount,
Index: entry.Index,
Timeframe: "hour",
WindowStart: ts,
})
}
pmgInst.MailCount = points
}
if scores, err := client.GetSpamScores(ctx); err != nil {
if debugEnabled {
log.Debug().Err(err).Str("instance", instanceName).Msg("Failed to fetch PMG spam score distribution")
}
} else if len(scores) > 0 {
buckets := make([]models.PMGSpamBucket, 0, len(scores))
for _, bucket := range scores {
buckets = append(buckets, models.PMGSpamBucket{
Score: bucket.Level,
Count: float64(bucket.Count),
})
}
pmgInst.SpamDistribution = buckets
}
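// Quarantine totals: spam and virus counts are fetched independently; a failed lookup
// simply leaves that counter at zero.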
quarantine := models.PMGQuarantineTotals{}
if spamStatus, err := client.GetQuarantineStatus(ctx, "spam"); err == nil && spamStatus != nil {
quarantine.Spam = spamStatus.Count
}
if virusStatus, err := client.GetQuarantineStatus(ctx, "virus"); err == nil && virusStatus != nil {
quarantine.Virus = virusStatus.Count
}
pmgInst.Quarantine = &quarantine
m.state.UpdatePMGBackups(instanceName, pmgBackups)
m.state.UpdatePMGInstance(pmgInst)
log.Info().
Str("instance", instanceName).
Str("status", pmgInst.Status).
Int("nodes", len(pmgInst.Nodes)).
Msg("PMG instance updated in state")
// Check PMG metrics against alert thresholds
if m.alertManager != nil {
m.alertManager.CheckPMG(pmgInst)
}
}
// GetState returns the current state
func (m *Monitor) GetState() models.StateSnapshot {
// Check if mock mode is enabled
if mock.IsMockEnabled() {
state := mock.GetMockState()
if state.ActiveAlerts == nil {
// Populate snapshot lazily if the cache hasn't been filled yet.
mock.UpdateAlertSnapshots(m.alertManager.GetActiveAlerts(), m.alertManager.GetRecentlyResolved())
state = mock.GetMockState()
}
return state
}
return m.state.GetSnapshot()
}
// SetMockMode switches between mock data and real infrastructure data at runtime.
func (m *Monitor) SetMockMode(enable bool) {
current := mock.IsMockEnabled()
if current == enable {
log.Info().Bool("mockMode", enable).Msg("Mock mode already in desired state")
return
}
if enable {
mock.SetEnabled(true)
m.alertManager.ClearActiveAlerts()
m.mu.Lock()
m.resetStateLocked()
m.mu.Unlock()
m.StopDiscoveryService()
log.Info().Msg("Switched monitor to mock mode")
} else {
mock.SetEnabled(false)
m.alertManager.ClearActiveAlerts()
m.mu.Lock()
m.resetStateLocked()
m.mu.Unlock()
log.Info().Msg("Switched monitor to real data mode")
}
m.mu.RLock()
ctx := m.runtimeCtx
hub := m.wsHub
m.mu.RUnlock()
if hub != nil {
hub.BroadcastState(m.GetState().ToFrontend())
}
if !enable && ctx != nil && hub != nil {
// Kick off an immediate poll to repopulate state with live data
go m.poll(ctx, hub)
if m.config.DiscoveryEnabled {
go m.StartDiscoveryService(ctx, hub, m.config.DiscoverySubnet)
}
}
}
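// resetStateLocked reinitializes the in-memory state snapshot. Callers must hold m.mu.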
func (m *Monitor) resetStateLocked() {
m.state = models.NewState()
m.state.Stats = models.Stats{
StartTime: m.startTime,
Version: "2.0.0-go",
}
}
// GetStartTime returns the monitor start time
func (m *Monitor) GetStartTime() time.Time {
return m.startTime
}
// GetDiscoveryService returns the discovery service
func (m *Monitor) GetDiscoveryService() *discovery.Service {
return m.discoveryService
}
// StartDiscoveryService starts the discovery service if not already running
func (m *Monitor) StartDiscoveryService(ctx context.Context, wsHub *websocket.Hub, subnet string) {
m.mu.Lock()
defer m.mu.Unlock()
if m.discoveryService != nil {
log.Debug().Msg("Discovery service already running")
return
}
if subnet == "" {
subnet = "auto"
}
cfgProvider := func() config.DiscoveryConfig {
m.mu.RLock()
defer m.mu.RUnlock()
if m.config == nil {
return config.DefaultDiscoveryConfig()
}
return config.CloneDiscoveryConfig(m.config.Discovery)
}
m.discoveryService = discovery.NewService(wsHub, 5*time.Minute, subnet, cfgProvider)
if m.discoveryService != nil {
m.discoveryService.Start(ctx)
log.Info().Str("subnet", subnet).Msg("Discovery service started")
} else {
log.Error().Msg("Failed to create discovery service")
}
}
// StopDiscoveryService stops the discovery service if running
func (m *Monitor) StopDiscoveryService() {
m.mu.Lock()
defer m.mu.Unlock()
if m.discoveryService != nil {
m.discoveryService.Stop()
m.discoveryService = nil
log.Info().Msg("Discovery service stopped")
}
}
// GetGuestMetrics returns historical metrics for a guest
func (m *Monitor) GetGuestMetrics(guestID string, duration time.Duration) map[string][]MetricPoint {
return m.metricsHistory.GetAllGuestMetrics(guestID, duration)
}
// GetNodeMetrics returns historical metrics for a node
func (m *Monitor) GetNodeMetrics(nodeID string, metricType string, duration time.Duration) []MetricPoint {
return m.metricsHistory.GetNodeMetrics(nodeID, metricType, duration)
}
// GetStorageMetrics returns historical metrics for storage
func (m *Monitor) GetStorageMetrics(storageID string, duration time.Duration) map[string][]MetricPoint {
return m.metricsHistory.GetAllStorageMetrics(storageID, duration)
}
// GetAlertManager returns the alert manager
func (m *Monitor) GetAlertManager() *alerts.Manager {
return m.alertManager
}
// GetNotificationManager returns the notification manager
func (m *Monitor) GetNotificationManager() *notifications.NotificationManager {
return m.notificationMgr
}
// GetConfigPersistence returns the config persistence manager
func (m *Monitor) GetConfigPersistence() *config.ConfigPersistence {
return m.configPersist
}
// pollStorageBackups polls backup files from storage
// Deprecated: This function should not be called directly as it causes duplicate GetNodes calls.
// Use pollStorageBackupsWithNodes instead.
func (m *Monitor) pollStorageBackups(ctx context.Context, instanceName string, client PVEClientInterface) {
log.Warn().Str("instance", instanceName).Msg("pollStorageBackups called directly - this causes duplicate GetNodes calls and syslog spam on non-clustered nodes")
// Get all nodes
nodes, err := client.GetNodes(ctx)
if err != nil {
monErr := errors.WrapConnectionError("get_nodes_for_backups", instanceName, err)
log.Error().Err(monErr).Str("instance", instanceName).Msg("Failed to get nodes for backup polling")
return
}
m.pollStorageBackupsWithNodes(ctx, instanceName, client, nodes)
}
// pollStorageBackupsWithNodes polls backups using a provided nodes list to avoid duplicate GetNodes calls
func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName string, client PVEClientInterface, nodes []proxmox.Node) {
var allBackups []models.StorageBackup
seenVolids := make(map[string]bool) // Track seen volume IDs to avoid duplicates
hadSuccessfulNode := false // Track if at least one node responded successfully
storagesWithBackup := 0 // Number of storages that should contain backups
contentSuccess := 0 // Number of successful storage content fetches
contentFailures := 0 // Number of failed storage content fetches
storageQueryErrors := 0 // Number of nodes where storage list could not be queried
// For each node, get storage and check content
for _, node := range nodes {
if node.Status != "online" {
continue
}
// Get storage for this node - retry once on timeout
var storages []proxmox.Storage
var err error
for attempt := 1; attempt <= 2; attempt++ {
storages, err = client.GetStorage(ctx, node.Node)
if err == nil {
break // Success
}
// Check if it's a timeout error
if strings.Contains(err.Error(), "timeout") || strings.Contains(err.Error(), "deadline exceeded") {
if attempt == 1 {
log.Warn().
Str("node", node.Node).
Str("instance", instanceName).
Msg("Storage query timed out, retrying with extended timeout...")
// Give it a bit more time on retry
time.Sleep(2 * time.Second)
continue
}
}
// Non-timeout error or second attempt failed
break
}
if err != nil {
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_storage_for_backups", instanceName, err).WithNode(node.Node)
log.Warn().Err(monErr).Str("node", node.Node).Msg("Failed to get storage for backups - skipping node")
storageQueryErrors++
continue
}
hadSuccessfulNode = true
// For each storage that can contain backup content
for _, storage := range storages {
// Check if storage supports backup content
if !strings.Contains(storage.Content, "backup") {
continue
}
storagesWithBackup++
// Get storage content
contents, err := client.GetStorageContent(ctx, node.Node, storage.Storage)
if err != nil {
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_storage_content", instanceName, err).WithNode(node.Node)
log.Debug().Err(monErr).
Str("node", node.Node).
Str("storage", storage.Storage).
Msg("Failed to get storage content")
contentFailures++
continue
}
contentSuccess++
// Convert to models
for _, content := range contents {
// Skip if we've already seen this item (shared storage duplicate)
if seenVolids[content.Volid] {
continue
}
seenVolids[content.Volid] = true
// Skip templates and ISOs - they're not backups
if content.Content == "vztmpl" || content.Content == "iso" {
continue
}
// Determine type from content type and VMID
backupType := "unknown"
if content.VMID == 0 {
backupType = "host"
} else if strings.Contains(content.Volid, "/vm/") || strings.Contains(content.Volid, "qemu") {
backupType = "qemu"
} else if strings.Contains(content.Volid, "/ct/") || strings.Contains(content.Volid, "lxc") {
backupType = "lxc"
} else if strings.Contains(content.Format, "pbs-ct") {
// PBS format check as fallback
backupType = "lxc"
} else if strings.Contains(content.Format, "pbs-vm") {
// PBS format check as fallback
backupType = "qemu"
}
// Always use the actual node name
backupNode := node.Node
isPBSStorage := strings.HasPrefix(storage.Storage, "pbs-") || storage.Type == "pbs"
// Check verification status for PBS backups
verified := false
verificationInfo := ""
if isPBSStorage {
// Check if verified flag is set
if content.Verified > 0 {
verified = true
}
// Also check verification map if available
if content.Verification != nil {
if state, ok := content.Verification["state"].(string); ok {
verified = (state == "ok")
verificationInfo = state
}
}
}
backup := models.StorageBackup{
ID: fmt.Sprintf("%s-%s", instanceName, content.Volid),
Storage: storage.Storage,
Node: backupNode,
Instance: instanceName,
Type: backupType,
VMID: content.VMID,
Time: time.Unix(content.CTime, 0),
CTime: content.CTime,
Size: int64(content.Size),
Format: content.Format,
Notes: content.Notes,
Protected: content.Protected > 0,
Volid: content.Volid,
IsPBS: isPBSStorage,
Verified: verified,
Verification: verificationInfo,
}
allBackups = append(allBackups, backup)
}
}
}
// Decide whether to keep existing backups when every query failed
if shouldPreserveBackups(len(nodes), hadSuccessfulNode, storagesWithBackup, contentSuccess) {
if len(nodes) > 0 && !hadSuccessfulNode {
log.Warn().
Str("instance", instanceName).
Int("nodes", len(nodes)).
Int("errors", storageQueryErrors).
Msg("Failed to query storage on all nodes; keeping previous backup list")
} else if storagesWithBackup > 0 && contentSuccess == 0 {
log.Warn().
Str("instance", instanceName).
Int("storages", storagesWithBackup).
Int("failures", contentFailures).
Msg("All storage content queries failed; keeping previous backup list")
}
return
}
// Update state with storage backups for this instance
m.state.UpdateStorageBackupsForInstance(instanceName, allBackups)
if m.alertManager != nil {
snapshot := m.state.GetSnapshot()
guestsByKey, guestsByVMID := buildGuestLookups(snapshot)
pveStorage := snapshot.Backups.PVE.StorageBackups
if len(pveStorage) == 0 && len(snapshot.PVEBackups.StorageBackups) > 0 {
pveStorage = snapshot.PVEBackups.StorageBackups
}
pbsBackups := snapshot.Backups.PBS
if len(pbsBackups) == 0 && len(snapshot.PBSBackups) > 0 {
pbsBackups = snapshot.PBSBackups
}
pmgBackups := snapshot.Backups.PMG
if len(pmgBackups) == 0 && len(snapshot.PMGBackups) > 0 {
pmgBackups = snapshot.PMGBackups
}
m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)
}
log.Debug().
Str("instance", instanceName).
Int("count", len(allBackups)).
Msg("Storage backups polled")
}
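// shouldPreserveBackups reports whether the previously collected backup list should be
// retained because this cycle produced no usable data: either no node answered the storage
// query, or every backup-capable storage failed its content query.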
func shouldPreserveBackups(nodeCount int, hadSuccessfulNode bool, storagesWithBackup, contentSuccess int) bool {
if nodeCount > 0 && !hadSuccessfulNode {
return true
}
if storagesWithBackup > 0 && contentSuccess == 0 {
return true
}
return false
}
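// buildGuestLookups indexes VMs and containers by their instance/node/VMID key and by bare
// VMID so backup alerts can resolve guest names. If a VMID appears on multiple guests, the
// first one seen keeps the bare-VMID entry.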
func buildGuestLookups(snapshot models.StateSnapshot) (map[string]alerts.GuestLookup, map[string]alerts.GuestLookup) {
byKey := make(map[string]alerts.GuestLookup)
byVMID := make(map[string]alerts.GuestLookup)
for _, vm := range snapshot.VMs {
info := alerts.GuestLookup{
Name: vm.Name,
Instance: vm.Instance,
Node: vm.Node,
Type: vm.Type,
VMID: vm.VMID,
}
key := alerts.BuildGuestKey(vm.Instance, vm.Node, vm.VMID)
byKey[key] = info
vmidKey := fmt.Sprintf("%d", vm.VMID)
if _, exists := byVMID[vmidKey]; !exists {
byVMID[vmidKey] = info
}
}
for _, ct := range snapshot.Containers {
info := alerts.GuestLookup{
Name: ct.Name,
Instance: ct.Instance,
Node: ct.Node,
Type: ct.Type,
VMID: int(ct.VMID),
}
key := alerts.BuildGuestKey(ct.Instance, ct.Node, int(ct.VMID))
if _, exists := byKey[key]; !exists {
byKey[key] = info
}
vmidKey := fmt.Sprintf("%d", ct.VMID)
if _, exists := byVMID[vmidKey]; !exists {
byVMID[vmidKey] = info
}
}
return byKey, byVMID
}
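// calculateBackupOperationTimeout scales the background backup/snapshot polling timeout with
// the number of non-template guests on the instance, clamped to the [2m, 5m] range.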
func (m *Monitor) calculateBackupOperationTimeout(instanceName string) time.Duration {
const (
minTimeout = 2 * time.Minute
maxTimeout = 5 * time.Minute
timeoutPerGuest = 2 * time.Second
)
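// Worked examples: 30 guests -> 60s, below the floor, so the 2-minute minimum applies;
// 90 guests -> 3m; 200 guests -> 400s, clamped to the 5-minute cap.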
timeout := minTimeout
snapshot := m.state.GetSnapshot()
guestCount := 0
for _, vm := range snapshot.VMs {
if vm.Instance == instanceName && !vm.Template {
guestCount++
}
}
for _, ct := range snapshot.Containers {
if ct.Instance == instanceName && !ct.Template {
guestCount++
}
}
if guestCount > 0 {
dynamic := time.Duration(guestCount) * timeoutPerGuest
if dynamic > timeout {
timeout = dynamic
}
}
if timeout > maxTimeout {
return maxTimeout
}
return timeout
}
// pollGuestSnapshots polls snapshots for all VMs and containers
func (m *Monitor) pollGuestSnapshots(ctx context.Context, instanceName string, client PVEClientInterface) {
log.Debug().Str("instance", instanceName).Msg("Polling guest snapshots")
// Get current VMs and containers from state for this instance
m.mu.RLock()
var vms []models.VM
for _, vm := range m.state.VMs {
if vm.Instance == instanceName {
vms = append(vms, vm)
}
}
var containers []models.Container
for _, ct := range m.state.Containers {
if ct.Instance == instanceName {
containers = append(containers, ct)
}
}
m.mu.RUnlock()
guestKey := func(instance, node string, vmid int) string {
if instance == node {
return fmt.Sprintf("%s-%d", node, vmid)
}
return fmt.Sprintf("%s-%s-%d", instance, node, vmid)
}
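// Illustrative examples (hypothetical names): a standalone host where instance == node,
// e.g. ("pve1", "pve1", 101), yields "pve1-101"; a cluster member, e.g.
// ("cluster1", "node2", 101), yields "cluster1-node2-101".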
guestNames := make(map[string]string, len(vms)+len(containers))
for _, vm := range vms {
guestNames[guestKey(instanceName, vm.Node, vm.VMID)] = vm.Name
}
for _, ct := range containers {
guestNames[guestKey(instanceName, ct.Node, ct.VMID)] = ct.Name
}
activeGuests := 0
for _, vm := range vms {
if !vm.Template {
activeGuests++
}
}
for _, ct := range containers {
if !ct.Template {
activeGuests++
}
}
const (
minSnapshotTimeout = 60 * time.Second
maxSnapshotTimeout = 4 * time.Minute
snapshotTimeoutPerGuest = 2 * time.Second
)
timeout := minSnapshotTimeout
if activeGuests > 0 {
dynamic := time.Duration(activeGuests) * snapshotTimeoutPerGuest
if dynamic > timeout {
timeout = dynamic
}
}
if timeout > maxSnapshotTimeout {
timeout = maxSnapshotTimeout
}
if deadline, ok := ctx.Deadline(); ok {
remaining := time.Until(deadline)
if remaining <= 0 {
log.Warn().
Str("instance", instanceName).
Msg("Skipping guest snapshot polling; backup context deadline exceeded")
return
}
if timeout > remaining {
timeout = remaining
}
}
snapshotCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
log.Debug().
Str("instance", instanceName).
Int("guestCount", activeGuests).
Dur("timeout", timeout).
Msg("Guest snapshot polling budget established")
var allSnapshots []models.GuestSnapshot
deadlineExceeded := false
// Poll VM snapshots
for _, vm := range vms {
// Skip templates
if vm.Template {
continue
}
snapshots, err := client.GetVMSnapshots(snapshotCtx, vm.Node, vm.VMID)
if err != nil {
if snapshotCtx.Err() != nil {
log.Warn().
Str("instance", instanceName).
Str("node", vm.Node).
Int("vmid", vm.VMID).
Err(snapshotCtx.Err()).
Msg("Aborting guest snapshot polling due to context cancellation while fetching VM snapshots")
deadlineExceeded = true
break
}
// This is common for VMs without snapshots, so use debug level
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_vm_snapshots", instanceName, err).WithNode(vm.Node)
log.Debug().
Err(monErr).
Str("node", vm.Node).
Int("vmid", vm.VMID).
Msg("Failed to get VM snapshots")
continue
}
for _, snap := range snapshots {
snapshot := models.GuestSnapshot{
ID: fmt.Sprintf("%s-%s-%d-%s", instanceName, vm.Node, vm.VMID, snap.Name),
Name: snap.Name,
Node: vm.Node,
Instance: instanceName,
Type: "qemu",
VMID: vm.VMID,
Time: time.Unix(snap.SnapTime, 0),
Description: snap.Description,
Parent: snap.Parent,
VMState: true, // VM state support enabled
}
allSnapshots = append(allSnapshots, snapshot)
}
}
if deadlineExceeded {
log.Warn().
Str("instance", instanceName).
Msg("Guest snapshot polling timed out before completing VM collection; retaining previous snapshots")
return
}
// Poll container snapshots
for _, ct := range containers {
// Skip templates
if ct.Template {
continue
}
snapshots, err := client.GetContainerSnapshots(snapshotCtx, ct.Node, ct.VMID)
if err != nil {
if snapshotCtx.Err() != nil {
log.Warn().
Str("instance", instanceName).
Str("node", ct.Node).
Int("vmid", ct.VMID).
Err(snapshotCtx.Err()).
Msg("Aborting guest snapshot polling due to context cancellation while fetching container snapshots")
deadlineExceeded = true
break
}
// API error 596 means snapshots not supported/available - this is expected for many containers
errStr := err.Error()
if strings.Contains(errStr, "596") || strings.Contains(errStr, "not available") {
// Silently skip containers without snapshot support
continue
}
// Log other errors at debug level
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_container_snapshots", instanceName, err).WithNode(ct.Node)
log.Debug().
Err(monErr).
Str("node", ct.Node).
Int("vmid", ct.VMID).
Msg("Failed to get container snapshots")
continue
}
for _, snap := range snapshots {
snapshot := models.GuestSnapshot{
ID: fmt.Sprintf("%s-%s-%d-%s", instanceName, ct.Node, ct.VMID, snap.Name),
Name: snap.Name,
Node: ct.Node,
Instance: instanceName,
Type: "lxc",
VMID: ct.VMID,
Time: time.Unix(snap.SnapTime, 0),
Description: snap.Description,
Parent: snap.Parent,
				VMState:     false, // LXC snapshots cannot include VM state
}
allSnapshots = append(allSnapshots, snapshot)
}
}
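	// On timeout, return without touching state so a partial poll does not
	// overwrite the previously collected snapshot list.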
if deadlineExceeded || snapshotCtx.Err() != nil {
log.Warn().
Str("instance", instanceName).
Msg("Guest snapshot polling timed out before completion; retaining previous snapshots")
return
}
// Update state with guest snapshots for this instance
m.state.UpdateGuestSnapshotsForInstance(instanceName, allSnapshots)
if m.alertManager != nil {
m.alertManager.CheckSnapshotsForInstance(instanceName, allSnapshots, guestNames)
}
log.Debug().
Str("instance", instanceName).
Int("count", len(allSnapshots)).
Msg("Guest snapshots polled")
}
// Stop gracefully stops the monitor
func (m *Monitor) Stop() {
log.Info().Msg("Stopping monitor")
// Stop the alert manager to save history
if m.alertManager != nil {
m.alertManager.Stop()
}
// Stop notification manager
if m.notificationMgr != nil {
m.notificationMgr.Stop()
}
log.Info().Msg("Monitor stopped")
}
// recordAuthFailure records an authentication failure for a node and removes the node's data from state once the failure threshold is reached
func (m *Monitor) recordAuthFailure(instanceName string, nodeType string) {
m.mu.Lock()
defer m.mu.Unlock()
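	// Key failures by "<type>-<instance>" so PVE, PBS, and PMG instances that
	// share a name keep separate counters.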
nodeID := instanceName
if nodeType != "" {
nodeID = nodeType + "-" + instanceName
}
// Increment failure count
m.authFailures[nodeID]++
m.lastAuthAttempt[nodeID] = time.Now()
log.Warn().
Str("node", nodeID).
Int("failures", m.authFailures[nodeID]).
Msg("Authentication failure recorded")
// If we've exceeded the threshold, remove the node
const maxAuthFailures = 5
if m.authFailures[nodeID] >= maxAuthFailures {
log.Error().
Str("node", nodeID).
Int("failures", m.authFailures[nodeID]).
Msg("Maximum authentication failures reached, removing node from state")
// Remove from state based on type
if nodeType == "pve" {
m.removeFailedPVENode(instanceName)
} else if nodeType == "pbs" {
m.removeFailedPBSNode(instanceName)
} else if nodeType == "pmg" {
m.removeFailedPMGInstance(instanceName)
}
// Reset the counter since we've removed the node
delete(m.authFailures, nodeID)
delete(m.lastAuthAttempt, nodeID)
}
}
// resetAuthFailures resets the failure count for a node after successful auth
func (m *Monitor) resetAuthFailures(instanceName string, nodeType string) {
m.mu.Lock()
defer m.mu.Unlock()
nodeID := instanceName
if nodeType != "" {
nodeID = nodeType + "-" + instanceName
}
if count, exists := m.authFailures[nodeID]; exists && count > 0 {
log.Info().
Str("node", nodeID).
Int("previousFailures", count).
Msg("Authentication succeeded, resetting failure count")
delete(m.authFailures, nodeID)
delete(m.lastAuthAttempt, nodeID)
}
}
// removeFailedPVENode marks a PVE node as offline with an authentication error and clears its other resources from state
func (m *Monitor) removeFailedPVENode(instanceName string) {
// Get instance config to get host URL
var hostURL string
for _, cfg := range m.config.PVEInstances {
if cfg.Name == instanceName {
hostURL = cfg.Host
break
}
}
// Create a failed node entry to show in UI with error status
failedNode := models.Node{
ID: instanceName + "-failed",
Name: instanceName,
DisplayName: instanceName,
Instance: instanceName,
Host: hostURL, // Include host URL even for failed nodes
Status: "offline",
Type: "node",
ConnectionHealth: "error",
LastSeen: time.Now(),
// Set other fields to zero values to indicate no data
CPU: 0,
Memory: models.Memory{},
Disk: models.Disk{},
}
// Update with just the failed node
m.state.UpdateNodesForInstance(instanceName, []models.Node{failedNode})
// Remove all other resources associated with this instance
m.state.UpdateVMsForInstance(instanceName, []models.VM{})
m.state.UpdateContainersForInstance(instanceName, []models.Container{})
m.state.UpdateStorageForInstance(instanceName, []models.Storage{})
m.state.UpdateCephClustersForInstance(instanceName, []models.CephCluster{})
m.state.UpdateBackupTasksForInstance(instanceName, []models.BackupTask{})
m.state.UpdateStorageBackupsForInstance(instanceName, []models.StorageBackup{})
m.state.UpdateGuestSnapshotsForInstance(instanceName, []models.GuestSnapshot{})
// Set connection health to false
m.state.SetConnectionHealth(instanceName, false)
}
// removeFailedPBSNode removes a PBS node and all its resources from state
func (m *Monitor) removeFailedPBSNode(instanceName string) {
	// Remove the failed PBS instance by filtering it out of the current instance list
currentInstances := m.state.PBSInstances
var updatedInstances []models.PBSInstance
for _, inst := range currentInstances {
if inst.Name != instanceName {
updatedInstances = append(updatedInstances, inst)
}
}
m.state.UpdatePBSInstances(updatedInstances)
// Remove PBS backups
m.state.UpdatePBSBackups(instanceName, []models.PBSBackup{})
// Set connection health to false
m.state.SetConnectionHealth("pbs-"+instanceName, false)
}
// removeFailedPMGInstance removes PMG data from state when authentication fails repeatedly
func (m *Monitor) removeFailedPMGInstance(instanceName string) {
currentInstances := m.state.PMGInstances
updated := make([]models.PMGInstance, 0, len(currentInstances))
for _, inst := range currentInstances {
if inst.Name != instanceName {
updated = append(updated, inst)
}
}
m.state.UpdatePMGInstances(updated)
m.state.UpdatePMGBackups(instanceName, nil)
m.state.SetConnectionHealth("pmg-"+instanceName, false)
}
// pollPBSBackups fetches all backups from PBS datastores
func (m *Monitor) pollPBSBackups(ctx context.Context, instanceName string, client *pbs.Client, datastores []models.PBSDatastore) {
log.Debug().Str("instance", instanceName).Msg("Polling PBS backups")
var allBackups []models.PBSBackup
// Process each datastore
for _, ds := range datastores {
// Get namespace paths
namespacePaths := make([]string, 0, len(ds.Namespaces))
for _, ns := range ds.Namespaces {
namespacePaths = append(namespacePaths, ns.Path)
}
log.Info().
Str("instance", instanceName).
Str("datastore", ds.Name).
Int("namespaces", len(namespacePaths)).
Strs("namespace_paths", namespacePaths).
Msg("Processing datastore namespaces")
// Fetch backups from all namespaces concurrently
backupsMap, err := client.ListAllBackups(ctx, ds.Name, namespacePaths)
if err != nil {
log.Error().Err(err).
Str("instance", instanceName).
Str("datastore", ds.Name).
Msg("Failed to fetch PBS backups")
continue
}
// Convert PBS backups to model backups
for namespace, snapshots := range backupsMap {
for _, snapshot := range snapshots {
backupTime := time.Unix(snapshot.BackupTime, 0)
				// Build a stable unique ID from instance, datastore, namespace, backup type, VMID, and backup time
id := fmt.Sprintf("pbs-%s-%s-%s-%s-%s-%d",
instanceName, ds.Name, namespace,
snapshot.BackupType, snapshot.BackupID,
snapshot.BackupTime)
				// Extract file names; PBS returns each file entry as either a plain string or an object with a filename field
var fileNames []string
for _, file := range snapshot.Files {
switch f := file.(type) {
case string:
fileNames = append(fileNames, f)
case map[string]interface{}:
if filename, ok := f["filename"].(string); ok {
fileNames = append(fileNames, filename)
}
}
}
// Extract verification status
verified := false
if snapshot.Verification != nil {
switch v := snapshot.Verification.(type) {
case string:
verified = v == "ok"
case map[string]interface{}:
if state, ok := v["state"].(string); ok {
verified = state == "ok"
}
}
// Debug log verification data
log.Debug().
Str("vmid", snapshot.BackupID).
Int64("time", snapshot.BackupTime).
Interface("verification", snapshot.Verification).
Bool("verified", verified).
Msg("PBS backup verification status")
}
backup := models.PBSBackup{
ID: id,
Instance: instanceName,
Datastore: ds.Name,
Namespace: namespace,
BackupType: snapshot.BackupType,
VMID: snapshot.BackupID,
BackupTime: backupTime,
Size: snapshot.Size,
Protected: snapshot.Protected,
Verified: verified,
Comment: snapshot.Comment,
Files: fileNames,
Owner: snapshot.Owner,
}
allBackups = append(allBackups, backup)
}
}
}
log.Info().
Str("instance", instanceName).
Int("count", len(allBackups)).
Msg("PBS backups fetched")
// Update state
m.state.UpdatePBSBackups(instanceName, allBackups)
if m.alertManager != nil {
snapshot := m.state.GetSnapshot()
guestsByKey, guestsByVMID := buildGuestLookups(snapshot)
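		// Prefer the consolidated snapshot.Backups.* fields, falling back to
		// the legacy top-level slices when the new fields are empty.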
pveStorage := snapshot.Backups.PVE.StorageBackups
if len(pveStorage) == 0 && len(snapshot.PVEBackups.StorageBackups) > 0 {
pveStorage = snapshot.PVEBackups.StorageBackups
}
pbsBackups := snapshot.Backups.PBS
if len(pbsBackups) == 0 && len(snapshot.PBSBackups) > 0 {
pbsBackups = snapshot.PBSBackups
}
pmgBackups := snapshot.Backups.PMG
if len(pmgBackups) == 0 && len(snapshot.PMGBackups) > 0 {
pmgBackups = snapshot.PMGBackups
}
m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)
}
}
// checkMockAlerts checks alerts for mock data
func (m *Monitor) checkMockAlerts() {
log.Info().Bool("mockEnabled", mock.IsMockEnabled()).Msg("checkMockAlerts called")
if !mock.IsMockEnabled() {
log.Info().Msg("Mock mode not enabled, skipping mock alert check")
return
}
// Get mock state
state := mock.GetMockState()
log.Info().
Int("vms", len(state.VMs)).
Int("containers", len(state.Containers)).
Int("nodes", len(state.Nodes)).
Msg("Checking alerts for mock data")
// Clean up alerts for nodes that no longer exist
existingNodes := make(map[string]bool)
for _, node := range state.Nodes {
existingNodes[node.Name] = true
if node.Host != "" {
existingNodes[node.Host] = true
}
}
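	// Register PBS instances under both their plain name and the "pbs-" prefixed
	// ID so alerts keyed either way survive the cleanup.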
for _, pbsInst := range state.PBSInstances {
existingNodes[pbsInst.Name] = true
existingNodes["pbs-"+pbsInst.Name] = true
if pbsInst.Host != "" {
existingNodes[pbsInst.Host] = true
}
}
log.Info().
Int("trackedNodes", len(existingNodes)).
Msg("Collecting resources for alert cleanup in mock mode")
m.alertManager.CleanupAlertsForNodes(existingNodes)
guestsByKey, guestsByVMID := buildGuestLookups(state)
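	// Same fallback as the live polling path: use the consolidated Backups.*
	// fields unless only the legacy top-level slices are populated.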
pveStorage := state.Backups.PVE.StorageBackups
if len(pveStorage) == 0 && len(state.PVEBackups.StorageBackups) > 0 {
pveStorage = state.PVEBackups.StorageBackups
}
pbsBackups := state.Backups.PBS
if len(pbsBackups) == 0 && len(state.PBSBackups) > 0 {
pbsBackups = state.PBSBackups
}
pmgBackups := state.Backups.PMG
if len(pmgBackups) == 0 && len(state.PMGBackups) > 0 {
pmgBackups = state.PMGBackups
}
m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)
// Limit how many guests we check per cycle to prevent blocking with large datasets
const maxGuestsPerCycle = 50
guestsChecked := 0
// Check alerts for VMs (up to limit)
for _, vm := range state.VMs {
if guestsChecked >= maxGuestsPerCycle {
log.Debug().
Int("checked", guestsChecked).
Int("total", len(state.VMs)+len(state.Containers)).
Msg("Reached guest check limit for this cycle")
break
}
m.alertManager.CheckGuest(vm, "mock")
guestsChecked++
}
// Check alerts for containers (if we haven't hit the limit)
for _, container := range state.Containers {
if guestsChecked >= maxGuestsPerCycle {
break
}
m.alertManager.CheckGuest(container, "mock")
guestsChecked++
}
// Check alerts for each node
for _, node := range state.Nodes {
m.alertManager.CheckNode(node)
}
// Check alerts for storage
log.Info().Int("storageCount", len(state.Storage)).Msg("Checking storage alerts")
for _, storage := range state.Storage {
log.Debug().
Str("name", storage.Name).
Float64("usage", storage.Usage).
Msg("Checking storage for alerts")
m.alertManager.CheckStorage(storage)
}
// Check alerts for PBS instances
log.Info().Int("pbsCount", len(state.PBSInstances)).Msg("Checking PBS alerts")
for _, pbsInst := range state.PBSInstances {
m.alertManager.CheckPBS(pbsInst)
}
// Check alerts for PMG instances
log.Info().Int("pmgCount", len(state.PMGInstances)).Msg("Checking PMG alerts")
for _, pmgInst := range state.PMGInstances {
m.alertManager.CheckPMG(pmgInst)
}
// Cache the latest alert snapshots directly in the mock data so the API can serve
// mock state without needing to grab the alert manager lock again.
mock.UpdateAlertSnapshots(m.alertManager.GetActiveAlerts(), m.alertManager.GetRecentlyResolved())
}