Mirror of https://github.com/rcourtman/Pulse.git (synced 2026-02-18 00:17:39 +01:00)
Router & Middleware:
- Add auth context middleware for user/token extraction
- Add tenant middleware with authorization checking
- Refactor middleware chain ordering for proper isolation
- Add router helpers for common patterns

Authentication & SSO:
- Enhance auth with tenant-aware context
- Update OIDC, SAML, and SSO handlers for multi-tenant support
- Add RBAC handler improvements
- Add security enhancements

New Test Coverage:
- API foundation tests
- Auth and authorization tests
- Router state and general tests
- SSO handler CRUD tests
- WebSocket isolation tests
- Resource handler tests
1621 lines
52 KiB
Go
package api

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"net/http"
	"os"
	"os/user"
	"runtime"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
	"github.com/rcourtman/pulse-go-rewrite/internal/config"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/internal/monitoring"
	"github.com/rcourtman/pulse-go-rewrite/internal/updates"
	"github.com/rcourtman/pulse-go-rewrite/pkg/pbs"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
	"github.com/rs/zerolog/log"
	"golang.org/x/crypto/ssh"
)

// DiagnosticsInfo contains comprehensive diagnostic information.
type DiagnosticsInfo struct {
	Version      string                  `json:"version"`
	Runtime      string                  `json:"runtime"`
	Uptime       float64                 `json:"uptime"`
	Nodes        []NodeDiagnostic        `json:"nodes"`
	PBS          []PBSDiagnostic         `json:"pbs"`
	System       SystemDiagnostic        `json:"system"`
	MetricsStore *MetricsStoreDiagnostic `json:"metricsStore,omitempty"`
	Discovery    *DiscoveryDiagnostic    `json:"discovery,omitempty"`
	APITokens    *APITokenDiagnostic     `json:"apiTokens,omitempty"`
	DockerAgents *DockerAgentDiagnostic  `json:"dockerAgents,omitempty"`
	Alerts       *AlertsDiagnostic       `json:"alerts,omitempty"`
	AIChat       *AIChatDiagnostic       `json:"aiChat,omitempty"`
	Errors       []string                `json:"errors"`
	// NodeSnapshots captures the raw memory payload and derived usage Pulse last observed per node.
	NodeSnapshots []monitoring.NodeMemorySnapshot `json:"nodeSnapshots,omitempty"`
	// GuestSnapshots captures recent per-guest memory breakdowns (VM/LXC) with the raw Proxmox fields.
	GuestSnapshots []monitoring.GuestMemorySnapshot `json:"guestSnapshots,omitempty"`
	// MemorySources summarizes how many nodes currently rely on each memory source per instance.
	MemorySources []MemorySourceStat `json:"memorySources,omitempty"`
}
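
// For orientation, an abbreviated, illustrative sketch of the JSON this
// struct serializes to (all values below are hypothetical, and omitempty
// sections are elided):
//
//	{
//	  "version": "4.24.0",
//	  "runtime": "go",
//	  "uptime": 86400.5,
//	  "nodes": [{"id": "pve1", "name": "pve1", "host": "https://pve1:8006",
//	             "type": "pve", "authMethod": "api_token", "connected": true}],
//	  "pbs": [],
//	  "system": {"os": "linux", "arch": "amd64", "goVersion": "go1.22.0",
//	             "numCPU": 8, "numGoroutine": 42, "memoryMB": 64},
//	  "errors": []
//	}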

// DiscoveryDiagnostic summarizes discovery configuration and recent activity.
type DiscoveryDiagnostic struct {
	Enabled             bool                   `json:"enabled"`
	ConfiguredSubnet    string                 `json:"configuredSubnet,omitempty"`
	ActiveSubnet        string                 `json:"activeSubnet,omitempty"`
	EnvironmentOverride string                 `json:"environmentOverride,omitempty"`
	SubnetAllowlist     []string               `json:"subnetAllowlist"`
	SubnetBlocklist     []string               `json:"subnetBlocklist"`
	Scanning            bool                   `json:"scanning"`
	ScanInterval        string                 `json:"scanInterval,omitempty"`
	LastScanStartedAt   string                 `json:"lastScanStartedAt,omitempty"`
	LastResultTimestamp string                 `json:"lastResultTimestamp,omitempty"`
	LastResultServers   int                    `json:"lastResultServers,omitempty"`
	LastResultErrors    int                    `json:"lastResultErrors,omitempty"`
	History             []DiscoveryHistoryItem `json:"history,omitempty"`
}

// DiscoveryHistoryItem summarizes the outcome of a recent discovery scan.
type DiscoveryHistoryItem struct {
	StartedAt       string `json:"startedAt"`
	CompletedAt     string `json:"completedAt"`
	Duration        string `json:"duration"`
	DurationMs      int64  `json:"durationMs"`
	Subnet          string `json:"subnet"`
	ServerCount     int    `json:"serverCount"`
	ErrorCount      int    `json:"errorCount"`
	BlocklistLength int    `json:"blocklistLength"`
	Status          string `json:"status"`
}

// MemorySourceStat aggregates memory-source usage per instance.
type MemorySourceStat struct {
	Instance    string `json:"instance"`
	Source      string `json:"source"`
	NodeCount   int    `json:"nodeCount"`
	LastUpdated string `json:"lastUpdated"`
	Fallback    bool   `json:"fallback"`
}

// MetricsStoreDiagnostic summarizes metrics store health and data availability.
type MetricsStoreDiagnostic struct {
	Enabled     bool     `json:"enabled"`
	Status      string   `json:"status"`
	DBSize      int64    `json:"dbSize,omitempty"`
	RawCount    int64    `json:"rawCount,omitempty"`
	MinuteCount int64    `json:"minuteCount,omitempty"`
	HourlyCount int64    `json:"hourlyCount,omitempty"`
	DailyCount  int64    `json:"dailyCount,omitempty"`
	TotalPoints int64    `json:"totalPoints,omitempty"`
	BufferSize  int      `json:"bufferSize,omitempty"`
	Notes       []string `json:"notes,omitempty"`
	Error       string   `json:"error,omitempty"`
}

// isFallbackMemorySource reports whether the given memory source is a
// fallback (less accurate) source rather than a preferred one.
func isFallbackMemorySource(source string) bool {
	switch strings.ToLower(source) {
	case "", "unknown", "nodes-endpoint", "node-status-used", "previous-snapshot":
		return true
	default:
		return false
	}
}

// buildMetricsStoreDiagnostic reports on the health of the historical
// metrics store, or explains why it is unavailable.
func buildMetricsStoreDiagnostic(monitor *monitoring.Monitor) *MetricsStoreDiagnostic {
	if monitor == nil {
		return &MetricsStoreDiagnostic{
			Enabled: false,
			Status:  "unavailable",
			Error:   "monitor not initialized",
		}
	}

	store := monitor.GetMetricsStore()
	if store == nil {
		return &MetricsStoreDiagnostic{
			Enabled: false,
			Status:  "unavailable",
			Error:   "metrics store not initialized",
		}
	}

	stats := store.GetStats()
	total := stats.RawCount + stats.MinuteCount + stats.HourlyCount + stats.DailyCount
	status := "healthy"
	notes := []string{}

	switch {
	case total == 0 && stats.BufferSize > 0:
		status = "buffering"
		notes = append(notes, "Metrics are buffered but not yet flushed")
	case total == 0:
		status = "empty"
		notes = append(notes, "No historical metrics written yet")
	}

	return &MetricsStoreDiagnostic{
		Enabled:     true,
		Status:      status,
		DBSize:      stats.DBSize,
		RawCount:    stats.RawCount,
		MinuteCount: stats.MinuteCount,
		HourlyCount: stats.HourlyCount,
		DailyCount:  stats.DailyCount,
		TotalPoints: total,
		BufferSize:  stats.BufferSize,
		Notes:       notes,
	}
}
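
// For reference, the Status values buildMetricsStoreDiagnostic can produce:
//
//	"unavailable" – monitor or metrics store not initialized
//	"buffering"   – points are buffered in memory but not yet flushed
//	"empty"       – no historical metrics written yet
//	"healthy"     – at least one aggregation tier has data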

const diagnosticsCacheTTL = 45 * time.Second

var (
	diagnosticsMetricsOnce sync.Once

	diagnosticsCacheMu        sync.RWMutex
	diagnosticsCache          DiagnosticsInfo
	diagnosticsCacheTimestamp time.Time

	diagnosticsCacheHits = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "pulse",
		Subsystem: "diagnostics",
		Name:      "cache_hits_total",
		Help:      "Total number of diagnostics cache hits.",
	})

	diagnosticsCacheMisses = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "pulse",
		Subsystem: "diagnostics",
		Name:      "cache_misses_total",
		Help:      "Total number of diagnostics cache misses.",
	})

	diagnosticsRefreshDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: "pulse",
		Subsystem: "diagnostics",
		Name:      "refresh_duration_seconds",
		Help:      "Duration of diagnostics refresh operations in seconds.",
		Buckets:   []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30},
	})
)
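
// With the Namespace/Subsystem values above, the Prometheus client composes
// fully qualified metric names, so the /metrics exposition looks roughly
// like this (sample values hypothetical):
//
//	pulse_diagnostics_cache_hits_total 128
//	pulse_diagnostics_cache_misses_total 7
//	pulse_diagnostics_refresh_duration_seconds_bucket{le="0.5"} 5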

// NodeDiagnostic contains diagnostic info for a Proxmox node.
type NodeDiagnostic struct {
	ID            string             `json:"id"`
	Name          string             `json:"name"`
	Host          string             `json:"host"`
	Type          string             `json:"type"`
	AuthMethod    string             `json:"authMethod"`
	Connected     bool               `json:"connected"`
	Error         string             `json:"error,omitempty"`
	Details       *NodeDetails       `json:"details,omitempty"`
	LastPoll      string             `json:"lastPoll,omitempty"`
	ClusterInfo   *ClusterInfo       `json:"clusterInfo,omitempty"`
	VMDiskCheck   *VMDiskCheckResult `json:"vmDiskCheck,omitempty"`
	PhysicalDisks *PhysicalDiskCheck `json:"physicalDisks,omitempty"`
}

// NodeDetails contains node-specific details.
type NodeDetails struct {
	NodeCount int    `json:"node_count,omitempty"`
	Version   string `json:"version,omitempty"`
}

// VMDiskCheckResult contains VM disk monitoring diagnostic results.
type VMDiskCheckResult struct {
	VMsFound         int                `json:"vmsFound"`
	VMsWithAgent     int                `json:"vmsWithAgent"`
	VMsWithDiskData  int                `json:"vmsWithDiskData"`
	TestVMID         int                `json:"testVMID,omitempty"`
	TestVMName       string             `json:"testVMName,omitempty"`
	TestResult       string             `json:"testResult,omitempty"`
	Permissions      []string           `json:"permissions,omitempty"`
	Recommendations  []string           `json:"recommendations,omitempty"`
	ProblematicVMs   []VMDiskIssue      `json:"problematicVMs,omitempty"`
	FilesystemsFound []FilesystemDetail `json:"filesystemsFound,omitempty"`
}

// VMDiskIssue describes a single VM whose disk metrics could not be collected.
type VMDiskIssue struct {
	VMID   int    `json:"vmid"`
	Name   string `json:"name"`
	Status string `json:"status"`
	Issue  string `json:"issue"`
}

// FilesystemDetail records one filesystem reported by the guest agent and
// whether it was filtered out of the disk-usage calculation.
type FilesystemDetail struct {
	Mountpoint string `json:"mountpoint"`
	Type       string `json:"type"`
	Total      uint64 `json:"total"`
	Used       uint64 `json:"used"`
	Filtered   bool   `json:"filtered"`
	Reason     string `json:"reason,omitempty"`
}

// PhysicalDiskCheck contains diagnostic results for physical disk detection.
type PhysicalDiskCheck struct {
	NodesChecked    int              `json:"nodesChecked"`
	NodesWithDisks  int              `json:"nodesWithDisks"`
	TotalDisks      int              `json:"totalDisks"`
	NodeResults     []NodeDiskResult `json:"nodeResults"`
	TestResult      string           `json:"testResult,omitempty"`
	Recommendations []string         `json:"recommendations,omitempty"`
}

// NodeDiskResult captures the per-node outcome of the physical disk check.
type NodeDiskResult struct {
	NodeName    string   `json:"nodeName"`
	DiskCount   int      `json:"diskCount"`
	Error       string   `json:"error,omitempty"`
	DiskDevices []string `json:"diskDevices,omitempty"`
	APIResponse string   `json:"apiResponse,omitempty"`
}

// ClusterInfo contains cluster information.
type ClusterInfo struct {
	Nodes int `json:"nodes"`
}

// PBSDiagnostic contains diagnostic info for a PBS instance.
type PBSDiagnostic struct {
	ID        string      `json:"id"`
	Name      string      `json:"name"`
	Host      string      `json:"host"`
	Connected bool        `json:"connected"`
	Error     string      `json:"error,omitempty"`
	Details   *PBSDetails `json:"details,omitempty"`
}

// PBSDetails contains PBS-specific details.
type PBSDetails struct {
	Version string `json:"version,omitempty"`
}

// SystemDiagnostic contains system-level diagnostic info.
type SystemDiagnostic struct {
	OS           string `json:"os"`
	Arch         string `json:"arch"`
	GoVersion    string `json:"goVersion"`
	NumCPU       int    `json:"numCPU"`
	NumGoroutine int    `json:"numGoroutine"`
	MemoryMB     uint64 `json:"memoryMB"`
}

// APITokenDiagnostic reports on the state of the multi-token authentication system.
type APITokenDiagnostic struct {
	Enabled                bool              `json:"enabled"`
	TokenCount             int               `json:"tokenCount"`
	HasEnvTokens           bool              `json:"hasEnvTokens"`
	HasLegacyToken         bool              `json:"hasLegacyToken"`
	RecommendTokenSetup    bool              `json:"recommendTokenSetup"`
	RecommendTokenRotation bool              `json:"recommendTokenRotation"`
	LegacyDockerHostCount  int               `json:"legacyDockerHostCount,omitempty"`
	UnusedTokenCount       int               `json:"unusedTokenCount,omitempty"`
	Notes                  []string          `json:"notes,omitempty"`
	Tokens                 []APITokenSummary `json:"tokens,omitempty"`
	Usage                  []APITokenUsage   `json:"usage,omitempty"`
}

// APITokenSummary provides sanitized token metadata for diagnostics display.
type APITokenSummary struct {
	ID         string `json:"id"`
	Name       string `json:"name"`
	Hint       string `json:"hint,omitempty"`
	CreatedAt  string `json:"createdAt,omitempty"`
	LastUsedAt string `json:"lastUsedAt,omitempty"`
	Source     string `json:"source,omitempty"`
}

// APITokenUsage summarizes how tokens are consumed by connected agents.
type APITokenUsage struct {
	TokenID   string   `json:"tokenId"`
	HostCount int      `json:"hostCount"`
	Hosts     []string `json:"hosts,omitempty"`
}

// DockerAgentDiagnostic summarizes adoption of the Docker agent command system.
type DockerAgentDiagnostic struct {
	HostsTotal               int                    `json:"hostsTotal"`
	HostsOnline              int                    `json:"hostsOnline"`
	HostsReportingVersion    int                    `json:"hostsReportingVersion"`
	HostsWithTokenBinding    int                    `json:"hostsWithTokenBinding"`
	HostsWithoutTokenBinding int                    `json:"hostsWithoutTokenBinding"`
	HostsWithoutVersion      int                    `json:"hostsWithoutVersion,omitempty"`
	HostsOutdatedVersion     int                    `json:"hostsOutdatedVersion,omitempty"`
	HostsWithStaleCommand    int                    `json:"hostsWithStaleCommand,omitempty"`
	HostsPendingUninstall    int                    `json:"hostsPendingUninstall,omitempty"`
	HostsNeedingAttention    int                    `json:"hostsNeedingAttention"`
	RecommendedAgentVersion  string                 `json:"recommendedAgentVersion,omitempty"`
	Attention                []DockerAgentAttention `json:"attention,omitempty"`
	Notes                    []string               `json:"notes,omitempty"`
}

// DockerAgentAttention captures an individual agent that requires user action.
type DockerAgentAttention struct {
	HostID       string   `json:"hostId"`
	Name         string   `json:"name"`
	Status       string   `json:"status"`
	AgentVersion string   `json:"agentVersion,omitempty"`
	TokenHint    string   `json:"tokenHint,omitempty"`
	LastSeen     string   `json:"lastSeen,omitempty"`
	Issues       []string `json:"issues"`
}

// AlertsDiagnostic summarizes alert configuration migration state.
type AlertsDiagnostic struct {
	LegacyThresholdsDetected bool     `json:"legacyThresholdsDetected"`
	LegacyThresholdSources   []string `json:"legacyThresholdSources,omitempty"`
	LegacyScheduleSettings   []string `json:"legacyScheduleSettings,omitempty"`
	MissingCooldown          bool     `json:"missingCooldown"`
	MissingGroupingWindow    bool     `json:"missingGroupingWindow"`
	Notes                    []string `json:"notes,omitempty"`
}

// AIChatDiagnostic reports on the AI chat service status.
type AIChatDiagnostic struct {
	Enabled      bool     `json:"enabled"`
	Running      bool     `json:"running"`
	Healthy      bool     `json:"healthy"`
	Port         int      `json:"port,omitempty"`
	URL          string   `json:"url,omitempty"`
	Model        string   `json:"model,omitempty"`
	MCPConnected bool     `json:"mcpConnected"`
	MCPToolCount int      `json:"mcpToolCount,omitempty"`
	Notes        []string `json:"notes,omitempty"`
}

// handleDiagnostics returns comprehensive diagnostic information, serving a
// cached copy when it is fresher than diagnosticsCacheTTL.
func (r *Router) handleDiagnostics(w http.ResponseWriter, req *http.Request) {
	diagnosticsMetricsOnce.Do(func() {
		prometheus.MustRegister(diagnosticsCacheHits, diagnosticsCacheMisses, diagnosticsRefreshDuration)
	})

	now := time.Now()

	diagnosticsCacheMu.RLock()
	cachedDiag := diagnosticsCache
	cachedAt := diagnosticsCacheTimestamp
	diagnosticsCacheMu.RUnlock()

	if !cachedAt.IsZero() && now.Sub(cachedAt) <= diagnosticsCacheTTL {
		diagnosticsCacheHits.Inc()
		writeDiagnosticsResponse(w, cachedDiag, cachedAt)
		return
	}

	diagnosticsCacheMisses.Inc()

	ctx, cancel := context.WithTimeout(req.Context(), 30*time.Second)
	defer cancel()

	start := time.Now()
	fresh := r.computeDiagnostics(ctx)
	diagnosticsRefreshDuration.Observe(time.Since(start).Seconds())

	diagnosticsCacheMu.Lock()
	diagnosticsCache = fresh
	diagnosticsCacheTimestamp = time.Now()
	cachedAt = diagnosticsCacheTimestamp
	diagnosticsCacheMu.Unlock()

	writeDiagnosticsResponse(w, fresh, cachedAt)
}
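
// A quick way to exercise the handler and observe the cache behavior (port
// and route path assumed for illustration; adjust to wherever the router
// mounts this handler in your deployment):
//
//	curl -s -D - http://localhost:7655/api/diagnostics -o /dev/null | grep X-Diagnostics-Cached-At
//
// Two requests within the 45-second TTL should report the same
// X-Diagnostics-Cached-At timestamp, with the second one counted as a cache hit.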

func writeDiagnosticsResponse(w http.ResponseWriter, diag DiagnosticsInfo, cachedAt time.Time) {
	w.Header().Set("Content-Type", "application/json")
	if !cachedAt.IsZero() {
		w.Header().Set("X-Diagnostics-Cached-At", cachedAt.UTC().Format(time.RFC3339))
	}
	// Marshal to a buffer first so an encoding failure can still produce a
	// clean 500 instead of a truncated 200 response.
	payload, err := json.Marshal(diag)
	if err != nil {
		log.Error().Err(err).Msg("Failed to encode diagnostics")
		http.Error(w, "Failed to generate diagnostics", http.StatusInternalServerError)
		return
	}
	if _, err := w.Write(payload); err != nil {
		log.Error().Err(err).Msg("Failed to write diagnostics response")
	}
}

func (r *Router) computeDiagnostics(ctx context.Context) DiagnosticsInfo {
	diag := DiagnosticsInfo{
		Errors: []string{},
	}

	// Version info
	if versionInfo, err := updates.GetCurrentVersion(); err == nil {
		diag.Version = versionInfo.Version
		diag.Runtime = versionInfo.Runtime
	} else {
		diag.Version = "unknown"
		diag.Runtime = "go"
	}

	// Uptime
	diag.Uptime = time.Since(r.monitor.GetStartTime()).Seconds()

	// System info
	var memStats runtime.MemStats
	runtime.ReadMemStats(&memStats)
	diag.System = SystemDiagnostic{
		OS:           runtime.GOOS,
		Arch:         runtime.GOARCH,
		GoVersion:    runtime.Version(),
		NumCPU:       runtime.NumCPU(),
		NumGoroutine: runtime.NumGoroutine(),
		MemoryMB:     memStats.Alloc / 1024 / 1024,
	}

	diag.APITokens = buildAPITokenDiagnostic(r.config, r.monitor)
	diag.MetricsStore = buildMetricsStoreDiagnostic(r.monitor)

	// Test each configured node
	for _, node := range r.config.PVEInstances {
		nodeDiag := NodeDiagnostic{
			ID:   node.Name,
			Name: node.Name,
			Host: node.Host,
			Type: "pve",
		}

		// Determine auth method (sanitized - don't expose actual values)
		if node.TokenName != "" && node.TokenValue != "" {
			nodeDiag.AuthMethod = "api_token"
		} else if node.User != "" && node.Password != "" {
			nodeDiag.AuthMethod = "username_password"
		} else {
			nodeDiag.AuthMethod = "none"
			nodeDiag.Error = "No authentication configured"
		}

		// Test connection
		testCfg := proxmox.ClientConfig{
			Host:       node.Host,
			User:       node.User,
			Password:   node.Password,
			TokenName:  node.TokenName,
			TokenValue: node.TokenValue,
			VerifySSL:  node.VerifySSL,
		}

		client, err := proxmox.NewClient(testCfg)
		if err != nil {
			nodeDiag.Connected = false
			nodeDiag.Error = err.Error()
		} else {
			nodes, err := client.GetNodes(ctx)
			if err != nil {
				nodeDiag.Connected = false
				nodeDiag.Error = "Failed to connect to Proxmox API: " + err.Error()
			} else {
				nodeDiag.Connected = true

				if len(nodes) > 0 {
					nodeDiag.Details = &NodeDetails{
						NodeCount: len(nodes),
					}

					if status, err := client.GetNodeStatus(ctx, nodes[0].Node); err == nil && status != nil {
						if status.PVEVersion != "" {
							nodeDiag.Details.Version = status.PVEVersion
						}
					}
				}

				if clusterStatus, err := client.GetClusterStatus(ctx); err == nil {
					nodeDiag.ClusterInfo = &ClusterInfo{Nodes: len(clusterStatus)}
				} else {
					log.Debug().Str("node", node.Name).Msg("Cluster status not available (likely standalone node)")
					nodeDiag.ClusterInfo = &ClusterInfo{Nodes: 1}
				}

				nodeDiag.VMDiskCheck = r.checkVMDiskMonitoring(ctx, client, node.Name)
				nodeDiag.PhysicalDisks = r.checkPhysicalDisks(ctx, client, node.Name)
			}
		}

		diag.Nodes = append(diag.Nodes, nodeDiag)
	}

	// Test PBS instances
	for _, pbsNode := range r.config.PBSInstances {
		pbsDiag := PBSDiagnostic{
			ID:   pbsNode.Name,
			Name: pbsNode.Name,
			Host: pbsNode.Host,
		}

		testCfg := pbs.ClientConfig{
			Host:        pbsNode.Host,
			User:        pbsNode.User,
			Password:    pbsNode.Password,
			TokenName:   pbsNode.TokenName,
			TokenValue:  pbsNode.TokenValue,
			Fingerprint: pbsNode.Fingerprint,
			VerifySSL:   pbsNode.VerifySSL,
		}

		client, err := pbs.NewClient(testCfg)
		if err != nil {
			pbsDiag.Connected = false
			pbsDiag.Error = err.Error()
		} else {
			if version, err := client.GetVersion(ctx); err != nil {
				pbsDiag.Connected = false
				pbsDiag.Error = "Connection established but version check failed: " + err.Error()
			} else {
				pbsDiag.Connected = true
				pbsDiag.Details = &PBSDetails{Version: version.Version}
			}
		}

		diag.PBS = append(diag.PBS, pbsDiag)
	}

	diag.DockerAgents = buildDockerAgentDiagnostic(r.monitor, diag.Version)
	diag.Alerts = buildAlertsDiagnostic(r.monitor)
	diag.AIChat = buildAIChatDiagnostic(r.config, r.aiHandler)

	diag.Discovery = buildDiscoveryDiagnostic(r.config, r.monitor)

	if r.monitor != nil {
		snapshots := r.monitor.GetDiagnosticSnapshots()
		if len(snapshots.Nodes) > 0 {
			diag.NodeSnapshots = snapshots.Nodes

			type memorySourceAgg struct {
				stat   MemorySourceStat
				latest time.Time
			}

			// Aggregate node snapshots into per-instance, per-source counts,
			// tracking the most recent retrieval time for each pair.
			sourceAggregates := make(map[string]*memorySourceAgg)
			for _, snap := range snapshots.Nodes {
				source := snap.MemorySource
				if source == "" {
					source = "unknown"
				}

				key := fmt.Sprintf("%s|%s", snap.Instance, source)
				entry, ok := sourceAggregates[key]
				if !ok {
					entry = &memorySourceAgg{
						stat: MemorySourceStat{
							Instance: snap.Instance,
							Source:   source,
							Fallback: isFallbackMemorySource(source),
						},
					}
					sourceAggregates[key] = entry
				}

				entry.stat.NodeCount++
				if snap.RetrievedAt.After(entry.latest) {
					entry.latest = snap.RetrievedAt
				}
			}

			if len(sourceAggregates) > 0 {
				diag.MemorySources = make([]MemorySourceStat, 0, len(sourceAggregates))
				for _, entry := range sourceAggregates {
					if !entry.latest.IsZero() {
						entry.stat.LastUpdated = entry.latest.UTC().Format(time.RFC3339)
					}
					diag.MemorySources = append(diag.MemorySources, entry.stat)
				}

				sort.Slice(diag.MemorySources, func(i, j int) bool {
					if diag.MemorySources[i].Instance == diag.MemorySources[j].Instance {
						return diag.MemorySources[i].Source < diag.MemorySources[j].Source
					}
					return diag.MemorySources[i].Instance < diag.MemorySources[j].Instance
				})
			}
		}
		if len(snapshots.Guests) > 0 {
			diag.GuestSnapshots = snapshots.Guests
		}
	}

	return diag
}

// copyStringSlice returns a defensive copy of values, never nil.
func copyStringSlice(values []string) []string {
	if len(values) == 0 {
		return []string{}
	}
	return append([]string(nil), values...)
}

// buildDiscoveryDiagnostic assembles discovery configuration plus the most
// recent scan results and history from the discovery service, if available.
func buildDiscoveryDiagnostic(cfg *config.Config, monitor *monitoring.Monitor) *DiscoveryDiagnostic {
	if cfg == nil {
		return nil
	}

	discovery := &DiscoveryDiagnostic{
		Enabled:             cfg.DiscoveryEnabled,
		ConfiguredSubnet:    strings.TrimSpace(cfg.DiscoverySubnet),
		EnvironmentOverride: strings.TrimSpace(cfg.Discovery.EnvironmentOverride),
		SubnetAllowlist:     copyStringSlice(cfg.Discovery.SubnetAllowlist),
		SubnetBlocklist:     copyStringSlice(cfg.Discovery.SubnetBlocklist),
	}

	if discovery.ConfiguredSubnet == "" {
		discovery.ConfiguredSubnet = "auto"
	}
	if discovery.SubnetAllowlist == nil {
		discovery.SubnetAllowlist = []string{}
	}
	if discovery.SubnetBlocklist == nil {
		discovery.SubnetBlocklist = []string{}
	}

	if monitor != nil {
		if svc := monitor.GetDiscoveryService(); svc != nil {
			status := svc.GetStatus()

			if val, ok := status["subnet"].(string); ok {
				discovery.ActiveSubnet = val
			}
			if val, ok := status["is_scanning"].(bool); ok {
				discovery.Scanning = val
			}
			if val, ok := status["interval"].(string); ok {
				discovery.ScanInterval = val
			}
			if val, ok := status["last_scan"].(time.Time); ok && !val.IsZero() {
				discovery.LastScanStartedAt = val.UTC().Format(time.RFC3339)
			}

			if result, updated := svc.GetCachedResult(); result != nil {
				discovery.LastResultServers = len(result.Servers)
				if len(result.StructuredErrors) > 0 {
					discovery.LastResultErrors = len(result.StructuredErrors)
				} else if len(result.Errors) > 0 {
					discovery.LastResultErrors = len(result.Errors)
				}
				if !updated.IsZero() {
					discovery.LastResultTimestamp = updated.UTC().Format(time.RFC3339)
				}
			}

			history := svc.GetHistory(10)
			if len(history) > 0 {
				items := make([]DiscoveryHistoryItem, 0, len(history))
				for _, entry := range history {
					items = append(items, DiscoveryHistoryItem{
						StartedAt:       entry.StartedAt().UTC().Format(time.RFC3339),
						CompletedAt:     entry.CompletedAt().UTC().Format(time.RFC3339),
						Duration:        entry.Duration().Truncate(time.Millisecond).String(),
						DurationMs:      entry.Duration().Milliseconds(),
						Subnet:          entry.Subnet(),
						ServerCount:     entry.ServerCount(),
						ErrorCount:      entry.ErrorCount(),
						BlocklistLength: entry.BlocklistLength(),
						Status:          entry.Status(),
					})
				}
				discovery.History = items
			}
		}
	}

	return discovery
}

// buildAPITokenDiagnostic reports on API token configuration, per-token
// metadata, and which Docker hosts consume each token.
func buildAPITokenDiagnostic(cfg *config.Config, monitor *monitoring.Monitor) *APITokenDiagnostic {
	if cfg == nil {
		return nil
	}

	diag := &APITokenDiagnostic{
		Enabled:    cfg.HasAPITokens(),
		TokenCount: len(cfg.APITokens),
	}

	appendNote := func(note string) {
		if note == "" || contains(diag.Notes, note) {
			return
		}
		diag.Notes = append(diag.Notes, note)
	}

	envTokens := false
	if cfg.EnvOverrides != nil && (cfg.EnvOverrides["API_TOKEN"] || cfg.EnvOverrides["API_TOKENS"]) {
		envTokens = true
	}

	legacyToken := false
	for _, record := range cfg.APITokens {
		if strings.EqualFold(record.Name, "Environment token") {
			envTokens = true
		}
		if strings.EqualFold(record.Name, "Legacy token") {
			legacyToken = true
		}
	}

	diag.HasEnvTokens = envTokens
	diag.HasLegacyToken = legacyToken
	diag.RecommendTokenSetup = len(cfg.APITokens) == 0
	diag.RecommendTokenRotation = envTokens || legacyToken

	if diag.RecommendTokenSetup {
		appendNote("No API tokens are configured. Open Settings → Security to generate dedicated tokens for each automation or agent.")
	}

	tokens := make([]APITokenSummary, 0, len(cfg.APITokens))
	unusedCount := 0
	for _, record := range cfg.APITokens {
		summary := APITokenSummary{
			ID:   record.ID,
			Name: record.Name,
		}

		if !record.CreatedAt.IsZero() {
			summary.CreatedAt = record.CreatedAt.UTC().Format(time.RFC3339)
		}

		if record.LastUsedAt != nil && !record.LastUsedAt.IsZero() {
			summary.LastUsedAt = record.LastUsedAt.UTC().Format(time.RFC3339)
		} else {
			unusedCount++
		}

		switch {
		case record.Prefix != "" && record.Suffix != "":
			summary.Hint = fmt.Sprintf("%s…%s", record.Prefix, record.Suffix)
		case record.Prefix != "":
			summary.Hint = record.Prefix + "…"
		case record.Suffix != "":
			summary.Hint = "…" + record.Suffix
		}

		switch {
		case strings.EqualFold(record.Name, "Environment token"):
			summary.Source = "environment"
		case strings.EqualFold(record.Name, "Legacy token"):
			summary.Source = "legacy"
		default:
			summary.Source = "user"
		}

		tokens = append(tokens, summary)
	}

	diag.Tokens = tokens
	diag.UnusedTokenCount = unusedCount

	if len(cfg.APITokens) > 0 {
		if unusedCount == len(cfg.APITokens) {
			appendNote("Configured API tokens have not been used yet. Update your agents or automations to switch to the new tokens.")
		} else if unusedCount > 0 {
			appendNote(fmt.Sprintf("%d API token(s) have never been used. Remove unused tokens or update the corresponding agents.", unusedCount))
		}
	}

	tokenUsage := make(map[string][]string)
	legacyHosts := 0
	if monitor != nil {
		for _, host := range monitor.GetDockerHosts() {
			name := preferredDockerHostName(host)
			tokenID := strings.TrimSpace(host.TokenID)
			if tokenID == "" {
				legacyHosts++
				continue
			}
			tokenUsage[tokenID] = append(tokenUsage[tokenID], name)
		}
	}

	diag.LegacyDockerHostCount = legacyHosts
	if legacyHosts > 0 {
		appendNote(fmt.Sprintf("%d Docker host(s) still rely on the shared API token. Generate dedicated tokens and rerun the installer from Settings → Docker Agents.", legacyHosts))
	}

	if len(tokenUsage) > 0 {
		keys := make([]string, 0, len(tokenUsage))
		for tokenID := range tokenUsage {
			keys = append(keys, tokenID)
		}
		sort.Strings(keys)

		diag.Usage = make([]APITokenUsage, 0, len(keys))
		for _, tokenID := range keys {
			hosts := tokenUsage[tokenID]
			sort.Strings(hosts)
			diag.Usage = append(diag.Usage, APITokenUsage{
				TokenID:   tokenID,
				HostCount: len(hosts),
				Hosts:     hosts,
			})
		}
	}

	if envTokens {
		appendNote("Environment-based API token detected. Migrate to tokens created in the UI for per-token tracking and safer rotation.")
	}
	if legacyToken {
		appendNote("Legacy token detected. Generate new API tokens and update integrations to benefit from per-token management.")
	}

	return diag
}

// buildDockerAgentDiagnostic summarizes Docker agent health: version drift,
// token bindings, stale commands, and hosts needing user action.
func buildDockerAgentDiagnostic(m *monitoring.Monitor, serverVersion string) *DockerAgentDiagnostic {
	if m == nil {
		return nil
	}

	hosts := m.GetDockerHosts()
	diag := &DockerAgentDiagnostic{
		HostsTotal:              len(hosts),
		RecommendedAgentVersion: normalizeVersionLabel(serverVersion),
	}

	appendNote := func(note string) {
		if note == "" || contains(diag.Notes, note) {
			return
		}
		diag.Notes = append(diag.Notes, note)
	}

	if len(hosts) == 0 {
		appendNote("No Docker agents have reported in yet. Use Settings → Docker Agents to install the container-side agent and unlock remote commands.")
		return diag
	}

	var (
		serverVer        *updates.Version
		recommendedLabel = diag.RecommendedAgentVersion
	)
	if serverVersion != "" {
		if parsed, err := updates.ParseVersion(serverVersion); err == nil {
			serverVer = parsed
			recommendedLabel = normalizeVersionLabel(parsed.String())
			diag.RecommendedAgentVersion = recommendedLabel
		}
	}

	now := time.Now().UTC()
	legacyTokenHosts := 0
	for _, host := range hosts {
		status := strings.ToLower(strings.TrimSpace(host.Status))
		if status == "online" {
			diag.HostsOnline++
		}
		versionStr := strings.TrimSpace(host.AgentVersion)
		if versionStr != "" {
			diag.HostsReportingVersion++
		} else {
			diag.HostsWithoutVersion++
		}

		if strings.TrimSpace(host.TokenID) != "" {
			diag.HostsWithTokenBinding++
		} else {
			legacyTokenHosts++
		}

		issues := make([]string, 0, 4)

		if status != "online" && status != "" {
			issues = append(issues, fmt.Sprintf("Host reports status %q.", status))
		}

		if versionStr == "" {
			issues = append(issues, "Agent has not reported a version (pre v4.24). Reinstall using Settings → Docker Agents.")
		} else if serverVer != nil {
			if agentVer, err := updates.ParseVersion(versionStr); err == nil {
				if agentVer.Compare(serverVer) < 0 {
					diag.HostsOutdatedVersion++
					issues = append(issues, fmt.Sprintf("Agent version %s lags behind the recommended %s. Re-run the installer to update.", normalizeVersionLabel(versionStr), recommendedLabel))
				}
			} else {
				issues = append(issues, fmt.Sprintf("Unrecognized agent version string %q. Reinstall to ensure command support.", versionStr))
			}
		}

		if strings.TrimSpace(host.TokenID) == "" {
			issues = append(issues, "Host is still using the shared API token. Generate a dedicated token in Settings → Security and rerun the installer.")
		}

		if !host.LastSeen.IsZero() && now.Sub(host.LastSeen.UTC()) > 10*time.Minute {
			issues = append(issues, fmt.Sprintf("No heartbeat since %s. Verify the agent container is running.", host.LastSeen.UTC().Format(time.RFC3339)))
		}

		if host.Command != nil {
			cmdStatus := strings.ToLower(strings.TrimSpace(host.Command.Status))
			switch cmdStatus {
			case monitoring.DockerCommandStatusQueued, monitoring.DockerCommandStatusDispatched, monitoring.DockerCommandStatusAcknowledged:
				message := fmt.Sprintf("Command %s is still in progress.", cmdStatus)
				if !host.Command.UpdatedAt.IsZero() && now.Sub(host.Command.UpdatedAt.UTC()) > 15*time.Minute {
					diag.HostsWithStaleCommand++
					message = fmt.Sprintf("Command %s has been pending since %s; consider allowing re-enrolment.", cmdStatus, host.Command.UpdatedAt.UTC().Format(time.RFC3339))
				}
				issues = append(issues, message)
			}
		}

		if host.PendingUninstall {
			diag.HostsPendingUninstall++
			issues = append(issues, "Host is pending uninstall; confirm the agent container stopped or clear the flag.")
		}

		if len(issues) == 0 {
			continue
		}

		diag.Attention = append(diag.Attention, DockerAgentAttention{
			HostID:       host.ID,
			Name:         preferredDockerHostName(host),
			Status:       host.Status,
			AgentVersion: versionStr,
			TokenHint:    host.TokenHint,
			LastSeen:     formatTimeMaybe(host.LastSeen),
			Issues:       issues,
		})
	}

	diag.HostsWithoutTokenBinding = legacyTokenHosts
	diag.HostsNeedingAttention = len(diag.Attention)

	if legacyTokenHosts > 0 {
		appendNote(fmt.Sprintf("%d Docker host(s) still rely on the shared API token. Migrate each host to a dedicated token via Settings → Security and rerun the installer.", legacyTokenHosts))
	}
	if diag.HostsOutdatedVersion > 0 {
		appendNote(fmt.Sprintf("%d Docker host(s) run an out-of-date agent. Re-run the installer from Settings → Docker Agents to upgrade them.", diag.HostsOutdatedVersion))
	}
	if diag.HostsWithoutVersion > 0 {
		appendNote(fmt.Sprintf("%d Docker host(s) have not reported an agent version yet. Reinstall the agent to enable the new command system.", diag.HostsWithoutVersion))
	}
	if diag.HostsWithStaleCommand > 0 {
		appendNote(fmt.Sprintf("%d Docker host command(s) appear stuck. Use the 'Allow re-enroll' action in Settings → Docker Agents to reset them.", diag.HostsWithStaleCommand))
	}
	if diag.HostsPendingUninstall > 0 {
		appendNote(fmt.Sprintf("%d Docker host(s) are pending uninstall. Confirm the uninstall or clear the flag from Settings → Docker Agents.", diag.HostsPendingUninstall))
	}
	if diag.HostsNeedingAttention == 0 {
		appendNote("All Docker agents are reporting with dedicated tokens and the expected version.")
	}

	return diag
}

// buildAlertsDiagnostic summarizes alert configuration migration state,
// flagging legacy thresholds and schedule settings that still need updating.
func buildAlertsDiagnostic(m *monitoring.Monitor) *AlertsDiagnostic {
	if m == nil {
		return nil
	}

	manager := m.GetAlertManager()
	if manager == nil {
		return nil
	}

	// Named alertCfg so the local value does not shadow the config package.
	alertCfg := manager.GetConfig()
	diag := &AlertsDiagnostic{}

	appendNote := func(note string) {
		if note == "" || contains(diag.Notes, note) {
			return
		}
		diag.Notes = append(diag.Notes, note)
	}

	legacySources := make([]string, 0, 4)
	if hasLegacyThresholds(alertCfg.GuestDefaults) {
		diag.LegacyThresholdsDetected = true
		legacySources = append(legacySources, "guest-defaults")
	}
	if hasLegacyThresholds(alertCfg.NodeDefaults) {
		diag.LegacyThresholdsDetected = true
		legacySources = append(legacySources, "node-defaults")
	}

	overrideIndex := 0
	for _, override := range alertCfg.Overrides {
		overrideIndex++
		if hasLegacyThresholds(override) {
			diag.LegacyThresholdsDetected = true
			legacySources = append(legacySources, fmt.Sprintf("override-%d", overrideIndex))
		}
	}

	for idx, rule := range alertCfg.CustomRules {
		if hasLegacyThresholds(rule.Thresholds) {
			diag.LegacyThresholdsDetected = true
			legacySources = append(legacySources, fmt.Sprintf("custom-%d", idx+1))
		}
	}

	if len(legacySources) > 0 {
		sort.Strings(legacySources)
		diag.LegacyThresholdSources = legacySources
		appendNote("Some alert rules still rely on legacy single-value thresholds. Edit and save them to enable hysteresis-based alerts.")
	}

	legacySchedule := make([]string, 0, 2)
	if alertCfg.TimeThreshold > 0 {
		legacySchedule = append(legacySchedule, "timeThreshold")
		appendNote("Global alert delay still uses the legacy timeThreshold setting. Save the alerts configuration to migrate to per-metric delays.")
	}
	if alertCfg.Schedule.GroupingWindow > 0 && alertCfg.Schedule.Grouping.Window == 0 {
		legacySchedule = append(legacySchedule, "groupingWindow")
		appendNote("Alert grouping uses the deprecated groupingWindow value. Update the schedule to use the new grouping options.")
	}
	if len(legacySchedule) > 0 {
		sort.Strings(legacySchedule)
		diag.LegacyScheduleSettings = legacySchedule
	}

	if alertCfg.Schedule.Cooldown <= 0 {
		diag.MissingCooldown = true
		appendNote("Alert cooldown is not configured. Set a cooldown under Alerts → Schedule to prevent alert storms.")
	}
	if alertCfg.Schedule.Grouping.Window <= 0 {
		diag.MissingGroupingWindow = true
		appendNote("Alert grouping window is disabled. Configure a grouping window to bundle related alerts.")
	}

	return diag
}

// fingerprintPublicKey parses an authorized_keys-format public key and
// returns its SHA-256 fingerprint.
func fingerprintPublicKey(pub string) (string, error) {
	pub = strings.TrimSpace(pub)
	if pub == "" {
		return "", fmt.Errorf("empty public key")
	}
	key, _, _, _, err := ssh.ParseAuthorizedKey([]byte(pub))
	if err != nil {
		return "", err
	}
	return ssh.FingerprintSHA256(key), nil
}
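
// A usage sketch (the key and digest below are illustrative placeholders):
//
//	fp, err := fingerprintPublicKey("ssh-ed25519 AAAAC3... user@host")
//	// On success, fp has OpenSSH's standard form: "SHA256:" followed by the
//	// unpadded base64 digest, e.g. "SHA256:mVqzYF3hZcw...".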

// resolveUserName maps a numeric UID to a username, falling back to
// "uid:<n>" when the lookup fails.
func resolveUserName(uid uint32) string {
	uidStr := strconv.FormatUint(uint64(uid), 10)
	if usr, err := user.LookupId(uidStr); err == nil && usr.Username != "" {
		return usr.Username
	}
	return "uid:" + uidStr
}

// resolveGroupName maps a numeric GID to a group name, falling back to
// "gid:<n>" when the lookup fails.
func resolveGroupName(gid uint32) string {
	gidStr := strconv.FormatUint(uint64(gid), 10)
	if grp, err := user.LookupGroupId(gidStr); err == nil && grp != nil && grp.Name != "" {
		return grp.Name
	}
	return "gid:" + gidStr
}

// countLegacySSHKeys counts key files (names starting with "id_", e.g.
// id_rsa or id_ed25519) in dir; a missing directory counts as zero.
func countLegacySSHKeys(dir string) (int, error) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		if errors.Is(err, os.ErrNotExist) {
			return 0, nil
		}
		return 0, err
	}
	count := 0
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		if strings.HasPrefix(entry.Name(), "id_") {
			count++
		}
	}
	return count, nil
}

// hasLegacyThresholds reports whether any of the single-value legacy
// threshold fields are still set on th.
func hasLegacyThresholds(th alerts.ThresholdConfig) bool {
	return th.CPULegacy != nil ||
		th.MemoryLegacy != nil ||
		th.DiskLegacy != nil ||
		th.DiskReadLegacy != nil ||
		th.DiskWriteLegacy != nil ||
		th.NetworkInLegacy != nil ||
		th.NetworkOutLegacy != nil
}

// preferredDockerHostName picks the most human-friendly identifier for a
// Docker host: display name, then hostname, then agent ID, then raw ID.
func preferredDockerHostName(host models.DockerHost) string {
	if name := strings.TrimSpace(host.DisplayName); name != "" {
		return name
	}
	if name := strings.TrimSpace(host.Hostname); name != "" {
		return name
	}
	if name := strings.TrimSpace(host.AgentID); name != "" {
		return name
	}
	return host.ID
}

// formatTimeMaybe renders t as RFC 3339 in UTC, or "" for the zero time.
func formatTimeMaybe(t time.Time) string {
	if t.IsZero() {
		return ""
	}
	return t.UTC().Format(time.RFC3339)
}

// normalizeVersionLabel prefixes a bare numeric version with "v"; values
// that are empty, already prefixed, or non-numeric are returned as-is.
func normalizeVersionLabel(raw string) string {
	value := strings.TrimSpace(raw)
	if value == "" {
		return ""
	}
	if strings.HasPrefix(value, "v") {
		return value
	}
	if first := value[0]; first < '0' || first > '9' {
		return value
	}
	return "v" + value
}
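
// A few examples of normalizeVersionLabel behavior, derived from the logic
// above:
//
//	normalizeVersionLabel("4.24.0")  // "v4.24.0"
//	normalizeVersionLabel("v4.24.0") // "v4.24.0" (already prefixed)
//	normalizeVersionLabel("dev")     // "dev" (non-numeric left untouched)
//	normalizeVersionLabel("  ")      // ""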

// checkVMDiskMonitoring performs diagnostic checks for VM disk monitoring.
func (r *Router) checkVMDiskMonitoring(ctx context.Context, client *proxmox.Client, _ string) *VMDiskCheckResult {
	result := &VMDiskCheckResult{
		Recommendations: []string{},
		Permissions:     []string{},
	}

	// Get all nodes to check
	nodes, err := client.GetNodes(ctx)
	if err != nil {
		result.TestResult = "Failed to get nodes: " + err.Error()
		return result
	}

	if len(nodes) == 0 {
		result.TestResult = "No nodes found"
		return result
	}

	// Fetch VMs once per node and keep a lookup map
	nodeVMMap := make(map[string][]proxmox.VM)
	var allVMs []proxmox.VM
	for _, node := range nodes {
		vmCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
		vms, err := client.GetVMs(vmCtx, node.Node)
		cancel()
		if err != nil {
			log.Debug().Err(err).Str("node", node.Node).Msg("Failed to get VMs from node")
			continue
		}
		nodeVMMap[node.Node] = vms
		allVMs = append(allVMs, vms...)
	}

	result.VMsFound = len(allVMs)
	vms := allVMs

	if len(vms) == 0 {
		result.TestResult = "No VMs found to test"
		result.Recommendations = append(result.Recommendations, "Create a test VM to verify disk monitoring")
		return result
	}

	// Check VMs for agent and disk data
	var testVM *proxmox.VM
	var testVMNode string
	result.ProblematicVMs = []VMDiskIssue{}
	for i := range vms {
		vm := vms[i]
		if vm.Template == 0 && vm.Status == "running" {
			vmNode := strings.TrimSpace(vm.Node)
			if vmNode == "" {
				continue
			}

			// Check if agent is configured
			statusCtx, statusCancel := context.WithTimeout(ctx, 10*time.Second)
			vmStatus, err := client.GetVMStatus(statusCtx, vmNode, vm.VMID)
			statusCancel()
			if err != nil {
				result.ProblematicVMs = append(result.ProblematicVMs, VMDiskIssue{
					VMID:   vm.VMID,
					Name:   vm.Name,
					Status: vm.Status,
					Issue:  "Failed to get VM status: " + err.Error(),
				})
			} else if vmStatus != nil && vmStatus.Agent.Value > 0 {
				result.VMsWithAgent++

				// Try to get filesystem info
				fsCtx, fsCancel := context.WithTimeout(ctx, 10*time.Second)
				fsInfo, err := client.GetVMFSInfo(fsCtx, vmNode, vm.VMID)
				fsCancel()
				if err != nil {
					result.ProblematicVMs = append(result.ProblematicVMs, VMDiskIssue{
						VMID:   vm.VMID,
						Name:   vm.Name,
						Status: vm.Status,
						Issue:  "Agent enabled but failed to get filesystem info: " + err.Error(),
					})
					if testVM == nil {
						testVM = &vms[i]
						testVMNode = vmNode
					}
				} else if len(fsInfo) == 0 {
					result.ProblematicVMs = append(result.ProblematicVMs, VMDiskIssue{
						VMID:   vm.VMID,
						Name:   vm.Name,
						Status: vm.Status,
						Issue:  "Agent returned no filesystem info",
					})
					if testVM == nil {
						testVM = &vms[i]
						testVMNode = vmNode
					}
				} else {
					// Check if we get usable disk data
					hasUsableFS := false
					for _, fs := range fsInfo {
						if fs.Type != "tmpfs" && fs.Type != "devtmpfs" &&
							!strings.HasPrefix(fs.Mountpoint, "/dev") &&
							!strings.HasPrefix(fs.Mountpoint, "/proc") &&
							!strings.HasPrefix(fs.Mountpoint, "/sys") &&
							fs.TotalBytes > 0 {
							hasUsableFS = true
							break
						}
					}

					if hasUsableFS {
						result.VMsWithDiskData++
					} else {
						result.ProblematicVMs = append(result.ProblematicVMs, VMDiskIssue{
							VMID:   vm.VMID,
							Name:   vm.Name,
							Status: vm.Status,
							Issue:  fmt.Sprintf("Agent returned %d filesystems but none are usable for disk metrics", len(fsInfo)),
						})
					}

					if testVM == nil {
						testVM = &vms[i]
						testVMNode = vmNode
					}
				}
			} else if vmStatus != nil {
				// Agent not enabled
				result.ProblematicVMs = append(result.ProblematicVMs, VMDiskIssue{
					VMID:   vm.VMID,
					Name:   vm.Name,
					Status: vm.Status,
					Issue:  "Guest agent not enabled in VM configuration",
				})
			}
		}
	}

	// Perform detailed test on one VM
	if testVM != nil {
		result.TestVMID = testVM.VMID
		result.TestVMName = testVM.Name

		// Check VM status for agent
		statusCtx, statusCancel := context.WithTimeout(ctx, 10*time.Second)
		vmStatus, err := client.GetVMStatus(statusCtx, testVMNode, testVM.VMID)
		statusCancel()
		if err != nil {
			errStr := err.Error()
			result.TestResult = "Failed to get VM status: " + errStr
			if errors.Is(err, context.DeadlineExceeded) || strings.Contains(errStr, "context deadline exceeded") {
				result.Recommendations = append(result.Recommendations,
					"VM status request timed out; check network connectivity to the node",
					"If this persists, increase the diagnostics timeout or reduce VM load during checks",
				)
			} else if strings.Contains(errStr, "403") || strings.Contains(errStr, "401") {
				result.Recommendations = append(result.Recommendations,
					"Ensure API token has PVEAuditor role for baseline access",
					"Add VM.GuestAgent.Audit (PVE 9) or VM.Monitor (PVE 8) privileges; Pulse setup adds these via the PulseMonitor role",
					"Include Sys.Audit when available for Ceph metrics",
				)
			} else {
				result.Recommendations = append(result.Recommendations,
					"Verify the node is reachable and API token is valid",
				)
			}
		} else if vmStatus == nil || vmStatus.Agent.Value == 0 {
			result.TestResult = "Guest agent not enabled in VM configuration"
			result.Recommendations = append(result.Recommendations,
				"Enable QEMU Guest Agent in VM Options",
				"Install qemu-guest-agent package in the VM")
		} else {
			// Try to get filesystem info
			fsCtx, fsCancel := context.WithTimeout(ctx, 10*time.Second)
			fsInfo, err := client.GetVMFSInfo(fsCtx, testVMNode, testVM.VMID)
			fsCancel()
			if err != nil {
				errStr := err.Error()
				if strings.Contains(errStr, "500") || strings.Contains(errStr, "not running") {
					result.TestResult = "Guest agent not running inside VM"
					result.Recommendations = append(result.Recommendations,
						"SSH into VM and run: systemctl status qemu-guest-agent",
						"If not installed: apt install qemu-guest-agent",
						"If installed but not running: systemctl start qemu-guest-agent")
				} else if strings.Contains(errStr, "403") || strings.Contains(errStr, "401") {
					result.TestResult = "Permission denied accessing guest agent"
					result.Recommendations = append(result.Recommendations,
						"Ensure API token has PVEAuditor role for baseline access",
						"Add VM.GuestAgent.Audit (PVE 9) or VM.Monitor (PVE 8) privileges; Pulse setup adds these via the PulseMonitor role",
						"Include Sys.Audit when available for Ceph metrics")
				} else if errors.Is(err, context.DeadlineExceeded) || strings.Contains(errStr, "context deadline exceeded") {
					result.TestResult = "Guest agent request timed out"
					result.Recommendations = append(result.Recommendations,
						"Ensure the VM responds to guest agent queries promptly",
						"Consider increasing the diagnostics timeout if the environment is large",
					)
				} else {
					result.TestResult = "Failed to get guest agent data: " + errStr
				}
			} else if len(fsInfo) == 0 {
				result.TestResult = "Guest agent returned no filesystem info"
				result.Recommendations = append(result.Recommendations,
					"Guest agent may need restart inside VM",
					"Check VM has mounted filesystems")
			} else {
				// Calculate disk usage from filesystem info
				var totalBytes, usedBytes uint64
				result.FilesystemsFound = []FilesystemDetail{}

				for _, fs := range fsInfo {
					fsDetail := FilesystemDetail{
						Mountpoint: fs.Mountpoint,
						Type:       fs.Type,
						Total:      fs.TotalBytes,
						Used:       fs.UsedBytes,
					}

					// Check if this filesystem should be filtered
					if fs.Type == "tmpfs" || fs.Type == "devtmpfs" {
						fsDetail.Filtered = true
						fsDetail.Reason = "Special filesystem type"
					} else if strings.HasPrefix(fs.Mountpoint, "/dev") ||
						strings.HasPrefix(fs.Mountpoint, "/proc") ||
						strings.HasPrefix(fs.Mountpoint, "/sys") ||
						strings.HasPrefix(fs.Mountpoint, "/run") ||
						fs.Mountpoint == "/boot/efi" {
						fsDetail.Filtered = true
						fsDetail.Reason = "System mount point"
					} else if fs.TotalBytes == 0 {
						fsDetail.Filtered = true
						fsDetail.Reason = "Zero total bytes"
					} else {
						// This filesystem counts toward disk usage
						totalBytes += fs.TotalBytes
						usedBytes += fs.UsedBytes
					}

					result.FilesystemsFound = append(result.FilesystemsFound, fsDetail)
				}

				if totalBytes > 0 {
					percent := float64(usedBytes) / float64(totalBytes) * 100
					result.TestResult = fmt.Sprintf("SUCCESS: Guest agent working! Disk usage: %.1f%% (%d/%d bytes)",
						percent, usedBytes, totalBytes)
				} else {
					result.TestResult = fmt.Sprintf("Guest agent returned %d filesystems but no usable disk data (all filtered out)", len(fsInfo))
				}
			}
		}
	} else {
		result.TestResult = "No running VMs found to test"
		result.Recommendations = append(result.Recommendations, "Start a VM to test disk monitoring")
	}

	// Add general recommendations based on results
	if result.VMsWithAgent > 0 && result.VMsWithDiskData == 0 {
		result.Recommendations = append(result.Recommendations,
			"Guest agent is configured but not providing disk data",
			"Check guest agent is running inside VMs",
			"Verify API token permissions")
	}

	return result
}

// checkPhysicalDisks performs diagnostic checks for physical disk detection.
func (r *Router) checkPhysicalDisks(ctx context.Context, client *proxmox.Client, _ string) *PhysicalDiskCheck {
	result := &PhysicalDiskCheck{
		Recommendations: []string{},
		NodeResults:     []NodeDiskResult{},
	}

	// Get all nodes
	nodes, err := client.GetNodes(ctx)
	if err != nil {
		result.TestResult = "Failed to get nodes: " + err.Error()
		return result
	}

	result.NodesChecked = len(nodes)

	// Check each node for physical disks
	for _, node := range nodes {
		nodeResult := NodeDiskResult{
			NodeName: node.Node,
		}

		// Skip offline nodes
		if node.Status != "online" {
			nodeResult.Error = "Node is offline"
			result.NodeResults = append(result.NodeResults, nodeResult)
			continue
		}

		// Try to get disk list
		diskCtx, diskCancel := context.WithTimeout(ctx, 10*time.Second)
		disks, err := client.GetDisks(diskCtx, node.Node)
		diskCancel()
		if err != nil {
			errStr := err.Error()
			nodeResult.Error = errStr

			// Provide specific recommendations based on the error
			if strings.Contains(errStr, "401") || strings.Contains(errStr, "403") {
				nodeResult.APIResponse = "Permission denied"
				if !contains(result.Recommendations, "Check API token has sufficient permissions for disk monitoring") {
					result.Recommendations = append(result.Recommendations,
						"Check API token has sufficient permissions for disk monitoring",
						"Token needs at least PVEAuditor role on the node")
				}
			} else if errors.Is(err, context.DeadlineExceeded) || strings.Contains(errStr, "context deadline exceeded") {
				nodeResult.APIResponse = "Timeout"
				if !contains(result.Recommendations, "Disk query timed out; verify node connectivity and load") {
					result.Recommendations = append(result.Recommendations,
						"Disk query timed out; verify node connectivity and load",
						"Increase diagnostics timeout if nodes are slow to respond")
				}
			} else if strings.Contains(errStr, "404") || strings.Contains(errStr, "501") {
				nodeResult.APIResponse = "Endpoint not available"
				if !contains(result.Recommendations, "Node may be running older Proxmox version without disk API support") {
					result.Recommendations = append(result.Recommendations,
						"Node may be running older Proxmox version without disk API support",
						"Check if node is running on non-standard hardware (Raspberry Pi, etc)")
				}
			} else {
				nodeResult.APIResponse = "API error"
			}
		} else {
			nodeResult.DiskCount = len(disks)
			if len(disks) > 0 {
				result.NodesWithDisks++
				result.TotalDisks += len(disks)

				// List disk devices
				for _, disk := range disks {
					nodeResult.DiskDevices = append(nodeResult.DiskDevices, disk.DevPath)
				}
			} else {
				nodeResult.APIResponse = "Empty response (no traditional disks found)"
				// This can be normal for SD card or USB based systems
				if !contains(result.Recommendations, "Some nodes returned no disks - may be using SD cards or USB storage") {
					result.Recommendations = append(result.Recommendations,
						"Some nodes returned no disks - may be using SD cards or USB storage",
						"Proxmox disk API only returns SATA/NVMe/SAS disks, not SD cards")
				}
			}
		}

		result.NodeResults = append(result.NodeResults, nodeResult)
	}

	// Generate summary
	if result.NodesChecked == 0 {
		result.TestResult = "No nodes found to check"
	} else if result.NodesWithDisks == 0 {
		result.TestResult = fmt.Sprintf("Checked %d nodes, none returned physical disks", result.NodesChecked)
	} else {
		result.TestResult = fmt.Sprintf("Found %d disks across %d of %d nodes",
			result.TotalDisks, result.NodesWithDisks, result.NodesChecked)
	}

	return result
}

// contains reports whether slice includes an exact match for str.
func contains(slice []string, str string) bool {
	for _, s := range slice {
		if s == str {
			return true
		}
	}
	return false
}

// containsFold reports whether slice includes candidate, comparing
// case-insensitively after trimming whitespace on both sides.
func containsFold(slice []string, candidate string) bool {
	target := strings.ToLower(strings.TrimSpace(candidate))
	if target == "" {
		return false
	}

	for _, s := range slice {
		if strings.ToLower(strings.TrimSpace(s)) == target {
			return true
		}
	}
	return false
}

// interfaceToStringSlice converts a decoded JSON value into a []string,
// returning nil for unsupported types.
func interfaceToStringSlice(value interface{}) []string {
	switch v := value.(type) {
	case []string:
		out := make([]string, len(v))
		copy(out, v)
		return out
	case []interface{}:
		result := make([]string, 0, len(v))
		for _, item := range v {
			if str, ok := item.(string); ok {
				result = append(result, str)
			}
		}
		return result
	default:
		return nil
	}
}

// buildAIChatDiagnostic reports on the AI chat service status. The handler
// may be nil during early startup, so every access is guarded.
func buildAIChatDiagnostic(cfg *config.Config, aiHandler *AIHandler) *AIChatDiagnostic {
	if cfg == nil {
		return nil
	}

	diag := &AIChatDiagnostic{
		Enabled: false,
		Notes:   []string{},
	}

	if aiHandler != nil {
		ctx := context.Background()
		aiCfg := aiHandler.GetAIConfig(ctx)
		if aiCfg != nil {
			diag.Enabled = aiCfg.Enabled
			diag.Model = aiCfg.GetChatModel()
		}

		svc := aiHandler.GetService(ctx)
		if svc != nil {
			diag.Running = svc.IsRunning()
			diag.Healthy = svc.IsRunning() // Consolidated for now: healthy simply mirrors running

			// Get connection details
			baseURL := svc.GetBaseURL()
			if baseURL != "" {
				diag.URL = baseURL
				// Parse the port from a scheme://host:port URL; splitting on
				// ":" yields the port as the third segment.
				if parts := strings.Split(baseURL, ":"); len(parts) > 2 {
					if port, err := strconv.Atoi(parts[2]); err == nil {
						diag.Port = port
					}
				}
			}

			// No direct MCP health probe is available here, so assume the MCP
			// connection is up whenever the service is running.
			diag.MCPConnected = diag.Running

			if !diag.Running && diag.Enabled {
				diag.Notes = append(diag.Notes, "Pulse Assistant service is enabled but not running")
			}
		} else if diag.Enabled {
			diag.Notes = append(diag.Notes, "Pulse Assistant service is nil")
		}
	} else {
		diag.Notes = append(diag.Notes, "Pulse Assistant handler not initialized")
	}

	return diag
}