From 19a67dd4f3efec85c81e422eb047edbd5f9e6c1e Mon Sep 17 00:00:00 2001 From: rcourtman Date: Wed, 28 Jan 2026 16:52:35 +0000 Subject: [PATCH] Update core infrastructure components Config: - AI configuration improvements - API tokens handling - Persistence layer updates Host Agent: - Command execution improvements - Better test coverage Infrastructure Discovery: - Service improvements - Enhanced test coverage Models: - State snapshot updates - Model improvements Monitoring: - Polling improvements - Guest config handling - Storage config support WebSocket: - Hub tenant test updates Service Discovery: - New service discovery module --- internal/config/ai.go | 34 +- internal/config/ai_additional_test.go | 31 +- internal/config/api_tokens.go | 16 + internal/config/persistence.go | 19 +- internal/hostagent/commands.go | 44 +- internal/hostagent/commands_execute_test.go | 10 +- internal/hostagent/commands_test.go | 108 + internal/infradiscovery/service.go | 26 +- internal/infradiscovery/service_test.go | 8 +- internal/models/models.go | 1 + internal/models/state_snapshot.go | 216 ++ internal/monitoring/guest_config.go | 89 + internal/monitoring/monitor_polling.go | 42 + internal/monitoring/storage_config.go | 89 + internal/servicediscovery/commands.go | 526 +++++ internal/servicediscovery/commands_test.go | 81 + internal/servicediscovery/deep_scanner.go | 475 +++++ .../servicediscovery/deep_scanner_test.go | 395 ++++ internal/servicediscovery/fingerprint.go | 249 +++ internal/servicediscovery/formatters.go | 629 ++++++ internal/servicediscovery/formatters_test.go | 218 ++ internal/servicediscovery/service.go | 1753 +++++++++++++++++ internal/servicediscovery/service_test.go | 797 ++++++++ internal/servicediscovery/store.go | 651 ++++++ internal/servicediscovery/store_test.go | 469 +++++ internal/servicediscovery/tools_adapter.go | 226 +++ internal/servicediscovery/types.go | 298 +++ internal/servicediscovery/types_test.go | 22 + internal/websocket/hub_tenant_test.go | 4 
+- 29 files changed, 7459 insertions(+), 67 deletions(-) create mode 100644 internal/monitoring/guest_config.go create mode 100644 internal/monitoring/storage_config.go create mode 100644 internal/servicediscovery/commands.go create mode 100644 internal/servicediscovery/commands_test.go create mode 100644 internal/servicediscovery/deep_scanner.go create mode 100644 internal/servicediscovery/deep_scanner_test.go create mode 100644 internal/servicediscovery/fingerprint.go create mode 100644 internal/servicediscovery/formatters.go create mode 100644 internal/servicediscovery/formatters_test.go create mode 100644 internal/servicediscovery/service.go create mode 100644 internal/servicediscovery/service_test.go create mode 100644 internal/servicediscovery/store.go create mode 100644 internal/servicediscovery/store_test.go create mode 100644 internal/servicediscovery/tools_adapter.go create mode 100644 internal/servicediscovery/types.go create mode 100644 internal/servicediscovery/types_test.go diff --git a/internal/config/ai.go b/internal/config/ai.go index 6cc3f6d71..931c49d94 100644 --- a/internal/config/ai.go +++ b/internal/config/ai.go @@ -69,13 +69,13 @@ type AIConfig struct { ProtectedGuests []string `json:"protected_guests,omitempty"` // VMIDs or names that AI cannot control // Patrol Autonomy settings - controls automatic investigation and remediation of findings - PatrolAutonomyLevel string `json:"patrol_autonomy_level,omitempty"` // "monitor", "approval", "full" + PatrolAutonomyLevel string `json:"patrol_autonomy_level,omitempty"` // "monitor", "approval", "assisted", "full" + PatrolFullModeUnlocked bool `json:"patrol_full_mode_unlocked"` // User has acknowledged Full mode risks (required to use "full") PatrolInvestigationBudget int `json:"patrol_investigation_budget,omitempty"` // Max turns per investigation (default: 15) PatrolInvestigationTimeoutSec int `json:"patrol_investigation_timeout_sec,omitempty"` // Max seconds per investigation (default: 300) - 
PatrolCriticalRequireApproval bool `json:"patrol_critical_require_approval"` // Critical findings always require approval (default: true) - // AI Discovery settings - controls automatic infrastructure discovery - DiscoveryEnabled bool `json:"discovery_enabled"` // Enable AI-powered infrastructure discovery + // Discovery settings - controls automatic infrastructure discovery + DiscoveryEnabled bool `json:"discovery_enabled"` // Enable infrastructure discovery DiscoveryIntervalHours int `json:"discovery_interval_hours,omitempty"` // Hours between automatic re-scans (0 = manual only, default: 0) } @@ -102,13 +102,12 @@ const ( const ( // PatrolAutonomyMonitor - Detect issues and create findings, no automatic investigation PatrolAutonomyMonitor = "monitor" - // PatrolAutonomyApproval - Spawn Chat sessions to investigate, queue fixes for user approval + // PatrolAutonomyApproval - Spawn Chat sessions to investigate, queue ALL fixes for user approval PatrolAutonomyApproval = "approval" - // PatrolAutonomyFull - Spawn Chat sessions to investigate, execute non-critical fixes automatically + // PatrolAutonomyAssisted - Auto-fix warnings, critical findings still need approval + PatrolAutonomyAssisted = "assisted" + // PatrolAutonomyFull - Full autonomy, auto-fix everything including critical (user accepts risk) PatrolAutonomyFull = "full" - // PatrolAutonomyAutonomous - Full autonomy, execute ALL fixes including destructive ones without approval - // User accepts full risk - similar to "auto-accept" mode in Claude Code - PatrolAutonomyAutonomous = "autonomous" ) // Default patrol investigation settings @@ -577,8 +576,11 @@ func (c *AIConfig) GetPatrolAutonomyLevel() string { return PatrolAutonomyMonitor } switch c.PatrolAutonomyLevel { - case PatrolAutonomyMonitor, PatrolAutonomyApproval, PatrolAutonomyFull, PatrolAutonomyAutonomous: + case PatrolAutonomyMonitor, PatrolAutonomyApproval, PatrolAutonomyAssisted, PatrolAutonomyFull: return c.PatrolAutonomyLevel + // Migration: 
treat old "autonomous" as new "full" + case "autonomous": + return PatrolAutonomyFull default: return PatrolAutonomyMonitor } @@ -614,20 +616,10 @@ func (c *AIConfig) GetPatrolInvestigationTimeout() time.Duration { return time.Duration(c.PatrolInvestigationTimeoutSec) * time.Second } -// ShouldCriticalRequireApproval returns whether critical findings should always require approval -// Defaults to true for safety -func (c *AIConfig) ShouldCriticalRequireApproval() bool { - // This is a safety feature, default to true - // The JSON field uses the default Go behavior (false when not set), - // so we explicitly check if it was intended to be false - // For backwards compatibility, treat unset as true - return c.PatrolCriticalRequireApproval || c.PatrolAutonomyLevel == "" -} - // IsValidPatrolAutonomyLevel checks if a patrol autonomy level string is valid func IsValidPatrolAutonomyLevel(level string) bool { switch level { - case PatrolAutonomyMonitor, PatrolAutonomyApproval, PatrolAutonomyFull, PatrolAutonomyAutonomous: + case PatrolAutonomyMonitor, PatrolAutonomyApproval, PatrolAutonomyAssisted, PatrolAutonomyFull: return true default: return false diff --git a/internal/config/ai_additional_test.go b/internal/config/ai_additional_test.go index 80d24d985..54a705592 100644 --- a/internal/config/ai_additional_test.go +++ b/internal/config/ai_additional_test.go @@ -52,14 +52,26 @@ func TestAIConfig_PatrolSettings(t *testing.T) { t.Fatalf("patrol autonomy should be disabled by default") } + // Test all valid levels + cfg.PatrolAutonomyLevel = PatrolAutonomyAssisted + if got := cfg.GetPatrolAutonomyLevel(); got != PatrolAutonomyAssisted { + t.Fatalf("patrol autonomy = %q, want assisted", got) + } + cfg.PatrolAutonomyLevel = PatrolAutonomyFull if got := cfg.GetPatrolAutonomyLevel(); got != PatrolAutonomyFull { - t.Fatalf("patrol autonomy = %q", got) + t.Fatalf("patrol autonomy = %q, want full", got) } if !cfg.IsPatrolAutonomyEnabled() { t.Fatalf("patrol autonomy should be 
enabled for full mode") } + // Test migration: old "autonomous" maps to new "full" + cfg.PatrolAutonomyLevel = "autonomous" + if got := cfg.GetPatrolAutonomyLevel(); got != PatrolAutonomyFull { + t.Fatalf("patrol autonomy = %q, want full (migrated from autonomous)", got) + } + cfg.PatrolAutonomyLevel = "invalid" if got := cfg.GetPatrolAutonomyLevel(); got != PatrolAutonomyMonitor { t.Fatalf("invalid autonomy should fallback to monitor, got %q", got) @@ -94,17 +106,6 @@ func TestAIConfig_PatrolSettings(t *testing.T) { if got := cfg.GetPatrolInvestigationTimeout(); got.Seconds() != 120 { t.Fatalf("timeout should be 120s, got %s", got) } - - cfg.PatrolAutonomyLevel = "" - cfg.PatrolCriticalRequireApproval = false - if !cfg.ShouldCriticalRequireApproval() { - t.Fatalf("critical approval should default to true when level unset") - } - - cfg.PatrolAutonomyLevel = PatrolAutonomyMonitor - if cfg.ShouldCriticalRequireApproval() { - t.Fatalf("critical approval should be false when explicitly disabled") - } } func TestAIConfig_ProtectedGuestsAndValidation(t *testing.T) { @@ -131,4 +132,10 @@ func TestAIConfig_ProtectedGuestsAndValidation(t *testing.T) { if !IsValidPatrolAutonomyLevel(PatrolAutonomyApproval) { t.Fatalf("expected patrol approval to be valid") } + if !IsValidPatrolAutonomyLevel(PatrolAutonomyAssisted) { + t.Fatalf("expected patrol assisted to be valid") + } + if !IsValidPatrolAutonomyLevel(PatrolAutonomyFull) { + t.Fatalf("expected patrol full to be valid") + } } diff --git a/internal/config/api_tokens.go b/internal/config/api_tokens.go index 6a5cb84d8..4550f0ebd 100644 --- a/internal/config/api_tokens.go +++ b/internal/config/api_tokens.go @@ -297,6 +297,22 @@ func (c *Config) ValidateAPIToken(rawToken string) (*APITokenRecord, bool) { return nil, false } +// IsValidAPIToken checks if a token is valid without mutating any metadata. +// Use this for read-only checks like admin verification where you don't need +// to update LastUsedAt or get the full record. 
Safe to call under RLock. +func (c *Config) IsValidAPIToken(rawToken string) bool { + if rawToken == "" { + return false + } + + for _, record := range c.APITokens { + if auth.CompareAPIToken(rawToken, record.Hash) { + return true + } + } + return false +} + // UpsertAPIToken inserts or replaces a record by ID. func (c *Config) UpsertAPIToken(record APITokenRecord) { record.ensureScopes() diff --git a/internal/config/persistence.go b/internal/config/persistence.go index d1cc90937..2cc90dfb9 100644 --- a/internal/config/persistence.go +++ b/internal/config/persistence.go @@ -1959,12 +1959,19 @@ type PatrolRunHistoryData struct { // PatrolRunRecord represents a single patrol check run type PatrolRunRecord struct { - ID string `json:"id"` - StartedAt time.Time `json:"started_at"` - CompletedAt time.Time `json:"completed_at"` - DurationMs int64 `json:"duration_ms"` - Type string `json:"type"` // "quick" or "deep" - ResourcesChecked int `json:"resources_checked"` + ID string `json:"id"` + StartedAt time.Time `json:"started_at"` + CompletedAt time.Time `json:"completed_at"` + DurationMs int64 `json:"duration_ms"` + Type string `json:"type"` // "quick" or "deep" + TriggerReason string `json:"trigger_reason,omitempty"` + ScopeResourceIDs []string `json:"scope_resource_ids,omitempty"` + ScopeResourceTypes []string `json:"scope_resource_types,omitempty"` + ScopeDepth string `json:"scope_depth,omitempty"` + ScopeContext string `json:"scope_context,omitempty"` + AlertID string `json:"alert_id,omitempty"` + FindingID string `json:"finding_id,omitempty"` + ResourcesChecked int `json:"resources_checked"` // Breakdown by resource type NodesChecked int `json:"nodes_checked"` GuestsChecked int `json:"guests_checked"` diff --git a/internal/hostagent/commands.go b/internal/hostagent/commands.go index 8c6ff7c2e..a76b9366d 100644 --- a/internal/hostagent/commands.go +++ b/internal/hostagent/commands.go @@ -7,7 +7,9 @@ import ( "encoding/json" "fmt" "net/url" + "os" "os/exec" + "regexp" 
"runtime" "strings" "sync" @@ -17,6 +19,10 @@ import ( "github.com/rs/zerolog" ) +// safeTargetIDPattern validates target IDs to prevent shell injection. +// Allows alphanumeric, dash, underscore, period (no colons or special chars). +var safeTargetIDPattern = regexp.MustCompile(`^[a-zA-Z0-9._-]+$`) + var execCommandContext = exec.CommandContext // CommandClient handles WebSocket connection to Pulse for AI command execution @@ -387,15 +393,41 @@ func (c *CommandClient) handleExecuteCommand(ctx context.Context, conn *websocke } func wrapCommand(payload executeCommandPayload) string { - if payload.TargetType == "container" && payload.TargetID != "" { - return fmt.Sprintf("pct exec %s -- %s", payload.TargetID, payload.Command) - } - if payload.TargetType == "vm" && payload.TargetID != "" { - return fmt.Sprintf("qm guest exec %s -- %s", payload.TargetID, payload.Command) + // Only validate TargetID when it will be interpolated into the command + // (container and vm types). Host type doesn't use TargetID in the command. + needsTargetID := (payload.TargetType == "container" || payload.TargetType == "vm") && payload.TargetID != "" + + if needsTargetID { + // Validate TargetID to prevent shell injection - defense in depth + if !safeTargetIDPattern.MatchString(payload.TargetID) { + // Return a command that fails with non-zero exit and error message + return "sh -c 'echo \"Error: invalid target ID\" >&2; exit 1'" + } + + // Wrap command in sh -c so shell metacharacters (pipes, redirects, globs) + // are processed inside the container/VM, not on the Proxmox host. + // Without this, "pct exec 141 -- grep pattern /var/log/*.log" would + // expand the glob on the host (where /var/log/*.log doesn't exist). 
+ quotedCmd := shellQuote(payload.Command) + + if payload.TargetType == "container" { + return fmt.Sprintf("pct exec %s -- sh -c %s", payload.TargetID, quotedCmd) + } + if payload.TargetType == "vm" { + return fmt.Sprintf("qm guest exec %s -- sh -c %s", payload.TargetID, quotedCmd) + } } + return payload.Command } +// shellQuote safely quotes a string for use as a shell argument. +// Uses single quotes and escapes any embedded single quotes. +func shellQuote(s string) string { + escaped := strings.ReplaceAll(s, "'", "'\"'\"'") + return "'" + escaped + "'" +} + func (c *CommandClient) executeCommand(ctx context.Context, payload executeCommandPayload) commandResultPayload { result := commandResultPayload{ RequestID: payload.RequestID, @@ -418,6 +450,8 @@ func (c *CommandClient) executeCommand(ctx context.Context, payload executeComma cmd = execCommandContext(cmdCtx, "cmd", "/C", command) } else { cmd = execCommandContext(cmdCtx, "sh", "-c", command) + // Ensure PATH includes common binary locations for docker, kubectl, etc. 
+ cmd.Env = append(os.Environ(), "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:"+os.Getenv("PATH")) } var stdout, stderr bytes.Buffer diff --git a/internal/hostagent/commands_execute_test.go b/internal/hostagent/commands_execute_test.go index a71c2f067..7d0b0302e 100644 --- a/internal/hostagent/commands_execute_test.go +++ b/internal/hostagent/commands_execute_test.go @@ -23,22 +23,24 @@ func TestWrapCommand_TargetWrapping(t *testing.T) { want: "echo ok", }, { - name: "container wraps with pct", + name: "container wraps with pct and sh -c", payload: executeCommandPayload{ Command: "echo ok", TargetType: "container", TargetID: "101", }, - want: "pct exec 101 -- echo ok", + // Commands are wrapped in sh -c so shell metacharacters are processed inside the container + want: "pct exec 101 -- sh -c 'echo ok'", }, { - name: "vm wraps with qm guest exec", + name: "vm wraps with qm guest exec and sh -c", payload: executeCommandPayload{ Command: "echo ok", TargetType: "vm", TargetID: "900", }, - want: "qm guest exec 900 -- echo ok", + // Commands are wrapped in sh -c so shell metacharacters are processed inside the VM + want: "qm guest exec 900 -- sh -c 'echo ok'", }, { name: "missing target id does not wrap", diff --git a/internal/hostagent/commands_test.go b/internal/hostagent/commands_test.go index ac81c7b2a..8f8bd6484 100644 --- a/internal/hostagent/commands_test.go +++ b/internal/hostagent/commands_test.go @@ -6,6 +6,7 @@ import ( "net/http" "net/http/httptest" "os/exec" + "strings" "testing" "time" @@ -128,3 +129,110 @@ func TestCommandClient_Run(t *testing.T) { cancel() time.Sleep(100 * time.Millisecond) } + +func TestWrapCommand(t *testing.T) { + tests := []struct { + name string + payload executeCommandPayload + wantCmd string + checkFn func(string) bool + }{ + { + name: "HostCommandPassedThrough", + payload: executeCommandPayload{ + Command: "ls -la", + TargetType: "host", + TargetID: "", + }, + wantCmd: "ls -la", + }, + { + name: 
"LXCCommandWrappedInShC", + payload: executeCommandPayload{ + Command: "grep pattern /var/log/*.log", + TargetType: "container", + TargetID: "141", + }, + checkFn: func(cmd string) bool { + // Should be: pct exec 141 -- sh -c 'grep pattern /var/log/*.log' + return strings.HasPrefix(cmd, "pct exec 141 -- sh -c '") && + strings.Contains(cmd, "grep pattern /var/log/*.log") + }, + }, + { + name: "VMCommandWrappedInShC", + payload: executeCommandPayload{ + Command: "cat /etc/hostname", + TargetType: "vm", + TargetID: "100", + }, + checkFn: func(cmd string) bool { + // Should be: qm guest exec 100 -- sh -c 'cat /etc/hostname' + return strings.HasPrefix(cmd, "qm guest exec 100 -- sh -c '") && + strings.Contains(cmd, "cat /etc/hostname") + }, + }, + { + name: "LXCCommandWithSingleQuotes", + payload: executeCommandPayload{ + Command: "echo \"it's working\"", + TargetType: "container", + TargetID: "141", + }, + checkFn: func(cmd string) bool { + // Single quotes should be escaped: it's -> it'"'"'s + return strings.HasPrefix(cmd, "pct exec 141 -- sh -c '") && + strings.Contains(cmd, `it'"'"'s`) + }, + }, + { + name: "LXCCommandWithPipeline", + payload: executeCommandPayload{ + Command: "echo 'test' | base64 -d > /tmp/file", + TargetType: "container", + TargetID: "108", + }, + checkFn: func(cmd string) bool { + // Pipeline should be wrapped so it runs inside LXC + return strings.HasPrefix(cmd, "pct exec 108 -- sh -c '") && + strings.Contains(cmd, "| base64 -d > /tmp/file") + }, + }, + { + name: "InvalidTargetIDReturnsError", + payload: executeCommandPayload{ + Command: "ls", + TargetType: "container", + TargetID: "141; rm -rf /", // injection attempt + }, + checkFn: func(cmd string) bool { + return strings.Contains(cmd, "invalid target ID") + }, + }, + { + name: "EmptyTargetIDPassedThrough", + payload: executeCommandPayload{ + Command: "ls", + TargetType: "container", + TargetID: "", + }, + wantCmd: "ls", // No wrapping when TargetID is empty + }, + } + + for _, tt := range 
tests { + t.Run(tt.name, func(t *testing.T) { + got := wrapCommand(tt.payload) + if tt.wantCmd != "" { + if got != tt.wantCmd { + t.Errorf("wrapCommand() = %q, want %q", got, tt.wantCmd) + } + } + if tt.checkFn != nil { + if !tt.checkFn(got) { + t.Errorf("wrapCommand() = %q, check failed", got) + } + } + }) + } +} diff --git a/internal/infradiscovery/service.go b/internal/infradiscovery/service.go index 1caec4637..72699bcf5 100644 --- a/internal/infradiscovery/service.go +++ b/internal/infradiscovery/service.go @@ -1,4 +1,4 @@ -// Package infradiscovery provides AI-powered infrastructure discovery for detecting +// Package infradiscovery provides infrastructure discovery for detecting // applications and services running on monitored hosts. It uses LLM analysis to // identify services from Docker containers, enabling AI systems like Patrol to // understand where services run and propose correct remediation commands. @@ -70,8 +70,8 @@ type PortInfo struct { Protocol string `json:"protocol,omitempty"` } -// AIDiscoveryResult represents the AI's analysis of a container. -type AIDiscoveryResult struct { +// DiscoveryResult represents the AI's analysis of a container. +type DiscoveryResult struct { ServiceType string `json:"service_type"` // e.g., "postgres", "pbs", "nginx", "unknown" ServiceName string `json:"service_name"` // Human-readable name Category string `json:"category"` // backup, database, web, monitoring, etc. @@ -80,7 +80,7 @@ type AIDiscoveryResult struct { Reasoning string `json:"reasoning"` // Why the AI made this determination } -// Service manages AI-powered infrastructure discovery. +// Service manages infrastructure discovery. 
type Service struct { stateProvider StateProvider knowledgeStore *knowledge.Store @@ -94,7 +94,7 @@ type Service struct { // Cache to avoid re-analyzing the same containers // Key: image name, Value: analysis result - analysisCache map[string]*AIDiscoveryResult + analysisCache map[string]*DiscoveryResult cacheMu sync.RWMutex cacheExpiry time.Duration lastCacheUpdate time.Time @@ -114,7 +114,7 @@ func DefaultConfig() Config { } } -// NewService creates a new AI-powered infrastructure discovery service. +// NewService creates a new infrastructure discovery service. func NewService(stateProvider StateProvider, knowledgeStore *knowledge.Store, cfg Config) *Service { if cfg.Interval == 0 { cfg.Interval = 5 * time.Minute @@ -130,12 +130,12 @@ func NewService(stateProvider StateProvider, knowledgeStore *knowledge.Store, cf cacheExpiry: cfg.CacheExpiry, stopCh: make(chan struct{}), discoveries: make([]DiscoveredApp, 0), - analysisCache: make(map[string]*AIDiscoveryResult), + analysisCache: make(map[string]*DiscoveryResult), } } // SetAIAnalyzer sets the AI analyzer for discovery. -// This must be called before Start() for AI-powered discovery to work. +// This must be called before Start() for discovery to work. func (s *Service) SetAIAnalyzer(analyzer AIAnalyzer) { s.mu.Lock() defer s.mu.Unlock() @@ -154,7 +154,7 @@ func (s *Service) Start(ctx context.Context) { log.Info(). Dur("interval", s.interval). - Msg("Starting AI-powered infrastructure discovery service") + Msg("Starting infrastructure discovery service") // Run immediately on startup go func() { @@ -285,7 +285,7 @@ func (s *Service) analyzeContainer(ctx context.Context, analyzer AIAnalyzer, c m cacheValid := time.Since(s.lastCacheUpdate) < s.cacheExpiry s.cacheMu.RUnlock() - var result *AIDiscoveryResult + var result *DiscoveryResult if found && cacheValid { result = cached @@ -459,7 +459,7 @@ Respond with ONLY the JSON, no other text.`, string(infoJSON)) } // parseAIResponse parses the AI's JSON response. 
-func (s *Service) parseAIResponse(response string) *AIDiscoveryResult { +func (s *Service) parseAIResponse(response string) *DiscoveryResult { // Try to extract JSON from the response response = strings.TrimSpace(response) @@ -487,7 +487,7 @@ func (s *Service) parseAIResponse(response string) *AIDiscoveryResult { response = response[start : end+1] } - var result AIDiscoveryResult + var result DiscoveryResult if err := json.Unmarshal([]byte(response), &result); err != nil { log.Debug(). Err(err). @@ -581,7 +581,7 @@ func (s *Service) ForceRefresh(ctx context.Context) { func (s *Service) ClearCache() { s.cacheMu.Lock() defer s.cacheMu.Unlock() - s.analysisCache = make(map[string]*AIDiscoveryResult) + s.analysisCache = make(map[string]*DiscoveryResult) s.lastCacheUpdate = time.Time{} } diff --git a/internal/infradiscovery/service_test.go b/internal/infradiscovery/service_test.go index e56ed398e..0b320f99b 100644 --- a/internal/infradiscovery/service_test.go +++ b/internal/infradiscovery/service_test.go @@ -64,7 +64,7 @@ func TestParseAIResponse(t *testing.T) { tests := []struct { name string response string - want *AIDiscoveryResult + want *DiscoveryResult }{ { name: "valid JSON", @@ -76,7 +76,7 @@ func TestParseAIResponse(t *testing.T) { "confidence": 0.95, "reasoning": "Image name contains postgres" }`, - want: &AIDiscoveryResult{ + want: &DiscoveryResult{ ServiceType: "postgres", ServiceName: "PostgreSQL", Category: "database", @@ -88,7 +88,7 @@ func TestParseAIResponse(t *testing.T) { { name: "JSON in markdown code block", response: "```json\n{\"service_type\": \"redis\", \"service_name\": \"Redis\", \"category\": \"cache\", \"cli_command\": \"docker exec {container} redis-cli\", \"confidence\": 0.9, \"reasoning\": \"Redis image\"}\n```", - want: &AIDiscoveryResult{ + want: &DiscoveryResult{ ServiceType: "redis", ServiceName: "Redis", Category: "cache", @@ -107,7 +107,7 @@ func TestParseAIResponse(t *testing.T) { response: `Here's my analysis: {"service_type": 
"nginx", "service_name": "Nginx", "category": "web", "cli_command": "", "confidence": 0.85, "reasoning": "Web server"} That's my answer.`, - want: &AIDiscoveryResult{ + want: &DiscoveryResult{ ServiceType: "nginx", ServiceName: "Nginx", Category: "web", diff --git a/internal/models/models.go b/internal/models/models.go index 436ace46b..23f6a0d38 100644 --- a/internal/models/models.go +++ b/internal/models/models.go @@ -696,6 +696,7 @@ type Storage struct { NodeCount int `json:"nodeCount,omitempty"` Type string `json:"type"` Status string `json:"status"` + Path string `json:"path,omitempty"` Total int64 `json:"total"` Used int64 `json:"used"` Free int64 `json:"free"` diff --git a/internal/models/state_snapshot.go b/internal/models/state_snapshot.go index c777c5084..e274003e5 100644 --- a/internal/models/state_snapshot.go +++ b/internal/models/state_snapshot.go @@ -87,6 +87,222 @@ func (s *State) GetSnapshot() StateSnapshot { return snapshot } +// ResourceLocation describes where a resource lives in the infrastructure hierarchy. +// This is the authoritative source of truth for routing commands to resources. 
+type ResourceLocation struct { + // What was found + Found bool // True if the resource was found + Name string // The resource name + ResourceType string // "node", "vm", "lxc", "dockerhost", "docker", "host", "k8s_cluster", "k8s_pod", "k8s_deployment" + + // For VMs and LXCs (Proxmox) + VMID int // VMID if this is a VM or LXC + Node string // Proxmox node name + + // For Docker/Podman containers + DockerHostName string // Name of the Docker host (LXC/VM/standalone) + DockerHostType string // "lxc", "vm", or "standalone" + DockerHostVMID int // VMID if Docker host is an LXC/VM + + // For Kubernetes resources + K8sClusterName string // Kubernetes cluster name + K8sNamespace string // Kubernetes namespace + K8sAgentID string // Agent ID for routing kubectl commands + + // For generic hosts (Windows/Linux via Pulse Unified Agent) + HostID string // Host ID + Platform string // "linux", "windows", etc. + + // The key output: where to route commands + TargetHost string // The target_host to use for pulse_control/pulse_file_edit + AgentID string // Direct agent ID if known (for K8s, standalone hosts) +} + +// ResolveResource looks up a resource by name and returns its location in the hierarchy. +// This is the single source of truth for determining where any resource lives. 
+func (s StateSnapshot) ResolveResource(name string) ResourceLocation { + // Check Proxmox nodes first + for _, node := range s.Nodes { + if node.Name == name { + return ResourceLocation{ + Found: true, + Name: name, + ResourceType: "node", + Node: node.Name, + TargetHost: node.Name, + } + } + } + + // Check VMs + for _, vm := range s.VMs { + if vm.Name == name { + return ResourceLocation{ + Found: true, + Name: name, + ResourceType: "vm", + VMID: vm.VMID, + Node: vm.Node, + TargetHost: vm.Name, // Route to VM by name + } + } + } + + // Check LXC containers + for _, lxc := range s.Containers { + if lxc.Name == name { + return ResourceLocation{ + Found: true, + Name: name, + ResourceType: "lxc", + VMID: lxc.VMID, + Node: lxc.Node, + TargetHost: lxc.Name, // Route to LXC by name + } + } + } + + // Check Docker hosts (LXCs/VMs/standalone hosts running Docker) + for _, dh := range s.DockerHosts { + if dh.Hostname == name || dh.ID == name { + loc := ResourceLocation{ + Found: true, + Name: dh.Hostname, + ResourceType: "dockerhost", + DockerHostName: dh.Hostname, + TargetHost: dh.Hostname, + } + // Check if this Docker host is an LXC + for _, lxc := range s.Containers { + if lxc.Name == dh.Hostname || lxc.Name == dh.ID { + loc.DockerHostType = "lxc" + loc.DockerHostVMID = lxc.VMID + loc.Node = lxc.Node + break + } + } + // Check if this Docker host is a VM + if loc.DockerHostType == "" { + for _, vm := range s.VMs { + if vm.Name == dh.Hostname || vm.Name == dh.ID { + loc.DockerHostType = "vm" + loc.DockerHostVMID = vm.VMID + loc.Node = vm.Node + break + } + } + } + if loc.DockerHostType == "" { + loc.DockerHostType = "standalone" + } + return loc + } + } + + // Check Docker containers - this is the critical path for "homepage" -> "homepage-docker" + for _, dh := range s.DockerHosts { + for _, container := range dh.Containers { + if container.Name == name { + loc := ResourceLocation{ + Found: true, + Name: name, + ResourceType: "docker", + DockerHostName: dh.Hostname, + 
TargetHost: dh.Hostname, // Route to the Docker host, not the container + } + // Resolve the Docker host's parent (LXC/VM/standalone) + for _, lxc := range s.Containers { + if lxc.Name == dh.Hostname || lxc.Name == dh.ID { + loc.DockerHostType = "lxc" + loc.DockerHostVMID = lxc.VMID + loc.Node = lxc.Node + loc.TargetHost = lxc.Name // Route to the LXC + break + } + } + if loc.DockerHostType == "" { + for _, vm := range s.VMs { + if vm.Name == dh.Hostname || vm.Name == dh.ID { + loc.DockerHostType = "vm" + loc.DockerHostVMID = vm.VMID + loc.Node = vm.Node + loc.TargetHost = vm.Name // Route to the VM + break + } + } + } + if loc.DockerHostType == "" { + loc.DockerHostType = "standalone" + } + return loc + } + } + } + + // Check generic Hosts (Windows/Linux via Pulse Unified Agent) + for _, host := range s.Hosts { + if host.Hostname == name || host.ID == name { + return ResourceLocation{ + Found: true, + Name: host.Hostname, + ResourceType: "host", + HostID: host.ID, + Platform: host.Platform, + TargetHost: host.Hostname, + } + } + } + + // Check Kubernetes clusters, pods, and deployments + for _, cluster := range s.KubernetesClusters { + if cluster.Name == name || cluster.ID == name || cluster.DisplayName == name { + return ResourceLocation{ + Found: true, + Name: cluster.Name, + ResourceType: "k8s_cluster", + K8sClusterName: cluster.Name, + K8sAgentID: cluster.AgentID, + TargetHost: cluster.Name, + AgentID: cluster.AgentID, + } + } + + // Check pods within this cluster + for _, pod := range cluster.Pods { + if pod.Name == name { + return ResourceLocation{ + Found: true, + Name: pod.Name, + ResourceType: "k8s_pod", + K8sClusterName: cluster.Name, + K8sNamespace: pod.Namespace, + K8sAgentID: cluster.AgentID, + TargetHost: cluster.Name, + AgentID: cluster.AgentID, + } + } + } + + // Check deployments within this cluster + for _, deploy := range cluster.Deployments { + if deploy.Name == name { + return ResourceLocation{ + Found: true, + Name: deploy.Name, + 
ResourceType: "k8s_deployment", + K8sClusterName: cluster.Name, + K8sNamespace: deploy.Namespace, + K8sAgentID: cluster.AgentID, + TargetHost: cluster.Name, + AgentID: cluster.AgentID, + } + } + } + } + + return ResourceLocation{Found: false, Name: name} +} + // ToFrontend converts a StateSnapshot to frontend format with proper tag handling func (s StateSnapshot) ToFrontend() StateFrontend { // Convert nodes diff --git a/internal/monitoring/guest_config.go b/internal/monitoring/guest_config.go new file mode 100644 index 000000000..9c3b81550 --- /dev/null +++ b/internal/monitoring/guest_config.go @@ -0,0 +1,89 @@ +package monitoring + +import ( + "context" + "fmt" + "strings" +) + +// GetGuestConfig fetches Proxmox guest configuration for a VM or LXC container. +// If instance or node are empty, it attempts to resolve them from the current state. +func (m *Monitor) GetGuestConfig(ctx context.Context, guestType, instance, node string, vmid int) (map[string]interface{}, error) { + if m == nil { + return nil, fmt.Errorf("monitor not available") + } + if vmid <= 0 { + return nil, fmt.Errorf("invalid vmid") + } + + gt := strings.ToLower(strings.TrimSpace(guestType)) + if gt == "" { + return nil, fmt.Errorf("guest type is required") + } + + // Resolve instance/node from state if missing. 
+ if instance == "" || node == "" { + m.mu.RLock() + state := m.state + m.mu.RUnlock() + if state == nil { + return nil, fmt.Errorf("state not available") + } + + switch gt { + case "container", "lxc": + for _, ct := range state.Containers { + if ct.VMID == vmid { + if instance == "" { + instance = ct.Instance + } + if node == "" { + node = ct.Node + } + break + } + } + case "vm": + for _, vm := range state.VMs { + if vm.VMID == vmid { + if instance == "" { + instance = vm.Instance + } + if node == "" { + node = vm.Node + } + break + } + } + default: + return nil, fmt.Errorf("unsupported guest type: %s", guestType) + } + } + + if instance == "" || node == "" { + return nil, fmt.Errorf("unable to resolve instance or node for guest") + } + + m.mu.RLock() + client := m.pveClients[instance] + m.mu.RUnlock() + if client == nil { + return nil, fmt.Errorf("no PVE client for instance %s", instance) + } + + switch gt { + case "container", "lxc": + return client.GetContainerConfig(ctx, node, vmid) + case "vm": + type vmConfigClient interface { + GetVMConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) + } + vmClient, ok := client.(vmConfigClient) + if !ok { + return nil, fmt.Errorf("VM config not supported by client") + } + return vmClient.GetVMConfig(ctx, node, vmid) + default: + return nil, fmt.Errorf("unsupported guest type: %s", guestType) + } +} diff --git a/internal/monitoring/monitor_polling.go b/internal/monitoring/monitor_polling.go index 94ef6ac64..ff672873c 100644 --- a/internal/monitoring/monitor_polling.go +++ b/internal/monitoring/monitor_polling.go @@ -1410,6 +1410,7 @@ func (m *Monitor) pollStorageWithNodes(ctx context.Context, instanceName string, Instance: storageInstanceName, Type: storage.Type, Status: "available", + Path: storage.Path, Total: int64(storage.Total), Used: int64(storage.Used), Free: int64(storage.Available), @@ -1420,6 +1421,15 @@ func (m *Monitor) pollStorageWithNodes(ctx context.Context, instanceName string, 
Active: storage.Active == 1, } + if hasClusterConfig { + if nodes := parseClusterStorageNodes(clusterConfig.Nodes); len(nodes) > 0 { + modelStorage.Nodes = nodes + } + if modelStorage.Path == "" && clusterConfig.Path != "" { + modelStorage.Path = clusterConfig.Path + } + } + // If this is ZFS storage, attach pool status information if storage.Type == "zfspool" || storage.Type == "zfs" || storage.Type == "local-zfs" { // Try to match by storage name or by common ZFS pool names @@ -2478,3 +2488,35 @@ func (m *Monitor) pollPVENode( return modelNode, effectiveStatus, nil } + +func parseClusterStorageNodes(raw string) []string { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil + } + + parts := strings.FieldsFunc(raw, func(r rune) bool { + return r == ',' || r == ';' || r == ' ' || r == '\t' || r == '\n' + }) + if len(parts) == 0 { + return nil + } + + seen := make(map[string]struct{}, len(parts)) + result := make([]string, 0, len(parts)) + for _, part := range parts { + part = strings.TrimSpace(part) + if part == "" { + continue + } + if _, exists := seen[part]; exists { + continue + } + seen[part] = struct{}{} + result = append(result, part) + } + if len(result) == 0 { + return nil + } + return result +} diff --git a/internal/monitoring/storage_config.go b/internal/monitoring/storage_config.go new file mode 100644 index 000000000..a73095a63 --- /dev/null +++ b/internal/monitoring/storage_config.go @@ -0,0 +1,89 @@ +package monitoring + +import ( + "context" + "fmt" + "strings" + + "github.com/rcourtman/pulse-go-rewrite/pkg/proxmox" + "github.com/rs/zerolog/log" +) + +// GetStorageConfig fetches Proxmox storage configuration across instances. +// If instance is empty, returns configs for all instances. 
+func (m *Monitor) GetStorageConfig(ctx context.Context, instance string) (map[string][]proxmox.Storage, error) { + if m == nil { + return nil, fmt.Errorf("monitor not available") + } + if ctx == nil { + ctx = context.Background() + } + + filter := strings.TrimSpace(instance) + + m.mu.RLock() + clients := make(map[string]PVEClientInterface, len(m.pveClients)) + for name, client := range m.pveClients { + clients[name] = client + } + m.mu.RUnlock() + + if len(clients) == 0 { + return nil, fmt.Errorf("no PVE clients available") + } + + results := make(map[string][]proxmox.Storage) + var firstErr error + + for name, client := range clients { + if client == nil { + continue + } + if filter != "" && !m.matchesInstanceFilter(name, filter) { + continue + } + + storageInstance := name + if cfg := m.getInstanceConfig(name); cfg != nil && cfg.IsCluster && cfg.ClusterName != "" { + storageInstance = cfg.ClusterName + } + + storages, err := client.GetAllStorage(ctx) + if err != nil { + if filter != "" { + return nil, err + } + if firstErr == nil { + firstErr = err + } + log.Warn(). + Err(err). + Str("instance", name). + Msg("Failed to fetch storage config for instance") + continue + } + + results[storageInstance] = append(results[storageInstance], storages...) 
+ } + + if len(results) == 0 && firstErr != nil { + return nil, firstErr + } + + if filter != "" && len(results) == 0 { + return nil, fmt.Errorf("no PVE instance matches %s", filter) + } + + return results, nil +} + +func (m *Monitor) matchesInstanceFilter(instanceName, filter string) bool { + if strings.EqualFold(instanceName, filter) { + return true + } + cfg := m.getInstanceConfig(instanceName) + if cfg != nil && cfg.IsCluster && cfg.ClusterName != "" && strings.EqualFold(cfg.ClusterName, filter) { + return true + } + return false +} diff --git a/internal/servicediscovery/commands.go b/internal/servicediscovery/commands.go new file mode 100644 index 000000000..91cc84041 --- /dev/null +++ b/internal/servicediscovery/commands.go @@ -0,0 +1,526 @@ +package servicediscovery + +import ( + "fmt" + "regexp" + "strings" +) + +// safeResourceIDPattern matches valid resource IDs: alphanumeric, dash, underscore, period, colon +// This prevents shell injection via malicious resource names. +var safeResourceIDPattern = regexp.MustCompile(`^[a-zA-Z0-9._:-]+$`) + +// ValidateResourceID checks if a resource ID is safe to use in shell commands. +// Returns an error if the ID contains potentially dangerous characters. +func ValidateResourceID(id string) error { + if id == "" { + return fmt.Errorf("resource ID cannot be empty") + } + if len(id) > 256 { + return fmt.Errorf("resource ID too long (max 256 chars)") + } + if !safeResourceIDPattern.MatchString(id) { + return fmt.Errorf("resource ID contains invalid characters: only alphanumeric, dash, underscore, period, and colon allowed") + } + return nil +} + +// shellQuote safely quotes a string for use as a shell argument. +// Uses single quotes and escapes any embedded single quotes. 
+func shellQuote(s string) string {
+	// Replace single quotes with '"'"' (close quote, double-quoted quote, reopen quote)
+	escaped := strings.ReplaceAll(s, "'", "'\"'\"'")
+	return "'" + escaped + "'"
+}
+
+// DiscoveryCommand represents a command to run during discovery.
+type DiscoveryCommand struct {
+	Name        string   // Human-readable name
+	Command     string   // The command template
+	Description string   // What this discovers
+	Categories  []string // What categories of info this provides
+	Timeout     int      // Timeout in seconds (0 = default)
+	Optional    bool     // If true, don't fail if command fails
+}
+
+// CommandSet represents a set of commands for a resource type.
+type CommandSet struct {
+	ResourceType ResourceType
+	Commands     []DiscoveryCommand
+}
+
+// GetCommandsForResource returns the commands to run for a given resource type.
+func GetCommandsForResource(resourceType ResourceType) []DiscoveryCommand {
+	switch resourceType {
+	case ResourceTypeLXC:
+		return getLXCCommands()
+	case ResourceTypeVM:
+		return getVMCommands()
+	case ResourceTypeDocker:
+		return getDockerCommands()
+	case ResourceTypeDockerVM, ResourceTypeDockerLXC:
+		return getNestedDockerCommands()
+	case ResourceTypeK8s:
+		return getK8sCommands()
+	case ResourceTypeHost:
+		return getHostCommands()
+	default:
+		return []DiscoveryCommand{}
+	}
+}
+
+// getLXCCommands returns commands for discovering LXC containers.
+func getLXCCommands() []DiscoveryCommand { + return []DiscoveryCommand{ + { + Name: "os_release", + Command: "cat /etc/os-release", + Description: "Operating system identification", + Categories: []string{"version", "config"}, + Optional: true, + }, + { + Name: "hostname", + Command: "hostname", + Description: "Container hostname", + Categories: []string{"config"}, + Optional: true, + }, + { + Name: "running_services", + Command: "systemctl list-units --type=service --state=running --no-pager 2>/dev/null | head -30 || service --status-all 2>/dev/null | grep '+' | head -30", + Description: "Running services and daemons", + Categories: []string{"service"}, + Optional: true, + }, + { + Name: "listening_ports", + Command: "ss -tlnp 2>/dev/null | head -25 || netstat -tlnp 2>/dev/null | head -25", + Description: "Network ports listening", + Categories: []string{"port", "network"}, + Optional: true, + }, + { + Name: "top_processes", + Command: "ps aux --sort=-rss 2>/dev/null | head -15 || ps aux | head -15", + Description: "Top processes by memory", + Categories: []string{"service"}, + Optional: true, + }, + { + Name: "disk_usage", + Command: "df -h 2>/dev/null | head -15", + Description: "Disk usage and mount points", + Categories: []string{"storage"}, + Optional: true, + }, + { + Name: "docker_check", + Command: "docker ps --format '{{.Names}}: {{.Image}} ({{.Status}})' 2>/dev/null | head -20 || echo 'no_docker'", + Description: "Docker containers if running", + Categories: []string{"service", "container"}, + Optional: true, + }, + { + Name: "docker_mounts", + Command: `sh -c 'docker ps -q 2>/dev/null | head -15 | while read id; do name=$(docker inspect --format "{{.Name}}" "$id" 2>/dev/null | sed "s|^/||"); echo "CONTAINER:$name"; docker inspect --format "{{range .Mounts}}{{.Source}}|{{.Destination}}|{{.Type}}{{println}}{{end}}" "$id" 2>/dev/null | grep -v "^$" || true; done; echo docker_mounts_done'`, + Description: "Docker container bind mounts (source -> 
destination)", + Categories: []string{"config", "storage"}, + Optional: true, + }, + { + Name: "installed_packages", + Command: "dpkg -l 2>/dev/null | grep -E '^ii' | awk '{print $2}' | head -50 || rpm -qa 2>/dev/null | head -50 || apk list --installed 2>/dev/null | head -50", + Description: "Installed packages", + Categories: []string{"version", "service"}, + Optional: true, + }, + { + Name: "config_files", + Command: "find /etc -name '*.conf' -o -name '*.yml' -o -name '*.yaml' -o -name '*.json' 2>/dev/null | head -30", + Description: "Configuration files", + Categories: []string{"config"}, + Optional: true, + }, + { + Name: "cron_jobs", + Command: "crontab -l 2>/dev/null | grep -v '^#' | head -10 || ls -la /etc/cron.d/ 2>/dev/null | head -10", + Description: "Scheduled jobs", + Categories: []string{"service"}, + Optional: true, + }, + { + Name: "hardware_info", + Command: "lspci 2>/dev/null | head -20 || echo 'no_lspci'", + Description: "Hardware devices (e.g., Coral TPU)", + Categories: []string{"hardware"}, + Optional: true, + }, + { + Name: "gpu_devices", + Command: "ls -la /dev/dri/ 2>/dev/null; ls -la /dev/apex* 2>/dev/null; nvidia-smi -L 2>/dev/null || echo 'no_gpu'", + Description: "GPU and TPU devices", + Categories: []string{"hardware"}, + Optional: true, + }, + } +} + +// getVMCommands returns commands for discovering VMs (via QEMU guest agent). 
+func getVMCommands() []DiscoveryCommand { + return []DiscoveryCommand{ + { + Name: "os_release", + Command: "cat /etc/os-release", + Description: "Operating system identification", + Categories: []string{"version", "config"}, + Optional: true, + }, + { + Name: "hostname", + Command: "hostname", + Description: "VM hostname", + Categories: []string{"config"}, + Optional: true, + }, + { + Name: "running_services", + Command: "systemctl list-units --type=service --state=running --no-pager 2>/dev/null | head -30", + Description: "Running services and daemons", + Categories: []string{"service"}, + Optional: true, + }, + { + Name: "listening_ports", + Command: "ss -tlnp 2>/dev/null | head -25 || netstat -tlnp 2>/dev/null | head -25", + Description: "Network ports listening", + Categories: []string{"port", "network"}, + Optional: true, + }, + { + Name: "top_processes", + Command: "ps aux --sort=-rss 2>/dev/null | head -15", + Description: "Top processes by memory", + Categories: []string{"service"}, + Optional: true, + }, + { + Name: "disk_usage", + Command: "df -h 2>/dev/null | head -15", + Description: "Disk usage and mount points", + Categories: []string{"storage"}, + Optional: true, + }, + { + Name: "docker_check", + Command: "docker ps --format '{{.Names}}: {{.Image}} ({{.Status}})' 2>/dev/null | head -20 || echo 'no_docker'", + Description: "Docker containers if running", + Categories: []string{"service", "container"}, + Optional: true, + }, + { + Name: "docker_mounts", + Command: `sh -c 'docker ps -q 2>/dev/null | head -15 | while read id; do name=$(docker inspect --format "{{.Name}}" "$id" 2>/dev/null | sed "s|^/||"); echo "CONTAINER:$name"; docker inspect --format "{{range .Mounts}}{{.Source}}|{{.Destination}}|{{.Type}}{{println}}{{end}}" "$id" 2>/dev/null | grep -v "^$" || true; done; echo docker_mounts_done'`, + Description: "Docker container bind mounts (source -> destination)", + Categories: []string{"config", "storage"}, + Optional: true, + }, + { + Name: 
"hardware_info", + Command: "lspci 2>/dev/null | head -20", + Description: "PCI hardware devices", + Categories: []string{"hardware"}, + Optional: true, + }, + { + Name: "gpu_devices", + Command: "ls -la /dev/dri/ 2>/dev/null; nvidia-smi -L 2>/dev/null || echo 'no_gpu'", + Description: "GPU devices", + Categories: []string{"hardware"}, + Optional: true, + }, + } +} + +// getDockerCommands returns commands for discovering Docker containers. +// These are run inside the container via docker exec. +func getDockerCommands() []DiscoveryCommand { + return []DiscoveryCommand{ + { + Name: "os_release", + Command: "cat /etc/os-release 2>/dev/null || cat /etc/alpine-release 2>/dev/null || echo 'unknown'", + Description: "Container OS", + Categories: []string{"version"}, + Optional: true, + }, + { + Name: "processes", + Command: "ps aux 2>/dev/null || echo 'no_ps'", + Description: "Running processes", + Categories: []string{"service"}, + Optional: true, + }, + { + Name: "listening_ports", + Command: "ss -tlnp 2>/dev/null || netstat -tlnp 2>/dev/null || echo 'no_ss'", + Description: "Listening ports inside container", + Categories: []string{"port"}, + Optional: true, + }, + { + Name: "env_vars", + Command: "env 2>/dev/null | grep -vE '(PASSWORD|SECRET|KEY|TOKEN|CREDENTIAL)' | head -30", + Description: "Environment variables (filtered)", + Categories: []string{"config"}, + Optional: true, + }, + { + Name: "config_files", + Command: "find /config /data /app /etc -maxdepth 2 -name '*.conf' -o -name '*.yml' -o -name '*.yaml' -o -name '*.json' 2>/dev/null | head -20", + Description: "Configuration files", + Categories: []string{"config"}, + Optional: true, + }, + } +} + +// getNestedDockerCommands returns commands for Docker inside VMs or LXCs. 
+func getNestedDockerCommands() []DiscoveryCommand { + return []DiscoveryCommand{ + { + Name: "docker_containers", + Command: "docker ps -a --format '{{.Names}}|{{.Image}}|{{.Status}}|{{.Ports}}'", + Description: "All Docker containers", + Categories: []string{"container", "service"}, + Optional: false, + }, + { + Name: "docker_images", + Command: "docker images --format '{{.Repository}}:{{.Tag}}' | head -20", + Description: "Docker images", + Categories: []string{"version"}, + Optional: true, + }, + { + Name: "docker_compose", + Command: "find /opt /home /root -name 'docker-compose*.yml' -o -name 'compose*.yml' 2>/dev/null | head -10", + Description: "Docker compose files", + Categories: []string{"config"}, + Optional: true, + }, + } +} + +// getK8sCommands returns commands for discovering Kubernetes pods. +func getK8sCommands() []DiscoveryCommand { + return []DiscoveryCommand{ + { + Name: "processes", + Command: "ps aux 2>/dev/null || echo 'no_ps'", + Description: "Running processes in pod", + Categories: []string{"service"}, + Optional: true, + }, + { + Name: "listening_ports", + Command: "ss -tlnp 2>/dev/null || netstat -tlnp 2>/dev/null || echo 'no_ss'", + Description: "Listening ports", + Categories: []string{"port"}, + Optional: true, + }, + { + Name: "env_vars", + Command: "env 2>/dev/null | grep -vE '(PASSWORD|SECRET|KEY|TOKEN|CREDENTIAL)' | head -30", + Description: "Environment variables (filtered)", + Categories: []string{"config"}, + Optional: true, + }, + } +} + +// getHostCommands returns commands for discovering host systems. 
+func getHostCommands() []DiscoveryCommand { + return []DiscoveryCommand{ + { + Name: "os_release", + Command: "cat /etc/os-release", + Description: "Operating system", + Categories: []string{"version", "config"}, + Optional: true, + }, + { + Name: "hostname", + Command: "hostname -f 2>/dev/null || hostname", + Description: "Full hostname", + Categories: []string{"config"}, + Optional: true, + }, + { + Name: "running_services", + Command: "systemctl list-units --type=service --state=running --no-pager 2>/dev/null | head -40", + Description: "Running services", + Categories: []string{"service"}, + Optional: true, + }, + { + Name: "listening_ports", + Command: "ss -tlnp 2>/dev/null | head -30", + Description: "Listening network ports", + Categories: []string{"port", "network"}, + Optional: true, + }, + { + Name: "docker_containers", + Command: "docker ps --format '{{.Names}}: {{.Image}} ({{.Status}})' 2>/dev/null | head -30 || echo 'no_docker'", + Description: "Docker containers on host", + Categories: []string{"container", "service"}, + Optional: true, + }, + { + Name: "proxmox_version", + Command: "pveversion 2>/dev/null || echo 'not_proxmox'", + Description: "Proxmox version if applicable", + Categories: []string{"version"}, + Optional: true, + }, + { + Name: "zfs_pools", + Command: "zpool list 2>/dev/null | head -10 || echo 'no_zfs'", + Description: "ZFS pools", + Categories: []string{"storage"}, + Optional: true, + }, + { + Name: "disk_usage", + Command: "df -h | head -20", + Description: "Disk usage", + Categories: []string{"storage"}, + Optional: true, + }, + { + Name: "hardware_info", + Command: "lscpu | head -20", + Description: "CPU information", + Categories: []string{"hardware"}, + Optional: true, + }, + { + Name: "memory_info", + Command: "free -h", + Description: "Memory information", + Categories: []string{"hardware"}, + Optional: true, + }, + } +} + +// BuildLXCCommand wraps a command for execution in an LXC container. 
+// The vmid is validated to prevent command injection. +func BuildLXCCommand(vmid string, cmd string) string { + if err := ValidateResourceID(vmid); err != nil { + // Don't include the invalid ID in output to prevent any injection + return "sh -c 'echo \"Discovery error: invalid LXC container ID\" >&2; exit 1'" + } + return fmt.Sprintf("pct exec %s -- sh -c %s", vmid, shellQuote(cmd)) +} + +// BuildVMCommand wraps a command for execution in a VM via QEMU guest agent. +// Note: This requires the guest agent to be running. +// The vmid is validated to prevent command injection. +func BuildVMCommand(vmid string, cmd string) string { + if err := ValidateResourceID(vmid); err != nil { + return "sh -c 'echo \"Discovery error: invalid VM ID\" >&2; exit 1'" + } + // For VMs, we use qm guest exec which requires the guest agent + return fmt.Sprintf("qm guest exec %s -- sh -c %s", vmid, shellQuote(cmd)) +} + +// BuildDockerCommand wraps a command for execution in a Docker container. +// The containerName is validated to prevent command injection. +// Note: Leading slashes are trimmed as Docker API often returns names with leading /. +func BuildDockerCommand(containerName string, cmd string) string { + // Docker API returns container names with leading slash, trim it + containerName = strings.TrimPrefix(containerName, "/") + if err := ValidateResourceID(containerName); err != nil { + return "sh -c 'echo \"Discovery error: invalid container name\" >&2; exit 1'" + } + return fmt.Sprintf("docker exec %s sh -c %s", shellQuote(containerName), shellQuote(cmd)) +} + +// BuildNestedDockerCommand builds a command to run inside Docker on a VM/LXC. +// All resource identifiers are validated to prevent command injection. 
+func BuildNestedDockerCommand(vmid string, isLXC bool, containerName string, cmd string) string { + if err := ValidateResourceID(vmid); err != nil { + return "sh -c 'echo \"Discovery error: invalid VM/LXC ID\" >&2; exit 1'" + } + // Docker API returns container names with leading slash, trim it + containerName = strings.TrimPrefix(containerName, "/") + if err := ValidateResourceID(containerName); err != nil { + return "sh -c 'echo \"Discovery error: invalid container name\" >&2; exit 1'" + } + dockerCmd := BuildDockerCommand(containerName, cmd) + if isLXC { + return BuildLXCCommand(vmid, dockerCmd) + } + return BuildVMCommand(vmid, dockerCmd) +} + +// BuildK8sCommand builds a command to run in a Kubernetes pod. +// All identifiers are validated to prevent command injection. +func BuildK8sCommand(namespace, podName, containerName, cmd string) string { + if err := ValidateResourceID(namespace); err != nil { + return "sh -c 'echo \"Discovery error: invalid namespace\" >&2; exit 1'" + } + if err := ValidateResourceID(podName); err != nil { + return "sh -c 'echo \"Discovery error: invalid pod name\" >&2; exit 1'" + } + if containerName != "" { + if err := ValidateResourceID(containerName); err != nil { + return "sh -c 'echo \"Discovery error: invalid container name\" >&2; exit 1'" + } + return fmt.Sprintf("kubectl exec -n %s %s -c %s -- sh -c %s", shellQuote(namespace), shellQuote(podName), shellQuote(containerName), shellQuote(cmd)) + } + return fmt.Sprintf("kubectl exec -n %s %s -- sh -c %s", shellQuote(namespace), shellQuote(podName), shellQuote(cmd)) +} + +// GetCLIAccessTemplate returns a CLI access template for a resource type. +// These are instructions for using pulse_control, NOT literal shell commands. +// Commands via pulse_control run directly on the target where the agent is installed. 
+func GetCLIAccessTemplate(resourceType ResourceType) string { + switch resourceType { + case ResourceTypeLXC: + // Agent runs ON the LXC - commands execute directly inside the container + return "Use pulse_control with target_host matching this LXC's hostname. Commands run directly inside the container." + case ResourceTypeVM: + // Agent runs ON the VM - commands execute directly inside the VM + return "Use pulse_control with target_host matching this VM's hostname. Commands run directly inside the VM." + case ResourceTypeDocker: + // Docker container on a host - need docker exec from the host + return "Use pulse_control targeting the Docker host with command: docker exec {container} " + case ResourceTypeDockerLXC: + // Docker inside an LXC - agent on the LXC runs docker exec + return "Use pulse_control targeting the LXC hostname with command: docker exec {container} " + case ResourceTypeDockerVM: + // Docker inside a VM - agent on the VM runs docker exec + return "Use pulse_control targeting the VM hostname with command: docker exec {container} " + case ResourceTypeK8s: + return "Use kubectl exec -n {namespace} {pod} -- " + case ResourceTypeHost: + return "Use pulse_control with target_host matching this host. Commands run directly." + default: + return "Use pulse_control with target_host matching the resource hostname." + } +} + +// FormatCLIAccess formats a CLI access string with actual values. 
+func FormatCLIAccess(resourceType ResourceType, vmid, containerName, namespace, podName string) string { + template := GetCLIAccessTemplate(resourceType) + result := template + + result = strings.ReplaceAll(result, "{vmid}", vmid) + result = strings.ReplaceAll(result, "{container}", containerName) + result = strings.ReplaceAll(result, "{namespace}", namespace) + result = strings.ReplaceAll(result, "{pod}", podName) + + return result +} diff --git a/internal/servicediscovery/commands_test.go b/internal/servicediscovery/commands_test.go new file mode 100644 index 000000000..5fd9780e1 --- /dev/null +++ b/internal/servicediscovery/commands_test.go @@ -0,0 +1,81 @@ +package servicediscovery + +import ( + "strings" + "testing" +) + +func TestCommandsAndTemplates(t *testing.T) { + resourceTypes := []ResourceType{ + ResourceTypeLXC, + ResourceTypeVM, + ResourceTypeDocker, + ResourceTypeDockerVM, + ResourceTypeDockerLXC, + ResourceTypeK8s, + ResourceTypeHost, + } + + for _, rt := range resourceTypes { + cmds := GetCommandsForResource(rt) + if len(cmds) == 0 { + t.Fatalf("expected commands for %s", rt) + } + } + + if len(GetCommandsForResource(ResourceType("unknown"))) != 0 { + t.Fatalf("expected no commands for unknown resource type") + } + + if !strings.Contains(BuildLXCCommand("101", "echo hi"), "pct exec 101") { + t.Fatalf("unexpected LXC command") + } + if !strings.Contains(BuildVMCommand("101", "echo hi"), "qm guest exec 101") { + t.Fatalf("unexpected VM command") + } + // Docker commands now quote container names for safety + dockerCmd := BuildDockerCommand("web", "echo hi") + if !strings.Contains(dockerCmd, "docker exec") || !strings.Contains(dockerCmd, "web") { + t.Fatalf("unexpected docker command: %s", dockerCmd) + } + + nestedLXC := BuildNestedDockerCommand("201", true, "web", "echo hi") + if !strings.Contains(nestedLXC, "pct exec 201") || !strings.Contains(nestedLXC, "docker exec") || !strings.Contains(nestedLXC, "web") { + t.Fatalf("unexpected nested LXC 
command: %s", nestedLXC) + } + + nestedVM := BuildNestedDockerCommand("301", false, "web", "echo hi") + if !strings.Contains(nestedVM, "qm guest exec 301") || !strings.Contains(nestedVM, "docker exec") || !strings.Contains(nestedVM, "web") { + t.Fatalf("unexpected nested VM command: %s", nestedVM) + } + + // K8s commands now quote arguments for safety + withContainer := BuildK8sCommand("default", "pod", "app", "echo hi") + if !strings.Contains(withContainer, "-c") || !strings.Contains(withContainer, "app") || !strings.Contains(withContainer, "kubectl exec") { + t.Fatalf("unexpected k8s command: %s", withContainer) + } + + withoutContainer := BuildK8sCommand("default", "pod", "", "echo hi") + if strings.Contains(withoutContainer, "-c") && strings.Contains(withoutContainer, "app") { + t.Fatalf("unexpected container selector: %s", withoutContainer) + } + + template := GetCLIAccessTemplate(ResourceTypeK8s) + if !strings.Contains(template, "{namespace}") || !strings.Contains(template, "{pod}") { + t.Fatalf("unexpected template: %s", template) + } + + for _, rt := range resourceTypes { + if tmpl := GetCLIAccessTemplate(rt); tmpl == "" { + t.Fatalf("expected template for %s", rt) + } + } + if tmpl := GetCLIAccessTemplate(ResourceType("unknown")); !strings.Contains(tmpl, "pulse_control") { + t.Fatalf("expected default template to mention pulse_control, got: %s", tmpl) + } + + formatted := FormatCLIAccess(ResourceTypeK8s, "101", "container", "default", "pod") + if !strings.Contains(formatted, "default") || !strings.Contains(formatted, "pod") { + t.Fatalf("unexpected formatted access: %s", formatted) + } +} diff --git a/internal/servicediscovery/deep_scanner.go b/internal/servicediscovery/deep_scanner.go new file mode 100644 index 000000000..9b745c7c6 --- /dev/null +++ b/internal/servicediscovery/deep_scanner.go @@ -0,0 +1,475 @@ +package servicediscovery + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/google/uuid" + "github.com/rs/zerolog/log" +) + +// 
CommandExecutor executes commands on infrastructure. +type CommandExecutor interface { + ExecuteCommand(ctx context.Context, agentID string, cmd ExecuteCommandPayload) (*CommandResultPayload, error) + GetConnectedAgents() []ConnectedAgent + IsAgentConnected(agentID string) bool +} + +// ExecuteCommandPayload mirrors agentexec.ExecuteCommandPayload +type ExecuteCommandPayload struct { + RequestID string `json:"request_id"` + Command string `json:"command"` + TargetType string `json:"target_type"` // "host", "container", "vm" + TargetID string `json:"target_id,omitempty"` // VMID for container/VM + Timeout int `json:"timeout,omitempty"` +} + +// CommandResultPayload mirrors agentexec.CommandResultPayload +type CommandResultPayload struct { + RequestID string `json:"request_id"` + Success bool `json:"success"` + Stdout string `json:"stdout,omitempty"` + Stderr string `json:"stderr,omitempty"` + ExitCode int `json:"exit_code"` + Error string `json:"error,omitempty"` + Duration int64 `json:"duration_ms"` +} + +// ConnectedAgent mirrors agentexec.ConnectedAgent +type ConnectedAgent struct { + AgentID string + Hostname string + Version string + Platform string + Tags []string + ConnectedAt time.Time +} + +// ProgressCallback is called when discovery progress changes. +type ProgressCallback func(*DiscoveryProgress) + +// DeepScanner runs discovery commands on resources. +type DeepScanner struct { + executor CommandExecutor + mu sync.RWMutex + progress map[string]*DiscoveryProgress // resourceID -> progress + maxParallel int + timeout time.Duration + progressCallback ProgressCallback +} + +// NewDeepScanner creates a new deep scanner. 
+func NewDeepScanner(executor CommandExecutor) *DeepScanner { + return &DeepScanner{ + executor: executor, + progress: make(map[string]*DiscoveryProgress), + maxParallel: 3, // Run up to 3 commands in parallel per resource + timeout: 30 * time.Second, + } +} + +// SetProgressCallback sets a callback function that will be called when discovery progress changes. +func (s *DeepScanner) SetProgressCallback(callback ProgressCallback) { + s.mu.Lock() + defer s.mu.Unlock() + s.progressCallback = callback +} + +// notifyProgress calls the progress callback if set. +func (s *DeepScanner) notifyProgress(progress *DiscoveryProgress) { + s.mu.RLock() + callback := s.progressCallback + s.mu.RUnlock() + + if callback != nil && progress != nil { + // Calculate elapsed time and percent complete + progressCopy := *progress + if !progress.StartedAt.IsZero() { + progressCopy.ElapsedMs = time.Since(progress.StartedAt).Milliseconds() + } + if progress.TotalSteps > 0 { + progressCopy.PercentComplete = float64(progress.CompletedSteps) / float64(progress.TotalSteps) * 100 + } + callback(&progressCopy) + } +} + +// ScanResult contains the results of a deep scan. +type ScanResult struct { + ResourceType ResourceType + ResourceID string + HostID string + Hostname string + CommandOutputs map[string]string + Errors map[string]string + StartedAt time.Time + CompletedAt time.Time +} + +// Scan runs discovery commands on a resource and returns the outputs. 
// Scan runs the full discovery command set for a resource and returns the
// collected outputs. Commands are executed on the remote host through a
// connected agent with bounded parallelism (s.maxParallel) and a per-command
// timeout (s.timeout). Progress is tracked in s.progress for the lifetime of
// the scan and broadcast via notifyProgress; the entry is removed on return.
//
// Returns an error only for setup failures (no executor, no matching agent,
// no commands for the resource type); per-command failures are recorded in
// result.Errors instead.
func (s *DeepScanner) Scan(ctx context.Context, req DiscoveryRequest) (*ScanResult, error) {
	resourceID := MakeResourceID(req.ResourceType, req.HostID, req.ResourceID)
	startTime := time.Now()

	// Initialize progress
	s.mu.Lock()
	s.progress[resourceID] = &DiscoveryProgress{
		ResourceID:  resourceID,
		Status:      DiscoveryStatusRunning,
		CurrentStep: "initializing",
		StartedAt:   startTime,
	}
	initialProgress := *s.progress[resourceID]
	s.mu.Unlock()

	// Broadcast scan start
	s.notifyProgress(&initialProgress)

	// Always drop the progress entry when the scan finishes, whatever the
	// outcome, so IsScanning reports false afterwards.
	defer func() {
		s.mu.Lock()
		delete(s.progress, resourceID)
		s.mu.Unlock()
	}()

	result := &ScanResult{
		ResourceType:   req.ResourceType,
		ResourceID:     req.ResourceID,
		HostID:         req.HostID,
		Hostname:       req.Hostname,
		CommandOutputs: make(map[string]string),
		Errors:         make(map[string]string),
		StartedAt:      time.Now(),
	}

	// Check if we have an agent for this host
	if s.executor == nil {
		return nil, fmt.Errorf("no command executor available")
	}

	// Find the agent for this host
	agentID := s.findAgentForHost(req.HostID, req.Hostname)
	if agentID == "" {
		return nil, fmt.Errorf("no connected agent for host %s (%s)", req.HostID, req.Hostname)
	}

	// Get commands for this resource type
	commands := GetCommandsForResource(req.ResourceType)
	if len(commands) == 0 {
		return nil, fmt.Errorf("no commands defined for resource type %s", req.ResourceType)
	}

	// Update progress (copy under lock, broadcast after unlock to avoid
	// holding s.mu across notifyProgress).
	s.mu.Lock()
	if prog, ok := s.progress[resourceID]; ok {
		prog.TotalSteps = len(commands)
		prog.CurrentStep = "running commands"
		progressCopy := *prog
		s.mu.Unlock()
		s.notifyProgress(&progressCopy)
	} else {
		s.mu.Unlock()
	}

	// Run commands with limited parallelism
	semaphore := make(chan struct{}, s.maxParallel)
	var wg sync.WaitGroup
	var mu sync.Mutex // guards result maps across command goroutines

	for _, cmd := range commands {
		wg.Add(1)
		go func(cmd DiscoveryCommand) {
			defer wg.Done()

			// Acquire a parallelism slot, or bail out on cancellation.
			select {
			case semaphore <- struct{}{}:
				defer func() { <-semaphore }()
			case <-ctx.Done():
				return
			}

			// Build the actual command to run
			actualCmd := s.buildCommand(req.ResourceType, req.ResourceID, cmd.Command)

			// Get the target ID for the agent
			targetID := s.getTargetID(req.ResourceType, req.ResourceID)

			// Only validate TargetID when it will be interpolated into shell commands
			// by the agent (container/vm types). Host/docker types don't use TargetID
			// in command wrapping, so they can have any format (including colons for IPv6).
			targetType := s.getTargetType(req.ResourceType)
			if targetType == "container" || targetType == "vm" {
				if err := ValidateResourceID(targetID); err != nil {
					mu.Lock()
					result.Errors[cmd.Name] = fmt.Sprintf("invalid target ID: %v", err)
					mu.Unlock()
					return
				}
			}

			// Execute the command
			cmdCtx, cancel := context.WithTimeout(ctx, s.timeout)
			defer cancel()

			cmdResult, err := s.executor.ExecuteCommand(cmdCtx, agentID, ExecuteCommandPayload{
				RequestID:  uuid.New().String(),
				Command:    actualCmd,
				TargetType: s.getTargetType(req.ResourceType),
				TargetID:   targetID,
				Timeout:    cmd.Timeout,
			})

			mu.Lock()
			defer mu.Unlock()

			if err != nil {
				// Transport-level failure: only surfaced for required commands.
				if !cmd.Optional {
					result.Errors[cmd.Name] = err.Error()
				}
				log.Debug().
					Err(err).
					Str("command", cmd.Name).
					Str("resource", resourceID).
					Msg("Command failed during discovery")
				return
			}

			if cmdResult != nil {
				// Combine stdout and stderr; stderr-only output is kept too.
				output := cmdResult.Stdout
				if cmdResult.Stderr != "" && output != "" {
					output += "\n--- stderr ---\n" + cmdResult.Stderr
				} else if cmdResult.Stderr != "" {
					output = cmdResult.Stderr
				}

				if output != "" {
					result.CommandOutputs[cmd.Name] = output
				}

				if !cmdResult.Success && cmdResult.Error != "" && !cmd.Optional {
					result.Errors[cmd.Name] = cmdResult.Error
				}
			}

			// Update progress and broadcast
			s.mu.Lock()
			if prog, ok := s.progress[resourceID]; ok {
				prog.CompletedSteps++
				prog.CurrentCommand = cmd.Name
				progressCopy := *prog
				s.mu.Unlock()
				s.notifyProgress(&progressCopy)
			} else {
				s.mu.Unlock()
			}
		}(cmd)
	}

	wg.Wait()
	result.CompletedAt = time.Now()

	// Broadcast scan completion.
	// NOTE(review): this reports Completed with 100% even when ctx was
	// canceled and some command goroutines never ran — confirm intended.
	completionProgress := DiscoveryProgress{
		ResourceID:      resourceID,
		Status:          DiscoveryStatusCompleted,
		CurrentStep:     "completed",
		TotalSteps:      len(commands),
		CompletedSteps:  len(commands),
		StartedAt:       startTime,
		ElapsedMs:       result.CompletedAt.Sub(startTime).Milliseconds(),
		PercentComplete: 100,
	}
	s.notifyProgress(&completionProgress)

	log.Info().
		Str("resource", resourceID).
		Int("outputs", len(result.CommandOutputs)).
		Int("errors", len(result.Errors)).
		Dur("duration", result.CompletedAt.Sub(result.StartedAt)).
		Msg("Deep scan completed")

	return result, nil
}

// buildCommand wraps the command appropriately for the resource type.
// NOTE: For LXC/VM, the agent handles wrapping via pct exec / qm guest exec
// based on TargetType, so we don't wrap here. We only wrap for Docker containers
// since Docker isn't a recognized TargetType in the agent.
+func (s *DeepScanner) buildCommand(resourceType ResourceType, resourceID string, cmd string) string { + switch resourceType { + case ResourceTypeLXC: + // Agent wraps with pct exec based on TargetType="container" + return cmd + case ResourceTypeVM: + // Agent wraps with qm guest exec based on TargetType="vm" + return cmd + case ResourceTypeDocker: + // Docker needs wrapping here since agent doesn't handle it + return BuildDockerCommand(resourceID, cmd) + case ResourceTypeHost: + // Commands run directly on host + return cmd + case ResourceTypeDockerLXC: + // Docker inside LXC - agent wraps with pct exec, we just add docker exec + // resourceID format: "vmid:container_name" + parts := splitResourceID(resourceID) + if len(parts) >= 2 { + return BuildDockerCommand(parts[1], cmd) + } + return cmd + case ResourceTypeDockerVM: + // Docker inside VM - agent wraps with qm guest exec, we just add docker exec + parts := splitResourceID(resourceID) + if len(parts) >= 2 { + return BuildDockerCommand(parts[1], cmd) + } + return cmd + default: + return cmd + } +} + +// getTargetType returns the target type for the agent execution payload. +func (s *DeepScanner) getTargetType(resourceType ResourceType) string { + switch resourceType { + case ResourceTypeLXC: + return "container" + case ResourceTypeVM: + return "vm" + case ResourceTypeDocker: + return "host" // Docker commands run on host via docker exec + case ResourceTypeDockerLXC: + return "container" // Docker inside LXC: agent wraps with pct exec + case ResourceTypeDockerVM: + return "vm" // Docker inside VM: agent wraps with qm guest exec + case ResourceTypeHost: + return "host" + default: + return "host" + } +} + +// getTargetID returns the target ID for the agent execution payload. +// For nested Docker (docker_lxc/docker_vm), this extracts just the vmid. 
+func (s *DeepScanner) getTargetID(resourceType ResourceType, resourceID string) string { + switch resourceType { + case ResourceTypeDockerLXC, ResourceTypeDockerVM: + // resourceID format: "vmid:container_name" - extract just vmid + parts := splitResourceID(resourceID) + if len(parts) >= 1 { + return parts[0] + } + return resourceID + default: + return resourceID + } +} + +// findAgentForHost finds the agent ID for a given host. +func (s *DeepScanner) findAgentForHost(hostID, hostname string) string { + agents := s.executor.GetConnectedAgents() + + // First try exact match on agent ID + for _, agent := range agents { + if agent.AgentID == hostID { + return agent.AgentID + } + } + + // Then try hostname match + for _, agent := range agents { + if agent.Hostname == hostname || agent.Hostname == hostID { + return agent.AgentID + } + } + + // If only one agent connected, use it + if len(agents) == 1 { + return agents[0].AgentID + } + + return "" +} + +// GetProgress returns a copy of the current progress of a scan. +// Returns nil if no scan is in progress for the resource. +// A copy is returned to avoid data races with the scan goroutine. +func (s *DeepScanner) GetProgress(resourceID string) *DiscoveryProgress { + s.mu.RLock() + defer s.mu.RUnlock() + if prog, ok := s.progress[resourceID]; ok { + // Return a copy to avoid race with scan goroutine + copy := *prog + return © + } + return nil +} + +// IsScanning returns whether a resource is currently being scanned. +func (s *DeepScanner) IsScanning(resourceID string) bool { + s.mu.RLock() + defer s.mu.RUnlock() + _, ok := s.progress[resourceID] + return ok +} + +// splitResourceID splits a compound resource ID (e.g., "101:container_name"). 
+func splitResourceID(id string) []string { + var parts []string + start := 0 + for i, c := range id { + if c == ':' { + parts = append(parts, id[start:i]) + start = i + 1 + } + } + if start < len(id) { + parts = append(parts, id[start:]) + } + return parts +} + +// ScanDocker runs discovery on Docker containers via the host. +func (s *DeepScanner) ScanDocker(ctx context.Context, hostID, hostname, containerName string) (*ScanResult, error) { + req := DiscoveryRequest{ + ResourceType: ResourceTypeDocker, + ResourceID: containerName, + HostID: hostID, + Hostname: hostname, + } + return s.Scan(ctx, req) +} + +// ScanLXC runs discovery on an LXC container. +func (s *DeepScanner) ScanLXC(ctx context.Context, hostID, hostname, vmid string) (*ScanResult, error) { + req := DiscoveryRequest{ + ResourceType: ResourceTypeLXC, + ResourceID: vmid, + HostID: hostID, + Hostname: hostname, + } + return s.Scan(ctx, req) +} + +// ScanVM runs discovery on a VM via QEMU guest agent. +func (s *DeepScanner) ScanVM(ctx context.Context, hostID, hostname, vmid string) (*ScanResult, error) { + req := DiscoveryRequest{ + ResourceType: ResourceTypeVM, + ResourceID: vmid, + HostID: hostID, + Hostname: hostname, + } + return s.Scan(ctx, req) +} + +// ScanHost runs discovery on a host system. 
func (s *DeepScanner) ScanHost(ctx context.Context, hostID, hostname string) (*ScanResult, error) {
	req := DiscoveryRequest{
		ResourceType: ResourceTypeHost,
		ResourceID:   hostID,
		HostID:       hostID,
		Hostname:     hostname,
	}
	return s.Scan(ctx, req)
}
diff --git a/internal/servicediscovery/deep_scanner_test.go b/internal/servicediscovery/deep_scanner_test.go
new file mode 100644
index 000000000..f21005bc6
--- /dev/null
+++ b/internal/servicediscovery/deep_scanner_test.go
@@ -0,0 +1,395 @@
package servicediscovery

import (
	"context"
	"strings"
	"sync"
	"testing"
	"time"
)

// stubExecutor records every executed command and payload, and fails any
// command containing "docker ps -a" so tests can exercise error collection.
type stubExecutor struct {
	mu       sync.Mutex
	commands []string
	payloads []ExecuteCommandPayload // Track full payloads for testing
	agents   []ConnectedAgent
}

func (s *stubExecutor) ExecuteCommand(ctx context.Context, agentID string, cmd ExecuteCommandPayload) (*CommandResultPayload, error) {
	s.mu.Lock()
	s.commands = append(s.commands, cmd.Command)
	s.payloads = append(s.payloads, cmd)
	s.mu.Unlock()

	// Honor cancellation like a real executor would.
	if err := ctx.Err(); err != nil {
		return nil, err
	}

	if strings.Contains(cmd.Command, "docker ps -a") {
		return &CommandResultPayload{
			RequestID: cmd.RequestID,
			Success:   false,
			Error:     "boom",
		}, nil
	}

	return &CommandResultPayload{
		RequestID: cmd.RequestID,
		Success:   true,
		Stdout:    cmd.Command,
		Duration:  5,
	}, nil
}

func (s *stubExecutor) GetConnectedAgents() []ConnectedAgent {
	return s.agents
}

func (s *stubExecutor) IsAgentConnected(agentID string) bool {
	for _, agent := range s.agents {
		if agent.AgentID == agentID {
			return true
		}
	}
	return false
}

// outputExecutor exercises stdout/stderr combination: combined output for
// "docker ps -a", stderr-only for "docker images", empty otherwise.
type outputExecutor struct{}

func (outputExecutor) ExecuteCommand(ctx context.Context, agentID string, cmd ExecuteCommandPayload) (*CommandResultPayload, error) {
	switch {
	case strings.Contains(cmd.Command, "docker ps -a"):
		return &CommandResultPayload{Success: true, Stdout: "out", Stderr: "err"}, nil
	case strings.Contains(cmd.Command, "docker images"):
		return &CommandResultPayload{Success: true, Stderr: "err-only"}, nil
	default:
		return &CommandResultPayload{Success: true}, nil
	}
}

func (outputExecutor) GetConnectedAgents() []ConnectedAgent {
	return []ConnectedAgent{{AgentID: "host1", Hostname: "host1"}}
}

func (outputExecutor) IsAgentConnected(string) bool { return true }

// errorExecutor fails every command with a transport-level error.
type errorExecutor struct{}

func (errorExecutor) ExecuteCommand(ctx context.Context, agentID string, cmd ExecuteCommandPayload) (*CommandResultPayload, error) {
	return nil, context.DeadlineExceeded
}

func (errorExecutor) GetConnectedAgents() []ConnectedAgent {
	return []ConnectedAgent{{AgentID: "host1", Hostname: "host1"}}
}

func (errorExecutor) IsAgentConnected(string) bool { return true }

// Verifies that a scan of Docker-inside-VM builds docker exec commands,
// targets the VM (TargetType=vm, TargetID=vmid), and records command errors.
func TestDeepScanner_Scan_NestedDockerCommands(t *testing.T) {
	exec := &stubExecutor{
		agents: []ConnectedAgent{
			{AgentID: "host1", Hostname: "host1", ConnectedAt: time.Now()},
		},
	}
	scanner := NewDeepScanner(exec)

	result, err := scanner.Scan(context.Background(), DiscoveryRequest{
		ResourceType: ResourceTypeDockerVM,
		ResourceID:   "101:web",
		HostID:       "host1",
		Hostname:     "host1",
	})
	if err != nil {
		t.Fatalf("Scan error: %v", err)
	}
	if len(result.CommandOutputs) == 0 {
		t.Fatalf("expected command outputs")
	}
	if _, ok := result.Errors["docker_containers"]; !ok {
		t.Fatalf("expected docker_containers error, got %#v", result.Errors)
	}

	exec.mu.Lock()
	defer exec.mu.Unlock()

	// Verify the payload fields are set correctly for nested Docker:
	// - Command should contain "docker exec" (buildCommand adds this)
	// - TargetType should be "vm" (agent wraps with qm guest exec)
	// - TargetID should be "101" (extracted from "101:web")
	foundCorrectPayload := false
	for _, payload := range exec.payloads {
		hasDockerExec := strings.Contains(payload.Command, "docker exec")
		hasContainerName := strings.Contains(payload.Command, "web")
		correctTargetType := payload.TargetType == "vm"
		correctTargetID := payload.TargetID == "101"

		if hasDockerExec && hasContainerName && correctTargetType && correctTargetID {
			foundCorrectPayload = true
			break
		}
	}
	if !foundCorrectPayload {
		t.Fatalf("expected nested docker payload with docker exec, TargetType=vm, TargetID=101, got payloads: %+v", exec.payloads)
	}
}

// Covers agent matching priority (ID, hostname, single-agent fallback, none)
// and the basic target-type mapping.
func TestDeepScanner_FindAgentAndTargetType(t *testing.T) {
	exec := &stubExecutor{
		agents: []ConnectedAgent{
			{AgentID: "a1", Hostname: "node1"},
			{AgentID: "a2", Hostname: "node2"},
		},
	}
	scanner := NewDeepScanner(exec)

	if got := scanner.findAgentForHost("a2", ""); got != "a2" {
		t.Fatalf("expected direct agent match, got %s", got)
	}
	if got := scanner.findAgentForHost("node1", "node1"); got != "a1" {
		t.Fatalf("expected hostname match, got %s", got)
	}

	exec.agents = []ConnectedAgent{{AgentID: "solo", Hostname: "only"}}
	if got := scanner.findAgentForHost("missing", "missing"); got != "solo" {
		t.Fatalf("expected single agent fallback, got %s", got)
	}
	exec.agents = nil
	if got := scanner.findAgentForHost("missing", "missing"); got != "" {
		t.Fatalf("expected no agent, got %s", got)
	}

	if scanner.getTargetType(ResourceTypeLXC) != "container" {
		t.Fatalf("unexpected target type for lxc")
	}
	if scanner.getTargetType(ResourceTypeVM) != "vm" {
		t.Fatalf("unexpected target type for vm")
	}
	if scanner.getTargetType(ResourceTypeDocker) != "host" {
		t.Fatalf("unexpected target type for docker")
	}
	if scanner.getTargetType(ResourceTypeHost) != "host" {
		t.Fatalf("unexpected target type for host")
	}
}

// Interior segments are preserved when splitting compound IDs.
func TestSplitResourceID(t *testing.T) {
	parts := splitResourceID("101:web:extra")
	if len(parts) != 3 || parts[0] != "101" || parts[1] != "web" || parts[2] != "extra" {
		t.Fatalf("unexpected parts: %#v", parts)
	}
}

// Table-driven coverage for getTargetType and getTargetID across all types.
func TestDeepScanner_GetTargetTypeAndID(t *testing.T) {
	scanner := NewDeepScanner(&stubExecutor{})

	// Test getTargetType
	tests := []struct {
		resourceType ResourceType
		wantType     string
	}{
		{ResourceTypeLXC, "container"},
		{ResourceTypeVM, "vm"},
		{ResourceTypeDocker, "host"},
		{ResourceTypeDockerLXC, "container"}, // Docker inside LXC runs via pct exec
		{ResourceTypeDockerVM, "vm"},         // Docker inside VM runs via qm guest exec
		{ResourceTypeHost, "host"},
		{ResourceType("unknown"), "host"},
	}
	for _, tt := range tests {
		if got := scanner.getTargetType(tt.resourceType); got != tt.wantType {
			t.Errorf("getTargetType(%s) = %s, want %s", tt.resourceType, got, tt.wantType)
		}
	}

	// Test getTargetID
	idTests := []struct {
		resourceType ResourceType
		resourceID   string
		wantID       string
	}{
		{ResourceTypeLXC, "101", "101"},
		{ResourceTypeVM, "102", "102"},
		{ResourceTypeDocker, "web", "web"},
		{ResourceTypeDockerLXC, "201:nginx", "201"},   // Extract vmid for nested docker
		{ResourceTypeDockerVM, "301:postgres", "301"}, // Extract vmid for nested docker
		{ResourceTypeHost, "myhost", "myhost"},
	}
	for _, tt := range idTests {
		if got := scanner.getTargetID(tt.resourceType, tt.resourceID); got != tt.wantID {
			t.Errorf("getTargetID(%s, %s) = %s, want %s", tt.resourceType, tt.resourceID, got, tt.wantID)
		}
	}
}

// Covers command wrapping per resource type plus progress bookkeeping.
func TestDeepScanner_BuildCommandAndProgress(t *testing.T) {
	scanner := NewDeepScanner(&stubExecutor{})

	// LXC: buildCommand returns raw command, agent handles pct exec wrapping
	if cmd := scanner.buildCommand(ResourceTypeLXC, "101", "echo hi"); cmd != "echo hi" {
		t.Fatalf("LXC should return raw command (agent wraps), got: %s", cmd)
	}
	// VM: buildCommand returns raw command, agent handles qm guest exec wrapping
	if cmd := scanner.buildCommand(ResourceTypeVM, "101", "echo hi"); cmd != "echo hi" {
		t.Fatalf("VM should return raw command (agent wraps), got: %s", cmd)
	}
	// Docker: buildCommand wraps with docker exec since agent doesn't handle it
	if cmd := scanner.buildCommand(ResourceTypeDocker, "web", "echo hi"); !strings.Contains(cmd, "docker exec") {
		t.Fatalf("Docker should include docker exec, got: %s", cmd)
	}
	// Host: buildCommand returns raw command
	if cmd := scanner.buildCommand(ResourceTypeHost, "host", "echo hi"); cmd != "echo hi" {
		t.Fatalf("Host should return raw command, got: %s", cmd)
	}

	// DockerLXC: buildCommand adds docker exec, agent adds pct exec
	// So we should only see docker exec in the command (agent adds pct exec at runtime)
	dockerLXC := scanner.buildCommand(ResourceTypeDockerLXC, "201:web", "echo hi")
	if !strings.Contains(dockerLXC, "docker exec") {
		t.Fatalf("DockerLXC should include docker exec, got: %s", dockerLXC)
	}
	if strings.Contains(dockerLXC, "pct exec") {
		t.Fatalf("DockerLXC should NOT include pct exec (agent adds it), got: %s", dockerLXC)
	}
	if cmd := scanner.buildCommand(ResourceTypeDockerLXC, "bad", "echo hi"); cmd != "echo hi" {
		t.Fatalf("DockerLXC with bad ID should fallback, got: %s", cmd)
	}

	// DockerVM: buildCommand adds docker exec, agent adds qm guest exec
	dockerVM := scanner.buildCommand(ResourceTypeDockerVM, "301:web", "echo hi")
	if !strings.Contains(dockerVM, "docker exec") {
		t.Fatalf("DockerVM should include docker exec, got: %s", dockerVM)
	}
	if strings.Contains(dockerVM, "qm guest exec") {
		t.Fatalf("DockerVM should NOT include qm guest exec (agent adds it), got: %s", dockerVM)
	}
	if cmd := scanner.buildCommand(ResourceTypeDockerVM, "bad", "echo hi"); cmd != "echo hi" {
		t.Fatalf("DockerVM with bad ID should fallback, got: %s", cmd)
	}

	// Unknown type: returns raw command
	if cmd := scanner.buildCommand(ResourceType("unknown"), "id", "echo hi"); cmd != "echo hi" {
		t.Fatalf("Unknown type should return raw command, got: %s", cmd)
	}

	scanner.progress["id"] = &DiscoveryProgress{ResourceID: "id"}
	if scanner.GetProgress("id") == nil {
		t.Fatalf("expected progress")
	}
	if !scanner.IsScanning("id") {
		t.Fatalf("expected IsScanning true")
	}
	if scanner.GetProgress("missing") != nil {
		t.Fatalf("expected nil progress")
	}
	if scanner.IsScanning("missing") {
		t.Fatalf("expected IsScanning false")
	}

	noExec := NewDeepScanner(nil)
	if _, err := noExec.ScanHost(context.Background(), "host1", "host1"); err == nil {
		t.Fatalf("expected error without executor")
	}
}

// Smoke-tests the typed wrapper entry points against the stub executor.
func TestDeepScanner_ScanWrappers(t *testing.T) {
	exec := &stubExecutor{
		agents: []ConnectedAgent{{AgentID: "host1", Hostname: "host1"}},
	}
	scanner := NewDeepScanner(exec)
	scanner.maxParallel = 1

	if _, err := scanner.ScanDocker(context.Background(), "host1", "host1", "web"); err != nil {
		t.Fatalf("ScanDocker error: %v", err)
	}
	if _, err := scanner.ScanLXC(context.Background(), "host1", "host1", "101"); err != nil {
		t.Fatalf("ScanLXC error: %v", err)
	}
	if _, err := scanner.ScanVM(context.Background(), "host1", "host1", "102"); err != nil {
		t.Fatalf("ScanVM error: %v", err)
	}
}

// Setup failures (unknown resource type, no connected agent) must error.
func TestDeepScanner_ScanErrors(t *testing.T) {
	exec := &stubExecutor{
		agents: []ConnectedAgent{{AgentID: "host1", Hostname: "host1"}},
	}
	scanner := NewDeepScanner(exec)
	if _, err := scanner.Scan(context.Background(), DiscoveryRequest{
		ResourceType: ResourceType("unknown"),
		ResourceID:   "id",
		HostID:       "host1",
		Hostname:     "host1",
	}); err == nil {
		t.Fatalf("expected error for unknown resource type")
	}

	exec.agents = nil
	if _, err := scanner.Scan(context.Background(), DiscoveryRequest{
		ResourceType: ResourceTypeDocker,
		ResourceID:   "web",
		HostID:       "host1",
		Hostname:     "host1",
	}); err == nil {
		t.Fatalf("expected error for missing agent")
	}
}

// Stdout+stderr are combined with a separator; stderr-only output is kept.
func TestDeepScanner_OutputHandling(t *testing.T) {
	exec := outputExecutor{}
	scanner := NewDeepScanner(exec)
	scanner.maxParallel = 1

	result, err := scanner.Scan(context.Background(), DiscoveryRequest{
		ResourceType: ResourceTypeDockerVM,
		ResourceID:   "101:web",
		HostID:       "host1",
		Hostname:     "host1",
	})
	if err != nil {
		t.Fatalf("Scan error: %v", err)
	}
	if out := result.CommandOutputs["docker_containers"]; !strings.Contains(out, "--- stderr ---") {
		t.Fatalf("expected combined stderr output, got %s", out)
	}
	if out := result.CommandOutputs["docker_images"]; out != "err-only" {
		t.Fatalf("expected stderr-only output, got %s", out)
	}
}

// Transport errors on non-optional commands land in result.Errors.
func TestDeepScanner_CommandErrorHandling(t *testing.T) {
	scanner := NewDeepScanner(errorExecutor{})
	scanner.maxParallel = 1

	result, err := scanner.Scan(context.Background(), DiscoveryRequest{
		ResourceType: ResourceTypeDockerVM,
		ResourceID:   "101:web",
		HostID:       "host1",
		Hostname:     "host1",
	})
	if err != nil {
		t.Fatalf("Scan error: %v", err)
	}
	if _, ok := result.Errors["docker_containers"]; !ok {
		t.Fatalf("expected error for non-optional command")
	}
}

// A pre-canceled context must not error: goroutines bail via ctx.Done while
// waiting on the zero-capacity semaphore, and Scan still returns a result.
func TestDeepScanner_ScanCanceledContext(t *testing.T) {
	exec := &stubExecutor{
		agents: []ConnectedAgent{{AgentID: "host1", Hostname: "host1"}},
	}
	scanner := NewDeepScanner(exec)
	scanner.maxParallel = 0

	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	if _, err := scanner.Scan(ctx, DiscoveryRequest{
		ResourceType: ResourceTypeDockerVM,
		ResourceID:   "101:web",
		HostID:       "host1",
		Hostname:     "host1",
	}); err != nil {
		t.Fatalf("Scan error: %v", err)
	}
}
diff --git a/internal/servicediscovery/fingerprint.go b/internal/servicediscovery/fingerprint.go
new file mode 100644
index 000000000..c4aa721d7
--- /dev/null
+++ b/internal/servicediscovery/fingerprint.go
@@ -0,0 +1,249 @@
package servicediscovery

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"
)

// GenerateDockerFingerprint creates a fingerprint from Docker container metadata.
// The fingerprint captures key metadata that indicates when a container has changed
// in ways that would affect discovery results (image, ports, mounts, env keys).
+func GenerateDockerFingerprint(hostID string, container *DockerContainer) *ContainerFingerprint { + fp := &ContainerFingerprint{ + ResourceID: container.Name, + HostID: hostID, + SchemaVersion: FingerprintSchemaVersion, + GeneratedAt: time.Now(), + ImageName: container.Image, + } + + // Extract port mappings (private port + protocol) + for _, p := range container.Ports { + fp.Ports = append(fp.Ports, fmt.Sprintf("%d/%s", p.PrivatePort, p.Protocol)) + } + sort.Strings(fp.Ports) + + // Extract mount paths (container destination paths, not host paths) + for _, m := range container.Mounts { + fp.MountPaths = append(fp.MountPaths, m.Destination) + } + sort.Strings(fp.MountPaths) + + // Extract environment variable keys from labels (if present) + // Note: We don't have direct access to env vars in DockerContainer, + // but labels often contain relevant configuration hints + for key := range container.Labels { + fp.EnvKeys = append(fp.EnvKeys, key) + } + sort.Strings(fp.EnvKeys) + + // Generate the hash + fp.Hash = fp.computeHash() + return fp +} + +// computeHash generates a truncated SHA256 hash of the fingerprint components. +// Includes schema version so algorithm changes produce different hashes. +func (fp *ContainerFingerprint) computeHash() string { + h := sha256.New() + // Include schema version first so algorithm changes are detected + h.Write([]byte(strconv.Itoa(fp.SchemaVersion))) + h.Write([]byte(fp.ImageID)) + h.Write([]byte(fp.ImageName)) + h.Write([]byte(fp.CreatedAt)) + h.Write([]byte(strings.Join(fp.Ports, ","))) + h.Write([]byte(strings.Join(fp.MountPaths, ","))) + h.Write([]byte(strings.Join(fp.EnvKeys, ","))) + return hex.EncodeToString(h.Sum(nil))[:16] // Short hash is sufficient +} + +// HasChanged compares two fingerprints and returns true if they differ. +// Also returns true if the schema version changed (algorithm updated). 
+func (fp *ContainerFingerprint) HasChanged(other *ContainerFingerprint) bool { + if other == nil { + return true + } + return fp.Hash != other.Hash +} + +// HasSchemaChanged returns true if the fingerprint was generated with a different schema. +func (fp *ContainerFingerprint) HasSchemaChanged(other *ContainerFingerprint) bool { + if other == nil { + return false + } + return fp.SchemaVersion != other.SchemaVersion +} + +// String returns a human-readable representation of the fingerprint. +func (fp *ContainerFingerprint) String() string { + return fmt.Sprintf("Fingerprint{id=%s, host=%s, hash=%s, image=%s, ports=%v}", + fp.ResourceID, fp.HostID, fp.Hash, fp.ImageName, fp.Ports) +} + +// GenerateLXCFingerprint creates a fingerprint from LXC container metadata. +// Tracks: VMID, name, OS template, resource allocation, and tags. +func GenerateLXCFingerprint(nodeID string, container *Container) *ContainerFingerprint { + fp := &ContainerFingerprint{ + ResourceID: strconv.Itoa(container.VMID), + HostID: nodeID, + SchemaVersion: FingerprintSchemaVersion, + GeneratedAt: time.Now(), + ImageName: container.OSTemplate, // OS template is like the "image" for LXCs + } + + // Build components for hashing + var components []string + + // Core identity + components = append(components, strconv.Itoa(container.VMID)) + components = append(components, container.Name) + components = append(components, container.OSTemplate) + components = append(components, container.OSName) + + // Resource allocation (changes here might affect what's running) + components = append(components, strconv.Itoa(container.CPUs)) + components = append(components, strconv.FormatUint(container.MaxMemory, 10)) + components = append(components, strconv.FormatUint(container.MaxDisk, 10)) + + // OCI container flag (different container type) + if container.IsOCI { + components = append(components, "oci:true") + } + + // Template flag (templates shouldn't trigger discovery) + if container.Template { + components = 
append(components, "template:true") + } + + // Note: IP addresses intentionally excluded - DHCP churn causes false positives + + // Tags (user might tag based on what's running) + if len(container.Tags) > 0 { + sortedTags := make([]string, len(container.Tags)) + copy(sortedTags, container.Tags) + sort.Strings(sortedTags) + components = append(components, sortedTags...) + } + + // Generate hash + h := sha256.New() + h.Write([]byte(strings.Join(components, "|"))) + fp.Hash = hex.EncodeToString(h.Sum(nil))[:16] + + return fp +} + +// GenerateVMFingerprint creates a fingerprint from VM metadata. +// Tracks: VMID, name, OS, resource allocation, and tags. +func GenerateVMFingerprint(nodeID string, vm *VM) *ContainerFingerprint { + fp := &ContainerFingerprint{ + ResourceID: strconv.Itoa(vm.VMID), + HostID: nodeID, + SchemaVersion: FingerprintSchemaVersion, + GeneratedAt: time.Now(), + ImageName: vm.OSName, // OS name is the closest to an "image" for VMs + } + + // Build components for hashing + var components []string + + // Core identity + components = append(components, strconv.Itoa(vm.VMID)) + components = append(components, vm.Name) + components = append(components, vm.OSName) + components = append(components, vm.OSVersion) + + // Resource allocation + components = append(components, strconv.Itoa(vm.CPUs)) + components = append(components, strconv.FormatUint(vm.MaxMemory, 10)) + components = append(components, strconv.FormatUint(vm.MaxDisk, 10)) + + // Template flag (templates shouldn't trigger discovery) + if vm.Template { + components = append(components, "template:true") + } + + // Note: IP addresses intentionally excluded - DHCP churn causes false positives + + // Tags + if len(vm.Tags) > 0 { + sortedTags := make([]string, len(vm.Tags)) + copy(sortedTags, vm.Tags) + sort.Strings(sortedTags) + components = append(components, sortedTags...) 
+ } + + // Generate hash + h := sha256.New() + h.Write([]byte(strings.Join(components, "|"))) + fp.Hash = hex.EncodeToString(h.Sum(nil))[:16] + + return fp +} + +// GenerateK8sPodFingerprint creates a fingerprint from Kubernetes pod metadata. +// Tracks: UID, name, namespace, labels, owner (deployment/statefulset/etc), and container images. +func GenerateK8sPodFingerprint(clusterID string, pod *KubernetesPod) *ContainerFingerprint { + fp := &ContainerFingerprint{ + ResourceID: pod.UID, + HostID: clusterID, + SchemaVersion: FingerprintSchemaVersion, + GeneratedAt: time.Now(), + } + + // Build components for hashing + var components []string + + // Core identity + components = append(components, pod.UID) + components = append(components, pod.Name) + components = append(components, pod.Namespace) + components = append(components, pod.NodeName) + + // Owner reference (deployment, statefulset, daemonset, etc.) + if pod.OwnerKind != "" { + components = append(components, "owner:"+pod.OwnerKind+"/"+pod.OwnerName) + } + + // Container images (most important for detecting app changes) + var images []string + for _, c := range pod.Containers { + images = append(images, c.Name+":"+c.Image) + } + sort.Strings(images) + if len(images) > 0 { + fp.ImageName = images[0] // Use first container image as the "image name" + components = append(components, "images:"+strings.Join(images, ",")) + } + + // Labels (sorted by key for consistency) + if len(pod.Labels) > 0 { + var labelKeys []string + for k := range pod.Labels { + labelKeys = append(labelKeys, k) + } + sort.Strings(labelKeys) + var labelPairs []string + for _, k := range labelKeys { + labelPairs = append(labelPairs, k+"="+pod.Labels[k]) + } + components = append(components, "labels:"+strings.Join(labelPairs, ",")) + } + + // Generate hash + h := sha256.New() + h.Write([]byte(strings.Join(components, "|"))) + fp.Hash = hex.EncodeToString(h.Sum(nil))[:16] + + return fp +} + +// GenerateFingerprint is an alias for 
GenerateDockerFingerprint for backwards compatibility. +func GenerateFingerprint(hostID string, container *DockerContainer) *ContainerFingerprint { + return GenerateDockerFingerprint(hostID, container) +} diff --git a/internal/servicediscovery/formatters.go b/internal/servicediscovery/formatters.go new file mode 100644 index 000000000..8974a7057 --- /dev/null +++ b/internal/servicediscovery/formatters.go @@ -0,0 +1,629 @@ +package servicediscovery + +import ( + "fmt" + "strings" + "time" +) + +// FormatForAIContext formats discoveries for inclusion in AI prompts. +// This provides context about resources for Patrol, Investigation, and Chat. +func FormatForAIContext(discoveries []*ResourceDiscovery) string { + if len(discoveries) == 0 { + return "" + } + + var sb strings.Builder + sb.WriteString("## Infrastructure Discovery\n\n") + sb.WriteString("The following has been discovered about the affected resources:\n\n") + + for _, d := range discoveries { + sb.WriteString(formatSingleDiscovery(d)) + sb.WriteString("\n") + } + + sb.WriteString("\n**IMPORTANT:** Use the CLI access methods shown above. For example:\n") + sb.WriteString("- For LXC containers, use `pct exec -- `\n") + sb.WriteString("- For VMs with guest agent, use `qm guest exec -- `\n") + sb.WriteString("- For Docker containers, use `docker exec `\n") + + return sb.String() +} + +// FormatSingleForAIContext formats a single discovery for AI context. +func FormatSingleForAIContext(d *ResourceDiscovery) string { + if d == nil { + return "" + } + return formatSingleDiscovery(d) +} + +// formatSingleDiscovery formats a single discovery entry. 
+func formatSingleDiscovery(d *ResourceDiscovery) string { + var sb strings.Builder + + // Header with service info + sb.WriteString(fmt.Sprintf("### %s (%s)\n", d.ServiceName, d.ID)) + sb.WriteString(fmt.Sprintf("- **Type:** %s\n", d.ResourceType)) + sb.WriteString(fmt.Sprintf("- **Host:** %s\n", d.Hostname)) + + if d.ServiceVersion != "" { + sb.WriteString(fmt.Sprintf("- **Version:** %s\n", d.ServiceVersion)) + } + + if d.Category != "" && d.Category != CategoryUnknown { + sb.WriteString(fmt.Sprintf("- **Category:** %s\n", d.Category)) + } + + // CLI access (most important for remediation) + if d.CLIAccess != "" { + sb.WriteString(fmt.Sprintf("- **CLI Access:** `%s`\n", d.CLIAccess)) + } + + // Config, data, and log paths + if len(d.ConfigPaths) > 0 { + sb.WriteString(fmt.Sprintf("- **Config Paths:** %s\n", strings.Join(d.ConfigPaths, ", "))) + } + if len(d.DataPaths) > 0 { + sb.WriteString(fmt.Sprintf("- **Data Paths:** %s\n", strings.Join(d.DataPaths, ", "))) + } + if len(d.LogPaths) > 0 { + sb.WriteString(fmt.Sprintf("- **Log Paths:** %s\n", strings.Join(d.LogPaths, ", "))) + } + + // Ports + if len(d.Ports) > 0 { + var ports []string + for _, p := range d.Ports { + ports = append(ports, fmt.Sprintf("%d/%s", p.Port, p.Protocol)) + } + sb.WriteString(fmt.Sprintf("- **Ports:** %s\n", strings.Join(ports, ", "))) + } + + // Important facts + importantFacts := filterImportantFacts(d.Facts) + if len(importantFacts) > 0 { + sb.WriteString("- **Key Facts:**\n") + for _, f := range importantFacts { + sb.WriteString(fmt.Sprintf(" - %s: %s\n", f.Key, f.Value)) + } + } + + // User notes (critical for context) + if d.UserNotes != "" { + sb.WriteString(fmt.Sprintf("- **User Notes:** %s\n", d.UserNotes)) + } + + return sb.String() +} + +// filterImportantFacts returns the most relevant facts for AI context. 
+func filterImportantFacts(facts []DiscoveryFact) []DiscoveryFact { + var important []DiscoveryFact + + // Priority categories + priorityCategories := map[FactCategory]bool{ + FactCategoryHardware: true, // GPU, TPU + FactCategoryDependency: true, // MQTT, database connections + FactCategorySecurity: true, // Auth info + FactCategoryVersion: true, // Version info + } + + for _, f := range facts { + if priorityCategories[f.Category] && f.Confidence >= 0.7 { + important = append(important, f) + } + } + + // Limit to top 5 facts + if len(important) > 5 { + important = important[:5] + } + + return important +} + +// FormatDiscoverySummary formats a summary of all discoveries. +func FormatDiscoverySummary(discoveries []*ResourceDiscovery) string { + if len(discoveries) == 0 { + return "No infrastructure discovery data available." + } + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("Infrastructure Discovery Summary (%d resources):\n\n", len(discoveries))) + + // Group by resource type + byType := make(map[ResourceType][]*ResourceDiscovery) + for _, d := range discoveries { + byType[d.ResourceType] = append(byType[d.ResourceType], d) + } + + for rt, ds := range byType { + sb.WriteString(fmt.Sprintf("**%s** (%d):\n", rt, len(ds))) + for _, d := range ds { + confidence := "" + if d.Confidence >= 0.9 { + confidence = " [high confidence]" + } else if d.Confidence >= 0.7 { + confidence = " [medium confidence]" + } + sb.WriteString(fmt.Sprintf(" - %s: %s%s\n", d.ResourceID, d.ServiceName, confidence)) + } + sb.WriteString("\n") + } + + return sb.String() +} + +// FormatScopeHint returns a compact, single-line discovery hint for scoped patrols. 
func FormatScopeHint(discoveries []*ResourceDiscovery) string {
	if len(discoveries) == 0 {
		return ""
	}
	// Only the first discovery is summarized; additional ones are counted.
	primary := discoveries[0]
	summary := formatScopeDiscoverySummary(primary)
	if summary == "" {
		return ""
	}
	if len(discoveries) > 1 {
		summary = fmt.Sprintf("%s (+%d more)", summary, len(discoveries)-1)
	}
	return "Discovery: " + summary
}

// formatScopeDiscoverySummary builds a one-line summary for a single
// discovery: "name vX.Y (type on host); cli: ...; ports: ...".
// Returns "" when there is nothing identifiable to show.
func formatScopeDiscoverySummary(d *ResourceDiscovery) string {
	if d == nil {
		return ""
	}
	name := firstNonEmpty(d.ServiceName, d.ServiceType, d.ResourceID, d.ID)
	if name == "" {
		return ""
	}
	base := name
	// Append the version only if the name does not already mention it.
	if d.ServiceVersion != "" && !strings.Contains(strings.ToLower(base), strings.ToLower(d.ServiceVersion)) {
		version := d.ServiceVersion
		if !strings.HasPrefix(strings.ToLower(version), "v") {
			version = "v" + version
		}
		base = fmt.Sprintf("%s %s", base, version)
	}

	// "type on host" metadata, degrading gracefully when either is missing.
	host := firstNonEmpty(d.Hostname, d.HostID)
	meta := strings.TrimSpace(string(d.ResourceType))
	if host != "" {
		if meta != "" {
			meta = fmt.Sprintf("%s on %s", meta, host)
		} else {
			meta = host
		}
	}
	if meta != "" {
		base = fmt.Sprintf("%s (%s)", base, meta)
	}

	parts := []string{base}
	if cli := shortenScopeCLI(d.CLIAccess); cli != "" {
		parts = append(parts, "cli: "+cli)
	}
	if ports := formatScopePorts(d.Ports); ports != "" {
		parts = append(parts, "ports: "+ports)
	}

	return strings.Join(parts, "; ")
}

// shortenScopeCLI collapses internal whitespace in a CLI template and caps it
// at 64 characters so it fits on the single-line scope hint.
func shortenScopeCLI(value string) string {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return ""
	}
	compact := strings.Join(strings.Fields(trimmed), " ")
	return truncateScopeText(compact, 64)
}

// formatScopePorts renders at most three "port/proto" pairs, then a
// "+N more" marker for the remainder. Empty protocol defaults to "tcp".
func formatScopePorts(ports []PortInfo) string {
	if len(ports) == 0 {
		return ""
	}
	maxPorts := 3
	if len(ports) < maxPorts {
		maxPorts = len(ports)
	}
	parts := make([]string, 0, maxPorts)
	for i := 0; i < maxPorts; i++ {
		p := ports[i]
		proto := p.Protocol
		if proto == "" {
			proto = "tcp"
		}
		parts = append(parts, fmt.Sprintf("%d/%s", p.Port, proto))
	}
	if len(ports) > maxPorts {
		parts = append(parts, fmt.Sprintf("+%d more", len(ports)-maxPorts))
	}
	return strings.Join(parts, ", ")
}

// truncateScopeText caps value at max bytes, replacing the tail with "..."
// when room allows. NOTE(review): byte slicing can split a multi-byte UTF-8
// rune at the cut point — acceptable for log-style hints, but confirm if
// these strings can carry non-ASCII content.
func truncateScopeText(value string, max int) string {
	if max <= 0 || len(value) <= max {
		return value
	}
	if max <= 3 {
		return value[:max]
	}
	return value[:max-3] + "..."
}

// firstNonEmpty returns the first argument that is non-blank after trimming.
// Note the ORIGINAL (untrimmed) value is returned, only the test is trimmed.
func firstNonEmpty(values ...string) string {
	for _, v := range values {
		if strings.TrimSpace(v) != "" {
			return v
		}
	}
	return ""
}

// FormatForRemediation formats discovery specifically for remediation context.
func FormatForRemediation(d *ResourceDiscovery) string {
	if d == nil {
		return ""
	}

	var sb strings.Builder
	sb.WriteString("## Resource Context for Remediation\n\n")

	sb.WriteString(fmt.Sprintf("**Resource:** %s (%s)\n", d.ServiceName, d.ID))
	sb.WriteString(fmt.Sprintf("**Type:** %s on %s\n\n", d.ResourceType, d.Hostname))

	// CLI access is most critical — it tells the AI how to run commands.
	if d.CLIAccess != "" {
		sb.WriteString("### How to Execute Commands\n")
		sb.WriteString(fmt.Sprintf("```\n%s\n```\n\n", d.CLIAccess))
	}

	// Service identity and version, when known.
	if d.ServiceType != "" {
		sb.WriteString(fmt.Sprintf("**Service:** %s", d.ServiceType))
		if d.ServiceVersion != "" {
			sb.WriteString(fmt.Sprintf(" v%s", d.ServiceVersion))
		}
		sb.WriteString("\n\n")
	}

	// Config paths for potential fixes.
	if len(d.ConfigPaths) > 0 {
		sb.WriteString("### Configuration Files\n")
		for _, p := range d.ConfigPaths {
			sb.WriteString(fmt.Sprintf("- `%s`\n", p))
		}
		sb.WriteString("\n")
	}

	// Log paths for troubleshooting.
	if len(d.LogPaths) > 0 {
		sb.WriteString("### Log Files\n")
		for _, p := range d.LogPaths {
			sb.WriteString(fmt.Sprintf("- `%s`\n", p))
		}
		sb.WriteString("\n")
	}

	// User notes may contain important operator context.
	if d.UserNotes != "" {
		sb.WriteString("### User Notes\n")
		sb.WriteString(d.UserNotes)
		sb.WriteString("\n\n")
	}

	// Hardware facts (e.g. GPU passthrough) need special consideration
	// during remediation.
	for _, f := range d.Facts {
		if f.Category == FactCategoryHardware {
			sb.WriteString(fmt.Sprintf("**Hardware:** %s = %s\n", f.Key, f.Value))
		}
	}

	return sb.String()
}

// FormatDiscoveryAge returns a human-readable age string ("just now",
// "N minutes/hours/days ago") based on the discovery's UpdatedAt timestamp.
func FormatDiscoveryAge(d *ResourceDiscovery) string {
	if d == nil || d.UpdatedAt.IsZero() {
		return "unknown"
	}

	age := time.Since(d.UpdatedAt)
	switch {
	case age < time.Minute:
		return "just now"
	case age < time.Hour:
		mins := int(age.Minutes())
		if mins == 1 {
			return "1 minute ago"
		}
		return fmt.Sprintf("%d minutes ago", mins)
	case age < 24*time.Hour:
		hours := int(age.Hours())
		if hours == 1 {
			return "1 hour ago"
		}
		return fmt.Sprintf("%d hours ago", hours)
	default:
		days := int(age.Hours() / 24)
		if days == 1 {
			return "1 day ago"
		}
		return fmt.Sprintf("%d days ago", days)
	}
}

// FilterDiscoveriesByResourceIDs returns discoveries that match any of the given resource IDs.
// This is used to scope discovery context for targeted patrol runs.
+func FilterDiscoveriesByResourceIDs(discoveries []*ResourceDiscovery, resourceIDs []string) []*ResourceDiscovery { + if len(discoveries) == 0 { + return nil + } + if len(resourceIDs) == 0 { + return discoveries + } + + tokens := buildResourceIDTokenSet(resourceIDs) + if len(tokens) == 0 { + return nil + } + + filtered := make([]*ResourceDiscovery, 0, len(discoveries)) + for _, d := range discoveries { + if discoveryMatchesTokens(d, tokens) { + filtered = append(filtered, d) + } + } + return filtered +} + +func buildResourceIDTokenSet(resourceIDs []string) map[string]struct{} { + tokens := make(map[string]struct{}) + for _, id := range resourceIDs { + addResourceIDTokens(tokens, id) + } + return tokens +} + +func addResourceIDTokens(tokens map[string]struct{}, resourceID string) { + trimmed := strings.TrimSpace(resourceID) + if trimmed == "" { + return + } + + addToken(tokens, trimmed) + + if last := lastSegment(trimmed, '/'); last != "" { + addToken(tokens, last) + } + if last := lastSegment(trimmed, ':'); last != "" { + addToken(tokens, last) + } + + lower := strings.ToLower(trimmed) + if strings.HasPrefix(lower, "vm-") { + addToken(tokens, trimmed[3:]) + } + if strings.HasPrefix(lower, "ct-") { + addToken(tokens, trimmed[3:]) + } + if strings.HasPrefix(lower, "lxc-") { + addToken(tokens, trimmed[4:]) + } + + if strings.Contains(lower, "qemu/") || strings.Contains(lower, "lxc/") || strings.HasPrefix(lower, "vm-") || strings.HasPrefix(lower, "ct-") { + if digits := trailingDigits(trimmed); digits != "" { + addToken(tokens, digits) + } + } + + // docker:host/container -> host + container tokens + if strings.Contains(trimmed, ":") { + parts := strings.SplitN(trimmed, ":", 2) + if len(parts) == 2 { + rest := parts[1] + if slash := strings.Index(rest, "/"); slash >= 0 { + host := strings.TrimSpace(rest[:slash]) + container := strings.TrimSpace(rest[slash+1:]) + addToken(tokens, host) + addToken(tokens, container) + } + } + } +} + +func discoveryMatchesTokens(d 
*ResourceDiscovery, tokens map[string]struct{}) bool { + if d == nil { + return false + } + + candidates := discoveryTokens(d) + for _, candidate := range candidates { + if _, ok := tokens[candidate]; ok { + return true + } + } + return false +} + +func discoveryTokens(d *ResourceDiscovery) []string { + var tokens []string + add := func(value string) { + trimmed := strings.TrimSpace(value) + if trimmed == "" { + return + } + tokens = append(tokens, strings.ToLower(trimmed)) + } + + add(d.ResourceID) + add(d.ID) + add(d.HostID) + if d.HostID != "" { + add("host:" + d.HostID) + } + + switch d.ResourceType { + case ResourceTypeVM: + add("qemu/" + d.ResourceID) + add("vm/" + d.ResourceID) + add("vm-" + d.ResourceID) + case ResourceTypeLXC: + add("lxc/" + d.ResourceID) + add("ct/" + d.ResourceID) + add("ct-" + d.ResourceID) + case ResourceTypeDocker: + if d.HostID != "" { + add("docker:" + d.HostID) + add("docker:" + d.HostID + "/" + d.ResourceID) + } + case ResourceTypeHost: + add("host:" + d.ResourceID) + case ResourceTypeK8s: + add("k8s/" + d.ResourceID) + add("kubernetes/" + d.ResourceID) + } + + return tokens +} + +func addToken(tokens map[string]struct{}, value string) { + trimmed := strings.TrimSpace(value) + if trimmed == "" { + return + } + tokens[strings.ToLower(trimmed)] = struct{}{} +} + +func lastSegment(value string, sep byte) string { + if value == "" { + return "" + } + idx := strings.LastIndexByte(value, sep) + if idx == -1 || idx+1 >= len(value) { + return "" + } + return value[idx+1:] +} + +func trailingDigits(value string) string { + if value == "" { + return "" + } + i := len(value) + for i > 0 { + c := value[i-1] + if c < '0' || c > '9' { + break + } + i-- + } + if i == len(value) { + return "" + } + return value[i:] +} + +// GetCLIExample returns an example CLI command for the resource. 
+func GetCLIExample(d *ResourceDiscovery, exampleCmd string) string { + if d == nil || d.CLIAccess == "" { + return "" + } + + // Replace the placeholder with the example command + cli := d.CLIAccess + cli = strings.ReplaceAll(cli, "...", exampleCmd) + cli = strings.ReplaceAll(cli, "{command}", exampleCmd) + + return cli +} + +// FormatFactsTable formats facts as a simple table. +func FormatFactsTable(facts []DiscoveryFact) string { + if len(facts) == 0 { + return "" + } + + var sb strings.Builder + sb.WriteString("| Category | Key | Value |\n") + sb.WriteString("|----------|-----|-------|\n") + + for _, f := range facts { + value := f.Value + if len(value) > 50 { + value = value[:47] + "..." + } + sb.WriteString(fmt.Sprintf("| %s | %s | %s |\n", f.Category, f.Key, value)) + } + + return sb.String() +} + +// BuildResourceContextForPatrol builds context for Patrol findings. +func BuildResourceContextForPatrol(store *Store, resourceIDs []string) string { + if store == nil || len(resourceIDs) == 0 { + return "" + } + + discoveries, err := store.GetMultiple(resourceIDs) + if err != nil || len(discoveries) == 0 { + return "" + } + + return FormatForAIContext(discoveries) +} + +// ToJSON converts a discovery to a JSON-friendly map. 
+func ToJSON(d *ResourceDiscovery) map[string]any { + if d == nil { + return nil + } + + facts := make([]map[string]any, 0, len(d.Facts)) + for _, f := range d.Facts { + facts = append(facts, map[string]any{ + "category": f.Category, + "key": f.Key, + "value": f.Value, + "source": f.Source, + "confidence": f.Confidence, + }) + } + + ports := make([]map[string]any, 0, len(d.Ports)) + for _, p := range d.Ports { + ports = append(ports, map[string]any{ + "port": p.Port, + "protocol": p.Protocol, + "process": p.Process, + "address": p.Address, + }) + } + + return map[string]any{ + "id": d.ID, + "resource_type": d.ResourceType, + "resource_id": d.ResourceID, + "host_id": d.HostID, + "hostname": d.Hostname, + "service_type": d.ServiceType, + "service_name": d.ServiceName, + "service_version": d.ServiceVersion, + "category": d.Category, + "cli_access": d.CLIAccess, + "facts": facts, + "config_paths": d.ConfigPaths, + "data_paths": d.DataPaths, + "log_paths": d.LogPaths, + "ports": ports, + "user_notes": d.UserNotes, + "confidence": d.Confidence, + "ai_reasoning": d.AIReasoning, + "discovered_at": d.DiscoveredAt, + "updated_at": d.UpdatedAt, + "scan_duration": d.ScanDuration, + } +} diff --git a/internal/servicediscovery/formatters_test.go b/internal/servicediscovery/formatters_test.go new file mode 100644 index 000000000..988ba2727 --- /dev/null +++ b/internal/servicediscovery/formatters_test.go @@ -0,0 +1,218 @@ +package servicediscovery + +import ( + "strings" + "testing" + "time" +) + +func TestFormattersAndTables(t *testing.T) { + if FormatForAIContext(nil) != "" { + t.Fatalf("expected empty context for nil discoveries") + } + + discovery := &ResourceDiscovery{ + ID: MakeResourceID(ResourceTypeDocker, "host1", "app"), + ResourceType: ResourceTypeDocker, + ResourceID: "app", + HostID: "host1", + Hostname: "host1", + ServiceType: "app", + ServiceName: "App Service", + ServiceVersion: "1.0", + Category: CategoryWebServer, + CLIAccess: "docker exec app ...", + 
ConfigPaths: []string{"/etc/app/config.yml"}, + DataPaths: []string{"/var/lib/app"}, + Ports: []PortInfo{{Port: 80, Protocol: "tcp"}}, + UserNotes: "keepalive enabled", + Facts: []DiscoveryFact{ + {Category: FactCategoryHardware, Key: "gpu", Value: "nvidia", Confidence: 0.9}, + {Category: FactCategoryService, Key: "worker", Value: "enabled", Confidence: 0.9}, + }, + } + + ctx := FormatForAIContext([]*ResourceDiscovery{discovery}) + if !strings.Contains(ctx, "Infrastructure Discovery") || !strings.Contains(ctx, "App Service") { + t.Fatalf("unexpected context: %s", ctx) + } + if !strings.Contains(ctx, "docker exec") || !strings.Contains(ctx, "User Notes") { + t.Fatalf("missing expected fields in context") + } + + if FormatSingleForAIContext(nil) != "" { + t.Fatalf("expected empty string for nil discovery") + } + if !strings.Contains(FormatSingleForAIContext(discovery), "App Service") { + t.Fatalf("expected single discovery output") + } + + remediation := FormatForRemediation(discovery) + if !strings.Contains(remediation, "How to Execute Commands") || !strings.Contains(remediation, "Hardware") { + t.Fatalf("unexpected remediation output: %s", remediation) + } + if FormatForRemediation(nil) != "" { + t.Fatalf("expected empty remediation output for nil") + } + + example := GetCLIExample(discovery, "ls /") + if !strings.Contains(example, "ls /") { + t.Fatalf("unexpected cli example: %s", example) + } + if GetCLIExample(&ResourceDiscovery{}, "ls /") != "" { + t.Fatalf("expected empty example when cli access missing") + } + + table := FormatFactsTable([]DiscoveryFact{ + {Category: FactCategoryVersion, Key: "app", Value: strings.Repeat("x", 60)}, + }) + if !strings.Contains(table, "...") { + t.Fatalf("expected truncated table value: %s", table) + } + if FormatFactsTable(nil) != "" { + t.Fatalf("expected empty facts table for nil") + } + + jsonMap := ToJSON(discovery) + if jsonMap["service_name"] != "App Service" || jsonMap["resource_id"] != "app" { + t.Fatalf("unexpected 
json map: %#v", jsonMap) + } + if ToJSON(nil) != nil { + t.Fatalf("expected nil json map for nil discovery") + } +} + +func TestFormatDiscoverySummaryAndAge(t *testing.T) { + now := time.Now() + if FormatDiscoverySummary(nil) == "" { + t.Fatalf("expected summary text for empty list") + } + if FormatDiscoveryAge(nil) != "unknown" { + t.Fatalf("expected unknown age for nil") + } + if FormatDiscoveryAge(&ResourceDiscovery{}) != "unknown" { + t.Fatalf("expected unknown age for zero timestamp") + } + discoveries := []*ResourceDiscovery{ + { + ID: MakeResourceID(ResourceTypeVM, "node1", "101"), + ResourceType: ResourceTypeVM, + ResourceID: "101", + HostID: "node1", + ServiceName: "VM One", + Confidence: 0.95, + UpdatedAt: now.Add(-2 * time.Hour), + }, + { + ID: MakeResourceID(ResourceTypeDocker, "host1", "app"), + ResourceType: ResourceTypeDocker, + ResourceID: "app", + HostID: "host1", + ServiceName: "App", + Confidence: 0.75, + UpdatedAt: now.Add(-2 * 24 * time.Hour), + }, + } + + summary := FormatDiscoverySummary(discoveries) + if !strings.Contains(summary, "[high confidence]") || !strings.Contains(summary, "[medium confidence]") { + t.Fatalf("unexpected summary: %s", summary) + } + + tests := []struct { + name string + updated time.Time + expected string + }{ + {name: "just-now", updated: now.Add(-30 * time.Second), expected: "just now"}, + {name: "one-minute", updated: now.Add(-1 * time.Minute), expected: "1 minute ago"}, + {name: "minutes", updated: now.Add(-10 * time.Minute), expected: "10 minutes ago"}, + {name: "one-hour", updated: now.Add(-1 * time.Hour), expected: "1 hour ago"}, + {name: "hours", updated: now.Add(-2 * time.Hour), expected: "2 hours ago"}, + {name: "one-day", updated: now.Add(-24 * time.Hour), expected: "1 day ago"}, + {name: "days", updated: now.Add(-3 * 24 * time.Hour), expected: "3 days ago"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := FormatDiscoveryAge(&ResourceDiscovery{UpdatedAt: tt.updated}) + if 
got != tt.expected { + t.Fatalf("expected %s, got %s", tt.expected, got) + } + }) + } +} + +func TestBuildResourceContextForPatrol(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + discovery := &ResourceDiscovery{ + ID: MakeResourceID(ResourceTypeDocker, "host1", "app"), + ResourceType: ResourceTypeDocker, + ResourceID: "app", + HostID: "host1", + ServiceName: "App Service", + } + if err := store.Save(discovery); err != nil { + t.Fatalf("Save error: %v", err) + } + + ctx := BuildResourceContextForPatrol(store, []string{discovery.ID}) + if !strings.Contains(ctx, "App Service") { + t.Fatalf("unexpected patrol context: %s", ctx) + } + + if BuildResourceContextForPatrol(nil, []string{discovery.ID}) != "" { + t.Fatalf("expected empty context for nil store") + } + if BuildResourceContextForPatrol(store, nil) != "" { + t.Fatalf("expected empty context for empty ids") + } + if BuildResourceContextForPatrol(store, []string{"missing"}) != "" { + t.Fatalf("expected empty context for missing discoveries") + } +} + +func TestFormatScopeHint(t *testing.T) { + discovery := &ResourceDiscovery{ + ID: MakeResourceID(ResourceTypeDocker, "host1", "app"), + ResourceType: ResourceTypeDocker, + ResourceID: "app", + HostID: "host1", + Hostname: "host1", + ServiceType: "app", + ServiceName: "App Service", + ServiceVersion: "1.2.3", + CLIAccess: "docker exec app -- ...", + Ports: []PortInfo{{Port: 80, Protocol: "tcp"}, {Port: 443, Protocol: "tcp"}}, + } + + hint := FormatScopeHint([]*ResourceDiscovery{discovery}) + if !strings.Contains(hint, "Discovery:") || !strings.Contains(hint, "App Service") { + t.Fatalf("unexpected scope hint: %s", hint) + } + if FormatScopeHint(nil) != "" { + t.Fatalf("expected empty hint for nil") + } +} + +func TestFilterImportantFactsLimit(t *testing.T) { + var facts []DiscoveryFact + for i := 0; i < 7; i++ { + facts = append(facts, DiscoveryFact{ + Category: 
FactCategoryVersion, + Key: "k", + Value: "v", + Confidence: 0.9, + }) + } + + important := filterImportantFacts(facts) + if len(important) != 5 { + t.Fatalf("expected 5 facts, got %d", len(important)) + } +} diff --git a/internal/servicediscovery/service.go b/internal/servicediscovery/service.go new file mode 100644 index 000000000..872019626 --- /dev/null +++ b/internal/servicediscovery/service.go @@ -0,0 +1,1753 @@ +// Package servicediscovery provides infrastructure discovery capabilities. +// It discovers services, versions, configurations, and CLI access methods +// for VMs, LXCs, Docker containers, Kubernetes pods, and hosts. +package servicediscovery + +import ( + "context" + "encoding/json" + "fmt" + "strconv" + "strings" + "sync" + "time" + + "github.com/rs/zerolog/log" +) + +// StateProvider provides access to the current infrastructure state. +type StateProvider interface { + GetState() StateSnapshot +} + +// StateSnapshot represents the infrastructure state. This mirrors models.StateSnapshot +// to avoid circular dependencies. +type StateSnapshot struct { + VMs []VM + Containers []Container + DockerHosts []DockerHost + KubernetesClusters []KubernetesCluster +} + +// VM represents a virtual machine. +type VM struct { + VMID int + Name string + Node string + Status string + Instance string + // Additional metadata for fingerprinting + CPUs int // Number of CPU cores + MaxMemory uint64 // Max memory in bytes + MaxDisk uint64 // Max disk in bytes + Tags []string // User-defined tags + OSName string // Detected OS name + OSVersion string // OS version string + IPAddresses []string // IP addresses assigned to the VM + Template bool // True if this is a template +} + +// Container represents an LXC container. 
type Container struct {
	VMID     int    // Proxmox container ID
	Name     string
	Node     string // Proxmox node hosting the container
	Status   string
	Instance string
	// Additional metadata for fingerprinting
	CPUs        int      // Number of CPU cores
	MaxMemory   uint64   // Max memory in bytes
	MaxDisk     uint64   // Max disk in bytes
	Tags        []string // User-defined tags
	OSTemplate  string   // Template or OCI image used
	OSName      string   // Detected OS name
	IsOCI       bool     // True if OCI container (Proxmox 9.1+)
	IPAddresses []string // IP addresses assigned to the container
	Template    bool     // True if this is a template
}

// DockerHost represents a Docker host.
type DockerHost struct {
	AgentID    string // Identifier of the agent reporting this host
	Hostname   string
	Containers []DockerContainer
}

// DockerContainer represents a Docker container.
type DockerContainer struct {
	ID     string
	Name   string
	Image  string
	Status string
	Ports  []DockerPort
	Labels map[string]string
	Mounts []DockerMount
}

// DockerPort represents a port mapping.
type DockerPort struct {
	PublicPort  int    // Host-side published port
	PrivatePort int    // Container-side port
	Protocol    string // e.g. "tcp" or "udp"
}

// DockerMount represents a mount point.
type DockerMount struct {
	Source      string // Host path or volume source
	Destination string // Path inside the container
}

// KubernetesCluster represents a Kubernetes cluster.
type KubernetesCluster struct {
	ID      string
	Name    string
	AgentID string // Identifier of the agent reporting this cluster
	Status  string
	Pods    []KubernetesPod
}

// KubernetesPod represents a Kubernetes pod.
type KubernetesPod struct {
	UID        string
	Name       string
	Namespace  string
	NodeName   string
	Phase      string
	Labels     map[string]string
	OwnerKind  string // e.g., "Deployment", "StatefulSet", "DaemonSet"
	OwnerName  string
	Containers []KubernetesPodContainer
}

// KubernetesPodContainer represents a container within a Kubernetes pod.
type KubernetesPodContainer struct {
	Name         string
	Image        string
	Ready        bool
	RestartCount int32
	State        string // e.g., "running", "waiting", "terminated"
}

// AIAnalyzer provides AI analysis capabilities for discovery.
type AIAnalyzer interface {
	// AnalyzeForDiscovery sends a discovery prompt to the AI backend and
	// returns its raw response text.
	AnalyzeForDiscovery(ctx context.Context, prompt string) (string, error)
}

// WSMessage represents a WebSocket message for broadcasting.
type WSMessage struct {
	Type string      `json:"type"`
	Data interface{} `json:"data"`
}

// WSBroadcaster provides WebSocket broadcasting capabilities.
type WSBroadcaster interface {
	BroadcastDiscoveryProgress(progress *DiscoveryProgress)
}

// Service manages infrastructure discovery. All mutable state below `mu` is
// guarded by it; the analysis cache and in-progress map have their own locks.
type Service struct {
	store         *Store
	scanner       *DeepScanner
	stateProvider StateProvider
	aiAnalyzer    AIAnalyzer
	wsHub         WSBroadcaster // WebSocket hub for broadcasting progress

	mu              sync.RWMutex
	running         bool
	stopCh          chan struct{}
	intervalCh      chan time.Duration // Channel for live interval updates
	interval        time.Duration
	initialDelay    time.Duration
	lastRun         time.Time
	deepScanTimeout time.Duration // Timeout for individual deep scans
	maxDiscoveryAge time.Duration // Max age before rediscovery (default 30 days)

	// Cache for AI analysis results (by image name)
	analysisCache map[string]*analysisCacheEntry
	cacheMu       sync.RWMutex
	cacheExpiry   time.Duration

	// In-progress discovery tracking (prevents duplicate concurrent discoveries)
	inProgressMu sync.Mutex
	inProgress   map[string]*discoveryInProgress
}

// discoveryInProgress tracks an ongoing discovery operation.
// Multiple callers can wait on the done channel for completion.
type discoveryInProgress struct {
	done   chan struct{}      // Closed when discovery completes
	result *ResourceDiscovery // Result after completion
	err    error              // Error after completion
}

// analysisCacheEntry holds a cached AI analysis result with its timestamp.
type analysisCacheEntry struct {
	result   *AIAnalysisResponse
	cachedAt time.Time // Used with cacheExpiry to decide staleness
}

// Config holds discovery service configuration.
+type Config struct { + DataDir string + Interval time.Duration // How often to run fingerprint collection (default 5 min) + CacheExpiry time.Duration // How long to cache AI analysis results + DeepScanTimeout time.Duration // Timeout for individual deep scans (default 60s) + + // Fingerprint-based discovery settings + MaxDiscoveryAge time.Duration // Rediscover after this duration (default 30 days) + FingerprintInterval time.Duration // How often to collect fingerprints (default 5 min) +} + +// DefaultConfig returns the default discovery configuration. +func DefaultConfig() Config { + return Config{ + Interval: 5 * time.Minute, // Fingerprint collection interval + CacheExpiry: 1 * time.Hour, + DeepScanTimeout: 60 * time.Second, + MaxDiscoveryAge: 30 * 24 * time.Hour, // 30 days + FingerprintInterval: 5 * time.Minute, + } +} + +// NewService creates a new discovery service. +func NewService(store *Store, scanner *DeepScanner, stateProvider StateProvider, cfg Config) *Service { + if cfg.Interval == 0 { + cfg.Interval = 5 * time.Minute + } + if cfg.CacheExpiry == 0 { + cfg.CacheExpiry = 1 * time.Hour + } + if cfg.DeepScanTimeout == 0 { + cfg.DeepScanTimeout = 60 * time.Second + } + if cfg.MaxDiscoveryAge == 0 { + cfg.MaxDiscoveryAge = 30 * 24 * time.Hour // 30 days + } + + return &Service{ + store: store, + scanner: scanner, + stateProvider: stateProvider, + interval: cfg.Interval, + initialDelay: 30 * time.Second, + cacheExpiry: cfg.CacheExpiry, + deepScanTimeout: cfg.DeepScanTimeout, + maxDiscoveryAge: cfg.MaxDiscoveryAge, + stopCh: make(chan struct{}), + intervalCh: make(chan time.Duration, 1), // Buffered to prevent blocking + analysisCache: make(map[string]*analysisCacheEntry), + inProgress: make(map[string]*discoveryInProgress), + } +} + +// SetAIAnalyzer sets the AI analyzer for discovery. +func (s *Service) SetAIAnalyzer(analyzer AIAnalyzer) { + s.mu.Lock() + defer s.mu.Unlock() + s.aiAnalyzer = analyzer +} + +// Start begins the background discovery service. 
func (s *Service) Start(ctx context.Context) {
	s.mu.Lock()
	// Idempotent: a second Start while running is a no-op.
	if s.running {
		s.mu.Unlock()
		return
	}
	s.running = true
	// Fresh stop channel so Start works again after a previous Stop
	// (Stop closes the old channel).
	s.stopCh = make(chan struct{})
	s.mu.Unlock()

	log.Info().
		// NOTE(review): s.interval is read here without the lock; a
		// concurrent SetInterval could race on this log value. Harmless for
		// logging, but worth confirming.
		Dur("interval", s.interval).
		Msg("Starting infrastructure discovery service")

	go s.discoveryLoop(ctx)
}

// Stop stops the background discovery service.
// Safe to call when not running; closing stopCh signals the loop to exit.
func (s *Service) Stop() {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.running {
		close(s.stopCh)
		s.running = false
	}
}

// SetInterval updates the scan interval. Takes effect immediately if running.
func (s *Service) SetInterval(interval time.Duration) {
	s.mu.Lock()
	s.interval = interval
	running := s.running
	s.mu.Unlock()

	// If running, send the new interval to the loop (non-blocking).
	if running {
		select {
		case s.intervalCh <- interval:
			log.Info().Dur("interval", interval).Msg("Discovery interval updated (live)")
		default:
			// Channel full (capacity 1): an earlier update is still pending,
			// so the loop will pick up a value eventually.
			log.Debug().Dur("interval", interval).Msg("Discovery interval updated (pending)")
		}
	}
}

// needsDeepScan determines if a discovery result needs a deep scan based on quality.
// Returns true if the discovery is incomplete or low-confidence.
func (s *Service) needsDeepScan(discovery *ResourceDiscovery) bool {
	if discovery == nil {
		return true // No discovery at all
	}

	// Already has deep scan data (raw command outputs) — never rescan.
	if len(discovery.RawCommandOutput) > 0 {
		return false
	}

	// Low confidence — needs more investigation.
	if discovery.Confidence < 0.7 {
		return true
	}

	// Unknown service type.
	if discovery.ServiceType == "" || discovery.ServiceType == "unknown" {
		return true
	}

	// No facts and no config/log paths: deep scan could fill these in.
	if len(discovery.Facts) == 0 && len(discovery.ConfigPaths) == 0 && len(discovery.LogPaths) == 0 {
		return true
	}

	return false
}

// SetWSHub sets the WebSocket hub for broadcasting progress updates.
+func (s *Service) SetWSHub(hub WSBroadcaster) { + s.mu.Lock() + s.wsHub = hub + s.mu.Unlock() + + // Wire up the scanner's progress callback to broadcast via WebSocket + if s.scanner != nil { + s.scanner.SetProgressCallback(s.broadcastProgress) + } + + log.Info().Msg("WebSocket hub connected to discovery service") +} + +// broadcastProgress broadcasts discovery progress to all WebSocket clients. +func (s *Service) broadcastProgress(progress *DiscoveryProgress) { + s.mu.RLock() + hub := s.wsHub + s.mu.RUnlock() + + if hub == nil || progress == nil { + return + } + + hub.BroadcastDiscoveryProgress(progress) +} + +// IsRunning returns whether the background discovery loop is active. +func (s *Service) IsRunning() bool { + s.mu.RLock() + defer s.mu.RUnlock() + return s.running +} + +// discoveryLoop runs periodic fingerprint collection (NOT actual discovery). +// This is the new fingerprint-based approach: background loop only collects fingerprints +// to detect changes. Discovery only runs on-demand when data is actually needed. 
func (s *Service) discoveryLoop(ctx context.Context) {
	// Guard against a zero/negative configured delay.
	delay := s.initialDelay
	if delay <= 0 {
		delay = 30 * time.Second
	}

	// Run initial fingerprint collection after a short delay, unless the
	// service is stopped or the context is cancelled first.
	select {
	case <-time.After(delay):
	case <-s.stopCh:
		return
	case <-ctx.Done():
		return
	}

	s.collectFingerprints(ctx)

	// Snapshot the interval under the lock; live updates arrive via intervalCh.
	s.mu.RLock()
	currentInterval := s.interval
	s.mu.RUnlock()

	ticker := time.NewTicker(currentInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			s.collectFingerprints(ctx)
		case newInterval := <-s.intervalCh:
			// Interval changed - replace the ticker so the new period takes
			// effect immediately (the deferred Stop handles the final ticker).
			ticker.Stop()
			ticker = time.NewTicker(newInterval)
			log.Info().Dur("interval", newInterval).Msg("Fingerprint collection interval reset")
		case <-s.stopCh:
			log.Info().Msg("Stopping discovery service")
			return
		case <-ctx.Done():
			log.Info().Msg("Discovery context cancelled")
			return
		}
	}
}

// collectFingerprints collects fingerprints from all resources (Docker, LXC, VM).
// This is FREE (no AI calls) - it just hashes metadata to detect changes.
func (s *Service) collectFingerprints(ctx context.Context) {
	// A panic here would otherwise kill the background discovery loop, so
	// recover and log instead of crashing the goroutine.
	defer func() {
		if r := recover(); r != nil {
			log.Error().Interface("panic", r).Stack().Msg("Recovered from panic in fingerprint collection")
		}
	}()

	s.mu.Lock()
	s.lastRun = time.Now()
	s.mu.Unlock()

	if s.stateProvider == nil {
		return
	}

	state := s.stateProvider.GetState()
	changedCount := 0
	newCount := 0

	// Process Docker containers
	for _, host := range state.DockerHosts {
		for _, container := range host.Containers {
			// Bail out promptly on shutdown; this is a non-blocking check.
			select {
			case <-ctx.Done():
				return
			default:
			}

			// Generate new fingerprint (prefixed with docker: to avoid collisions)
			newFP := GenerateDockerFingerprint(host.AgentID, &container)
			// NOTE(review): this key format must stay in sync with
			// cleanupOrphanedData below, which rebuilds the key from
			// host.AgentID + container.Name — presumably
			// GenerateDockerFingerprint sets ResourceID to the container
			// name; confirm.
			fpKey := "docker:" + host.AgentID + ":" + newFP.ResourceID

			// Get previous fingerprint
			oldFP, _ := s.store.GetFingerprint(fpKey)

			// Update the fingerprint's ResourceID to include prefix for storage
			newFP.ResourceID = fpKey

			// Save new fingerprint
			if err := s.store.SaveFingerprint(newFP); err != nil {
				log.Warn().Err(err).Str("container", container.Name).Msg("Failed to save Docker fingerprint")
				continue
			}

			// Check if this is new or changed
			if oldFP == nil {
				newCount++
				log.Debug().
					Str("type", "docker").
					Str("container", container.Name).
					Str("hash", newFP.Hash).
					Msg("New fingerprint captured")
			} else if newFP.HasSchemaChanged(oldFP) {
				// Schema changed - don't count as "changed" to avoid mass rediscovery
				log.Debug().
					Str("type", "docker").
					Str("container", container.Name).
					Int("old_schema", oldFP.SchemaVersion).
					Int("new_schema", newFP.SchemaVersion).
					Msg("Fingerprint schema updated")
			} else if oldFP.Hash != newFP.Hash {
				changedCount++
				log.Info().
					Str("type", "docker").
					Str("container", container.Name).
					Str("old_hash", oldFP.Hash).
					Str("new_hash", newFP.Hash).
					Msg("Fingerprint changed - discovery will run on next request")
			}
		}
	}

	// Process LXC containers
	for _, lxc := range state.Containers {
		select {
		case <-ctx.Done():
			return
		default:
		}

		// Generate new fingerprint
		newFP := GenerateLXCFingerprint(lxc.Node, &lxc)
		// Key rebuilt by cleanupOrphanedData as lxc:<node>:<vmid>.
		fpKey := "lxc:" + lxc.Node + ":" + newFP.ResourceID

		// Get previous fingerprint
		oldFP, _ := s.store.GetFingerprint(fpKey)

		// Update the fingerprint's ResourceID to include prefix for storage
		newFP.ResourceID = fpKey

		// Save new fingerprint
		if err := s.store.SaveFingerprint(newFP); err != nil {
			log.Warn().Err(err).Str("lxc", lxc.Name).Msg("Failed to save LXC fingerprint")
			continue
		}

		// Check if this is new or changed
		if oldFP == nil {
			newCount++
			log.Debug().
				Str("type", "lxc").
				Str("name", lxc.Name).
				Int("vmid", lxc.VMID).
				Str("hash", newFP.Hash).
				Msg("New fingerprint captured")
		} else if newFP.HasSchemaChanged(oldFP) {
			log.Debug().
				Str("type", "lxc").
				Str("name", lxc.Name).
				Int("vmid", lxc.VMID).
				Int("old_schema", oldFP.SchemaVersion).
				Int("new_schema", newFP.SchemaVersion).
				Msg("Fingerprint schema updated")
		} else if oldFP.Hash != newFP.Hash {
			changedCount++
			log.Info().
				Str("type", "lxc").
				Str("name", lxc.Name).
				Int("vmid", lxc.VMID).
				Str("old_hash", oldFP.Hash).
				Str("new_hash", newFP.Hash).
				Msg("Fingerprint changed - discovery will run on next request")
		}
	}

	// Process VMs
	for _, vm := range state.VMs {
		select {
		case <-ctx.Done():
			return
		default:
		}

		// Generate new fingerprint
		newFP := GenerateVMFingerprint(vm.Node, &vm)
		// Key rebuilt by cleanupOrphanedData as vm:<node>:<vmid>.
		fpKey := "vm:" + vm.Node + ":" + newFP.ResourceID

		// Get previous fingerprint
		oldFP, _ := s.store.GetFingerprint(fpKey)

		// Update the fingerprint's ResourceID to include prefix for storage
		newFP.ResourceID = fpKey

		// Save new fingerprint
		if err := s.store.SaveFingerprint(newFP); err != nil {
			log.Warn().Err(err).Str("vm", vm.Name).Msg("Failed to save VM fingerprint")
			continue
		}

		// Check if this is new or changed
		if oldFP == nil {
			newCount++
			log.Debug().
				Str("type", "vm").
				Str("name", vm.Name).
				Int("vmid", vm.VMID).
				Str("hash", newFP.Hash).
				Msg("New fingerprint captured")
		} else if newFP.HasSchemaChanged(oldFP) {
			log.Debug().
				Str("type", "vm").
				Str("name", vm.Name).
				Int("vmid", vm.VMID).
				Int("old_schema", oldFP.SchemaVersion).
				Int("new_schema", newFP.SchemaVersion).
				Msg("Fingerprint schema updated")
		} else if oldFP.Hash != newFP.Hash {
			changedCount++
			log.Info().
				Str("type", "vm").
				Str("name", vm.Name).
				Int("vmid", vm.VMID).
				Str("old_hash", oldFP.Hash).
				Str("new_hash", newFP.Hash).
				Msg("Fingerprint changed - discovery will run on next request")
		}
	}

	// Process Kubernetes pods
	for _, cluster := range state.KubernetesClusters {
		for _, pod := range cluster.Pods {
			select {
			case <-ctx.Done():
				return
			default:
			}

			// Generate new fingerprint
			newFP := GenerateK8sPodFingerprint(cluster.ID, &pod)
			// Unlike the other resource types, the k8s key is built from
			// namespace/name directly rather than newFP.ResourceID.
			fpKey := "k8s:" + cluster.ID + ":" + pod.Namespace + "/" + pod.Name

			// Get previous fingerprint
			oldFP, _ := s.store.GetFingerprint(fpKey)

			// Update the fingerprint's ResourceID to include prefix for storage
			newFP.ResourceID = fpKey

			// Save new fingerprint
			if err := s.store.SaveFingerprint(newFP); err != nil {
				log.Warn().Err(err).Str("pod", pod.Name).Str("namespace", pod.Namespace).Msg("Failed to save K8s pod fingerprint")
				continue
			}

			// Check if this is new or changed
			if oldFP == nil {
				newCount++
				log.Debug().
					Str("type", "k8s").
					Str("name", pod.Name).
					Str("namespace", pod.Namespace).
					Str("cluster", cluster.Name).
					Str("hash", newFP.Hash).
					Msg("New fingerprint captured")
			} else if newFP.HasSchemaChanged(oldFP) {
				log.Debug().
					Str("type", "k8s").
					Str("name", pod.Name).
					Str("namespace", pod.Namespace).
					Str("cluster", cluster.Name).
					Int("old_schema", oldFP.SchemaVersion).
					Int("new_schema", newFP.SchemaVersion).
					Msg("Fingerprint schema updated")
			} else if oldFP.Hash != newFP.Hash {
				changedCount++
				log.Info().
					Str("type", "k8s").
					Str("name", pod.Name).
					Str("namespace", pod.Namespace).
					Str("cluster", cluster.Name).
					Str("old_hash", oldFP.Hash).
					Str("new_hash", newFP.Hash).
					Msg("Fingerprint changed - discovery will run on next request")
			}
		}
	}

	// Update last scan time
	s.store.SetLastFingerprintScan(time.Now())

	if newCount > 0 || changedCount > 0 {
		log.Info().
			Int("new", newCount).
			Int("changed", changedCount).
			Int("total", s.store.GetFingerprintCount()).
			Msg("Fingerprint collection complete")
	} else {
		log.Debug().
			Int("total", s.store.GetFingerprintCount()).
			Msg("Fingerprint collection complete - no changes")
	}

	// Cleanup orphaned data (fingerprints/discoveries for removed resources)
	s.cleanupOrphanedData(state)
}

// cleanupOrphanedData removes fingerprints and discoveries for resources that no longer exist.
// The keys built here mirror the fpKey formats used in collectFingerprints above;
// keep the two in sync or valid fingerprints will be garbage-collected.
func (s *Service) cleanupOrphanedData(state StateSnapshot) {
	// Safety check: Don't cleanup if state appears empty
	// This prevents catastrophic deletion if state provider has an error
	totalResources := len(state.Containers) + len(state.VMs) + len(state.KubernetesClusters)
	for _, host := range state.DockerHosts {
		totalResources += len(host.Containers)
	}
	if totalResources == 0 {
		log.Debug().Msg("Skipping orphaned data cleanup - state is empty (may be an error)")
		return
	}

	// Build set of current resource IDs
	currentIDs := make(map[string]bool)

	// Docker containers
	for _, host := range state.DockerHosts {
		for _, container := range host.Containers {
			fpKey := "docker:" + host.AgentID + ":" + container.Name
			currentIDs[fpKey] = true
		}
	}

	// LXC containers
	for _, lxc := range state.Containers {
		fpKey := "lxc:" + lxc.Node + ":" + strconv.Itoa(lxc.VMID)
		currentIDs[fpKey] = true
	}

	// VMs
	for _, vm := range state.VMs {
		fpKey := "vm:" + vm.Node + ":" + strconv.Itoa(vm.VMID)
		currentIDs[fpKey] = true
	}

	// Kubernetes pods
	for _, cluster := range state.KubernetesClusters {
		for _, pod := range cluster.Pods {
			fpKey := "k8s:" + cluster.ID + ":" + pod.Namespace + "/" + pod.Name
			currentIDs[fpKey] = true
		}
	}

	// Run cleanup
	fpRemoved := s.store.CleanupOrphanedFingerprints(currentIDs)
	discRemoved := s.store.CleanupOrphanedDiscoveries(currentIDs)

	if fpRemoved > 0 || discRemoved > 0 {
		log.Info().
			Int("fingerprints_removed", fpRemoved).
			Int("discoveries_removed", discRemoved).
			Msg("Cleaned up orphaned data")
	}
}

// discoverDockerContainers runs discovery on Docker containers using metadata.
// Automatically runs deep scans when the shallow scan results are incomplete or low-confidence.
func (s *Service) discoverDockerContainers(ctx context.Context, hosts []DockerHost) {
	s.mu.RLock()
	analyzer := s.aiAnalyzer
	s.mu.RUnlock()

	if analyzer == nil {
		log.Debug().Msg("AI analyzer not set, skipping Docker discovery")
		return
	}

	for _, host := range hosts {
		for _, container := range host.Containers {
			select {
			case <-ctx.Done():
				return
			default:
			}

			// Build resource ID
			id := MakeResourceID(ResourceTypeDocker, host.AgentID, container.Name)

			// Check if we already have a recent discovery
			// NOTE(review): s.cacheExpiry is read here without any lock —
			// confirm which mutex (if any) guards it.
			if !s.store.NeedsRefresh(id, s.cacheExpiry) {
				continue
			}

			// Check existing discovery to see if it needs a deep scan
			existing, _ := s.store.Get(id)

			// Analyze using metadata (shallow discovery)
			discovery := s.analyzeDockerContainer(ctx, analyzer, container, host)
			if discovery != nil {
				// Smart auto deep scan: enhance if discovery is incomplete or low-confidence
				// Also deep scan if there's no existing discovery (first time)
				if s.scanner != nil && (existing == nil || s.needsDeepScan(discovery)) {
					log.Info().
						Str("id", id).
						Float64("confidence", discovery.Confidence).
						Str("serviceType", discovery.ServiceType).
						Bool("firstDiscovery", existing == nil).
						Msg("Auto deep scan triggered due to incomplete discovery")
					discovery = s.enhanceWithDeepScan(ctx, discovery, host)
				}

				if err := s.store.Save(discovery); err != nil {
					log.Warn().Err(err).Str("id", id).Msg("Failed to save discovery")
				}
			}
		}
	}
}

// enhanceWithDeepScan runs a deep scan and merges the results into the discovery.
func (s *Service) enhanceWithDeepScan(ctx context.Context, discovery *ResourceDiscovery, host DockerHost) *ResourceDiscovery {
	// NOTE(review): the host parameter is not referenced anywhere in this
	// body — confirm whether it can be dropped or is kept for future use.
	// The discovery argument is mutated in place and also returned; on any
	// failure the original discovery is returned unchanged (best-effort).
	s.mu.RLock()
	timeout := s.deepScanTimeout
	analyzer := s.aiAnalyzer
	s.mu.RUnlock()

	if s.scanner == nil || analyzer == nil {
		return discovery
	}

	// Create a timeout context for the deep scan
	scanCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	req := DiscoveryRequest{
		ResourceType: discovery.ResourceType,
		ResourceID:   discovery.ResourceID,
		HostID:       discovery.HostID,
		Hostname:     discovery.Hostname,
	}

	scanResult, err := s.scanner.Scan(scanCtx, req)
	if err != nil {
		log.Debug().Err(err).Str("id", discovery.ID).Msg("Deep scan failed during background discovery")
		return discovery
	}

	// Nothing to analyze if the scan produced no command output.
	if len(scanResult.CommandOutputs) == 0 {
		return discovery
	}

	// Build analysis request with command outputs
	analysisReq := AIAnalysisRequest{
		ResourceType:   discovery.ResourceType,
		ResourceID:     discovery.ResourceID,
		HostID:         discovery.HostID,
		Hostname:       discovery.Hostname,
		CommandOutputs: scanResult.CommandOutputs,
	}

	// Add metadata if available
	if s.stateProvider != nil {
		analysisReq.Metadata = s.getResourceMetadata(req)
	}

	// Build prompt and analyze (shares the deep-scan timeout context)
	prompt := s.buildDeepAnalysisPrompt(analysisReq)
	response, err := analyzer.AnalyzeForDiscovery(scanCtx, prompt)
	if err != nil {
		log.Debug().Err(err).Str("id", discovery.ID).Msg("Deep analysis failed during background discovery")
		return discovery
	}

	result := s.parseAIResponse(response)
	if result == nil {
		return discovery
	}

	// Merge results - deep scan results take precedence for non-empty fields.
	// Note that slice fields (Facts, ConfigPaths, ...) are replaced wholesale,
	// not appended to, when the deep scan returned any entries.
	if result.ServiceType != "" && result.ServiceType != "unknown" {
		discovery.ServiceType = result.ServiceType
	}
	if result.ServiceName != "" {
		discovery.ServiceName = result.ServiceName
	}
	if result.ServiceVersion != "" {
		discovery.ServiceVersion = result.ServiceVersion
	}
	if result.Category != "" && result.Category != CategoryUnknown {
		discovery.Category = result.Category
	}
	if result.CLIAccess != "" {
		discovery.CLIAccess = s.formatCLIAccess(discovery.ResourceType, discovery.ResourceID, result.CLIAccess)
	}
	if len(result.Facts) > 0 {
		discovery.Facts = result.Facts
	}
	if len(result.ConfigPaths) > 0 {
		discovery.ConfigPaths = result.ConfigPaths
	}
	if len(result.DataPaths) > 0 {
		discovery.DataPaths = result.DataPaths
	}
	if len(result.LogPaths) > 0 {
		discovery.LogPaths = result.LogPaths
	}
	if len(result.Ports) > 0 {
		discovery.Ports = result.Ports
	}
	// Confidence only ever increases when merging a deep scan.
	if result.Confidence > discovery.Confidence {
		discovery.Confidence = result.Confidence
	}
	if result.Reasoning != "" {
		discovery.AIReasoning = result.Reasoning
	}

	// Store raw command outputs
	discovery.RawCommandOutput = scanResult.CommandOutputs
	discovery.ScanDuration = scanResult.CompletedAt.Sub(scanResult.StartedAt).Milliseconds()
	discovery.UpdatedAt = time.Now()

	// Parse docker_mounts if present (for LXCs/VMs running Docker)
	if dockerMountsOutput, ok := scanResult.CommandOutputs["docker_mounts"]; ok {
		discovery.DockerMounts = parseDockerMounts(dockerMountsOutput)
		if len(discovery.DockerMounts) > 0 {
			log.Debug().
				Str("id", discovery.ID).
				Int("mountCount", len(discovery.DockerMounts)).
				Msg("Parsed Docker bind mounts from discovery")
		}
	}

	log.Info().
		Str("id", discovery.ID).
		Int("commandOutputs", len(scanResult.CommandOutputs)).
		Int("dockerMounts", len(discovery.DockerMounts)).
		Dur("scanDuration", scanResult.CompletedAt.Sub(scanResult.StartedAt)).
		Msg("Enhanced discovery with deep scan")

	return discovery
}

// analyzeDockerContainer analyzes a Docker container using AI.
+func (s *Service) analyzeDockerContainer(ctx context.Context, analyzer AIAnalyzer, c DockerContainer, host DockerHost) *ResourceDiscovery { + // Check cache first (per-image timestamp) + s.cacheMu.RLock() + entry, found := s.analysisCache[c.Image] + cacheValid := found && time.Since(entry.cachedAt) < s.cacheExpiry + s.cacheMu.RUnlock() + + var result *AIAnalysisResponse + + if cacheValid { + result = entry.result + } else { + // Build prompt for AI analysis + prompt := s.buildMetadataAnalysisPrompt(c, host) + + response, err := analyzer.AnalyzeForDiscovery(ctx, prompt) + if err != nil { + log.Warn().Err(err).Str("container", c.Name).Msg("AI analysis failed") + return nil + } + + result = s.parseAIResponse(response) + if result == nil { + log.Warn().Str("container", c.Name).Msg("Failed to parse AI response") + return nil + } + + // Cache the result with its own timestamp + s.cacheMu.Lock() + s.analysisCache[c.Image] = &analysisCacheEntry{ + result: result, + cachedAt: time.Now(), + } + s.cacheMu.Unlock() + } + + // Skip unknown/low-confidence results + if result.ServiceType == "unknown" || result.Confidence < 0.5 { + return nil + } + + // Build CLI access string + cliAccess := result.CLIAccess + if cliAccess != "" { + cliAccess = strings.ReplaceAll(cliAccess, "{container}", c.Name) + } + + // Extract ports + var ports []PortInfo + for _, p := range c.Ports { + ports = append(ports, PortInfo{ + Port: p.PrivatePort, + Protocol: p.Protocol, + Address: fmt.Sprintf(":%d", p.PublicPort), + }) + } + + return &ResourceDiscovery{ + ID: MakeResourceID(ResourceTypeDocker, host.AgentID, c.Name), + ResourceType: ResourceTypeDocker, + ResourceID: c.Name, + HostID: host.AgentID, + Hostname: host.Hostname, + ServiceType: result.ServiceType, + ServiceName: result.ServiceName, + ServiceVersion: result.ServiceVersion, + Category: result.Category, + CLIAccess: cliAccess, + Facts: result.Facts, + ConfigPaths: result.ConfigPaths, + DataPaths: result.DataPaths, + LogPaths: 
result.LogPaths, + Ports: ports, + Confidence: result.Confidence, + AIReasoning: result.Reasoning, + DiscoveredAt: time.Now(), + UpdatedAt: time.Now(), + } +} + +// DiscoverResource performs deep discovery on a specific resource. +// Uses fingerprint-based detection to avoid unnecessary AI calls: +// - Returns cached discovery if fingerprint hasn't changed +// - Runs discovery only when fingerprint changed or discovery is too old +// - Prevents duplicate concurrent discoveries for the same resource +func (s *Service) DiscoverResource(ctx context.Context, req DiscoveryRequest) (*ResourceDiscovery, error) { + resourceID := MakeResourceID(req.ResourceType, req.HostID, req.ResourceID) + + // Get current fingerprint (if available) + // Fingerprint key matches the resource ID format: type:host:id + currentFP, _ := s.store.GetFingerprint(resourceID) + + // Get existing discovery + existing, _ := s.store.Get(resourceID) + + // Determine if we need to run discovery + needsDiscovery := false + reason := "" + + if req.Force { + needsDiscovery = true + reason = "forced" + } else if existing == nil { + needsDiscovery = true + reason = "no existing discovery" + } else if currentFP != nil && existing.Fingerprint != currentFP.Hash { + // Fingerprint hash differs - check if it's just a schema version change + if existing.FingerprintSchemaVersion != 0 && existing.FingerprintSchemaVersion != currentFP.SchemaVersion { + // Schema changed but container didn't - don't trigger rediscovery + // This prevents mass rediscovery when we upgrade the fingerprint algorithm + log.Debug(). + Str("id", resourceID). + Int("old_schema", existing.FingerprintSchemaVersion). + Int("new_schema", currentFP.SchemaVersion). 
+ Msg("Fingerprint schema changed, but not triggering rediscovery") + } else { + // Same schema version, different hash = real container change + needsDiscovery = true + reason = "fingerprint changed" + } + } else if time.Since(existing.DiscoveredAt) > s.maxDiscoveryAge { + needsDiscovery = true + reason = "discovery too old" + } + + // Return cached discovery if still valid + if !needsDiscovery && existing != nil { + log.Debug().Str("id", resourceID).Msg("Discovery still valid, returning cached") + return existing, nil + } + + // Check for duplicate concurrent discovery requests + s.inProgressMu.Lock() + if inProg, ok := s.inProgress[resourceID]; ok { + // Discovery already in progress - wait for it + s.inProgressMu.Unlock() + log.Debug().Str("id", resourceID).Msg("Discovery already in progress, waiting for result") + + select { + case <-inProg.done: + return inProg.result, inProg.err + case <-ctx.Done(): + return nil, ctx.Err() + } + } + + // Claim this discovery slot + inProg := &discoveryInProgress{ + done: make(chan struct{}), + } + s.inProgress[resourceID] = inProg + s.inProgressMu.Unlock() + + // Ensure we clean up and notify waiters when done + defer func() { + close(inProg.done) + s.inProgressMu.Lock() + delete(s.inProgress, resourceID) + s.inProgressMu.Unlock() + }() + + log.Info().Str("id", resourceID).Str("reason", reason).Msg("Running discovery") + + s.mu.RLock() + analyzer := s.aiAnalyzer + s.mu.RUnlock() + + if analyzer == nil { + inProg.err = fmt.Errorf("AI analyzer not configured") + return nil, inProg.err + } + + // Run deep scan if scanner is available + var scanResult *ScanResult + if s.scanner != nil { + var err error + scanResult, err = s.scanner.Scan(ctx, req) + if err != nil { + log.Warn().Err(err).Str("id", resourceID).Msg("Deep scan failed, using metadata only") + } + } + + // Build analysis request + analysisReq := AIAnalysisRequest{ + ResourceType: req.ResourceType, + ResourceID: req.ResourceID, + HostID: req.HostID, + Hostname: 
req.Hostname, + } + + if scanResult != nil { + analysisReq.CommandOutputs = scanResult.CommandOutputs + } + + // Add metadata if available + if s.stateProvider != nil { + analysisReq.Metadata = s.getResourceMetadata(req) + } + + // Build prompt and analyze + prompt := s.buildDeepAnalysisPrompt(analysisReq) + response, err := analyzer.AnalyzeForDiscovery(ctx, prompt) + if err != nil { + inProg.err = fmt.Errorf("AI analysis failed: %w", err) + return nil, inProg.err + } + + result := s.parseAIResponse(response) + if result == nil { + // Truncate response for error message + truncated := response + if len(truncated) > 500 { + truncated = truncated[:500] + "..." + } + inProg.err = fmt.Errorf("failed to parse AI response: %s", truncated) + return nil, inProg.err + } + + // Resolve hostname from metadata if not provided in request + hostname := req.Hostname + if hostname == "" && analysisReq.Metadata != nil { + if name, ok := analysisReq.Metadata["name"].(string); ok && name != "" { + hostname = name + } + } + + // Build discovery result + discovery := &ResourceDiscovery{ + ID: resourceID, + ResourceType: req.ResourceType, + ResourceID: req.ResourceID, + HostID: req.HostID, + Hostname: hostname, + ServiceType: result.ServiceType, + ServiceName: result.ServiceName, + ServiceVersion: result.ServiceVersion, + Category: result.Category, + CLIAccess: s.formatCLIAccess(req.ResourceType, req.ResourceID, result.CLIAccess), + CLIAccessVersion: CLIAccessVersion, + Facts: result.Facts, + ConfigPaths: result.ConfigPaths, + DataPaths: result.DataPaths, + LogPaths: result.LogPaths, + Ports: result.Ports, + Confidence: result.Confidence, + AIReasoning: result.Reasoning, + DiscoveredAt: time.Now(), + UpdatedAt: time.Now(), + } + + // Store fingerprint with discovery + if currentFP != nil { + discovery.Fingerprint = currentFP.Hash + discovery.FingerprintedAt = currentFP.GeneratedAt + discovery.FingerprintSchemaVersion = currentFP.SchemaVersion + } + + if scanResult != nil { + 
discovery.RawCommandOutput = scanResult.CommandOutputs + discovery.ScanDuration = scanResult.CompletedAt.Sub(scanResult.StartedAt).Milliseconds() + + // Parse docker_mounts if present (for LXCs/VMs running Docker) + if dockerMountsOutput, ok := scanResult.CommandOutputs["docker_mounts"]; ok { + discovery.DockerMounts = parseDockerMounts(dockerMountsOutput) + if len(discovery.DockerMounts) > 0 { + log.Debug(). + Str("id", discovery.ID). + Int("mountCount", len(discovery.DockerMounts)). + Msg("Parsed Docker bind mounts from on-demand discovery") + } + } + } + + // Preserve user notes from existing discovery + if existing != nil { + discovery.UserNotes = existing.UserNotes + discovery.UserSecrets = existing.UserSecrets + if discovery.DiscoveredAt.IsZero() || existing.DiscoveredAt.Before(discovery.DiscoveredAt) { + discovery.DiscoveredAt = existing.DiscoveredAt + } + } + + // Save discovery + if err := s.store.Save(discovery); err != nil { + inProg.err = fmt.Errorf("failed to save discovery: %w", err) + return nil, inProg.err + } + + // Store result for any waiting goroutines + inProg.result = discovery + return discovery, nil +} + +// getResourceMetadata retrieves metadata for a resource from the state. 
+func (s *Service) getResourceMetadata(req DiscoveryRequest) map[string]any { + if s.stateProvider == nil { + return nil + } + + state := s.stateProvider.GetState() + metadata := make(map[string]any) + + switch req.ResourceType { + case ResourceTypeLXC: + for _, c := range state.Containers { + if fmt.Sprintf("%d", c.VMID) == req.ResourceID && c.Node == req.HostID { + metadata["name"] = c.Name + metadata["status"] = c.Status + metadata["vmid"] = c.VMID + break + } + } + case ResourceTypeVM: + for _, vm := range state.VMs { + if fmt.Sprintf("%d", vm.VMID) == req.ResourceID && vm.Node == req.HostID { + metadata["name"] = vm.Name + metadata["status"] = vm.Status + metadata["vmid"] = vm.VMID + break + } + } + case ResourceTypeDocker: + for _, host := range state.DockerHosts { + if host.AgentID == req.HostID || host.Hostname == req.HostID { + for _, c := range host.Containers { + if c.Name == req.ResourceID { + metadata["image"] = c.Image + metadata["status"] = c.Status + metadata["labels"] = c.Labels + break + } + } + break + } + } + } + + return metadata +} + +// formatCLIAccess formats the CLI access string with actual values. +func (s *Service) formatCLIAccess(resourceType ResourceType, resourceID, cliTemplate string) string { + if cliTemplate == "" { + // Use default template + cliTemplate = GetCLIAccessTemplate(resourceType) + } + + result := cliTemplate + result = strings.ReplaceAll(result, "{vmid}", resourceID) + result = strings.ReplaceAll(result, "{container}", resourceID) + result = strings.ReplaceAll(result, "{command}", "...") + + return result +} + +// buildMetadataAnalysisPrompt builds a prompt for shallow metadata-based analysis. 
func (s *Service) buildMetadataAnalysisPrompt(c DockerContainer, host DockerHost) string {
	// Assemble a JSON summary of the container's metadata; only non-empty
	// sections (ports, labels, mounts) are included.
	info := map[string]any{
		"name":   c.Name,
		"image":  c.Image,
		"status": c.Status,
		"host":   host.Hostname,
	}

	if len(c.Ports) > 0 {
		var ports []map[string]any
		for _, p := range c.Ports {
			ports = append(ports, map[string]any{
				"public":   p.PublicPort,
				"private":  p.PrivatePort,
				"protocol": p.Protocol,
			})
		}
		info["ports"] = ports
	}

	if len(c.Labels) > 0 {
		info["labels"] = c.Labels
	}

	if len(c.Mounts) > 0 {
		// Only the in-container destinations are sent, not host paths.
		var mounts []string
		for _, m := range c.Mounts {
			mounts = append(mounts, m.Destination)
		}
		info["mounts"] = mounts
	}

	// Marshal error is intentionally ignored: info is built from plain
	// strings/ints/maps which always marshal.
	infoJSON, _ := json.MarshalIndent(info, "", "  ")

	// The JSON shape requested here must match what parseAIResponse expects
	// to unmarshal into AIAnalysisResponse.
	return fmt.Sprintf(`Analyze this Docker container and identify what service it's running.

Container Information:
%s

Based on the image name, ports, labels, and mounts, determine:
1. What service/application is this?
2. What category does it belong to?
3. How should CLI commands be executed?

Respond in this exact JSON format:
{
  "service_type": "lowercase_type",
  "service_name": "Human Readable Name",
  "service_version": "version if detectable from image tag",
  "category": "database|web_server|cache|monitoring|backup|nvr|storage|container|network|security|media|home_automation|unknown",
  "cli_access": "docker exec {container} ",
  "facts": [],
  "config_paths": [],
  "data_paths": [],
  "log_paths": [],
  "ports": [],
  "confidence": 0.0-1.0,
  "reasoning": "Brief explanation"
}

Respond with ONLY valid JSON.`, string(infoJSON))
}

// buildDeepAnalysisPrompt builds a prompt for deep analysis with command outputs.
func (s *Service) buildDeepAnalysisPrompt(req AIAnalysisRequest) string {
	var sections []string

	sections = append(sections, fmt.Sprintf(`Resource Type: %s
Resource ID: %s
Host: %s (%s)`, req.ResourceType, req.ResourceID, req.Hostname, req.HostID))

	if len(req.Metadata) > 0 {
		metaJSON, _ := json.MarshalIndent(req.Metadata, "", "  ")
		sections = append(sections, fmt.Sprintf("Metadata:\n%s", string(metaJSON)))
	}

	if len(req.CommandOutputs) > 0 {
		sections = append(sections, "Command Outputs:")
		// Map iteration order is random, so section order varies between
		// calls; the prompt content is otherwise deterministic.
		for name, output := range req.CommandOutputs {
			// Truncate long outputs to keep the prompt within budget.
			if len(output) > 2000 {
				output = output[:2000] + "\n... (truncated)"
			}
			sections = append(sections, fmt.Sprintf("--- %s ---\n%s", name, output))
		}
	}

	return fmt.Sprintf(`Analyze this infrastructure resource and provide detailed discovery information.

%s

Based on all available information, determine:
1. What service/application is running?
2. What version is it?
3. What are the important configuration paths?
4. What data paths should be backed up?
5. What log paths are useful for troubleshooting?
6. What ports are in use?
7. Any special hardware (GPU, TPU, etc.)?
8. Any dependencies (databases, message queues, etc.)?

Respond in this exact JSON format:
{
  "service_type": "lowercase_type (e.g., frigate, postgres, pbs)",
  "service_name": "Human Readable Name",
  "service_version": "version number if found",
  "category": "database|web_server|cache|monitoring|backup|nvr|storage|container|virtualizer|network|security|media|home_automation|unknown",
  "cli_access": "command to access this service's CLI",
  "facts": [
    {"category": "version|config|service|port|hardware|network|storage|dependency|security", "key": "fact_name", "value": "fact_value", "source": "command_name", "confidence": 0.9}
  ],
  "config_paths": ["/path/to/config.yml"],
  "data_paths": ["/path/to/data"],
  "log_paths": ["/var/log/service/", "/path/to/app.log"],
  "ports": [{"port": 8080, "protocol": "tcp", "process": "nginx", "address": "0.0.0.0"}],
  "confidence": 0.0-1.0,
  "reasoning": "Explanation of identification"
}

Important:
- Extract version numbers from package lists, process output, or config files
- Identify config and data paths from mount points and file listings
- Identify log paths (e.g., /var/log/, application-specific logs) for troubleshooting
- Note any special hardware like Coral TPU, NVIDIA GPU
- For LXC/VM, the CLI access should use pct exec or qm guest exec
- For Docker, use docker exec

Respond with ONLY valid JSON.`, strings.Join(sections, "\n\n"))
}

// parseAIResponse parses the AI's JSON response.
+func (s *Service) parseAIResponse(response string) *AIAnalysisResponse { + log.Debug().Str("raw_response", response).Msg("Discovery raw response") + response = strings.TrimSpace(response) + + // Handle markdown code blocks + if strings.HasPrefix(response, "```") { + lines := strings.Split(response, "\n") + var jsonLines []string + inBlock := false + for _, line := range lines { + if strings.HasPrefix(line, "```") { + inBlock = !inBlock + continue + } + if inBlock { + jsonLines = append(jsonLines, line) + } + } + response = strings.Join(jsonLines, "\n") + } + + // Find JSON object + start := strings.Index(response, "{") + end := strings.LastIndex(response, "}") + if start >= 0 && end > start { + response = response[start : end+1] + } + + var result AIAnalysisResponse + if err := json.Unmarshal([]byte(response), &result); err != nil { + log.Debug().Err(err).Str("response", response).Msg("Failed to parse AI response") + return nil + } + + // Set discovered_at for facts + now := time.Now() + for i := range result.Facts { + result.Facts[i].DiscoveredAt = now + } + + return &result +} + +// parseDockerMounts parses the docker_mounts command output into a slice of DockerBindMount. 
+// The output format is: +// CONTAINER:container_name +// source|destination|type +// source|destination|type +// CONTAINER:another_container +// source|destination|type +func parseDockerMounts(output string) []DockerBindMount { + if output == "" || output == "no_docker_mounts" { + return nil + } + + var mounts []DockerBindMount + var currentContainer string + + lines := strings.Split(output, "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" { + continue + } + + // Check if this is a container header + if strings.HasPrefix(line, "CONTAINER:") { + currentContainer = strings.TrimPrefix(line, "CONTAINER:") + continue + } + + // Skip if we don't have a current container + if currentContainer == "" { + continue + } + + // Parse mount line: source|destination|type + parts := strings.Split(line, "|") + if len(parts) < 2 { + continue + } + + mount := DockerBindMount{ + ContainerName: currentContainer, + Source: parts[0], + Destination: parts[1], + } + if len(parts) >= 3 { + mount.Type = parts[2] + } + + // Only include bind mounts and volumes (skip tmpfs, etc.) + if mount.Type == "" || mount.Type == "bind" || mount.Type == "volume" { + mounts = append(mounts, mount) + } + } + + return mounts +} + +// GetDiscovery retrieves a discovery by ID. +func (s *Service) GetDiscovery(id string) (*ResourceDiscovery, error) { + d, err := s.store.Get(id) + if err != nil || d == nil { + return d, err + } + s.upgradeCLIAccessIfNeeded(d) + return d, nil +} + +// GetDiscoveryByResource retrieves a discovery by resource type and ID. +func (s *Service) GetDiscoveryByResource(resourceType ResourceType, hostID, resourceID string) (*ResourceDiscovery, error) { + d, err := s.store.GetByResource(resourceType, hostID, resourceID) + if err != nil || d == nil { + return d, err + } + s.upgradeCLIAccessIfNeeded(d) + return d, nil +} + +// ListDiscoveries returns all discoveries. 
+func (s *Service) ListDiscoveries() ([]*ResourceDiscovery, error) { + discoveries, err := s.store.List() + if err != nil { + return nil, err + } + for _, d := range discoveries { + s.upgradeCLIAccessIfNeeded(d) + } + return discoveries, nil +} + +// ListDiscoveriesByType returns discoveries for a specific resource type. +func (s *Service) ListDiscoveriesByType(resourceType ResourceType) ([]*ResourceDiscovery, error) { + discoveries, err := s.store.ListByType(resourceType) + if err != nil { + return nil, err + } + for _, d := range discoveries { + s.upgradeCLIAccessIfNeeded(d) + } + return discoveries, nil +} + +// ListDiscoveriesByHost returns discoveries for a specific host. +func (s *Service) ListDiscoveriesByHost(hostID string) ([]*ResourceDiscovery, error) { + discoveries, err := s.store.ListByHost(hostID) + if err != nil { + return nil, err + } + for _, d := range discoveries { + s.upgradeCLIAccessIfNeeded(d) + } + return discoveries, nil +} + +// upgradeDiscoveryIfNeeded upgrades cached discovery fields to current versions. +// This ensures cached discoveries get the new instructional CLI access format +// and have hostname populated without requiring a full re-discovery. +func (s *Service) upgradeCLIAccessIfNeeded(d *ResourceDiscovery) { + if d == nil { + return + } + + upgraded := false + + // Upgrade CLI access if version is outdated + if d.CLIAccessVersion < CLIAccessVersion { + oldCLI := d.CLIAccess + d.CLIAccess = GetCLIAccessTemplate(d.ResourceType) + d.CLIAccessVersion = CLIAccessVersion + upgraded = true + + log.Debug(). + Str("id", d.ID). + Str("old_cli", oldCLI). + Str("new_cli", d.CLIAccess). + Int("new_version", CLIAccessVersion). 
+ Msg("Upgraded CLI access pattern to new version") + } + + // Fix empty hostname by looking up the resource name from state + if d.Hostname == "" && s.stateProvider != nil { + state := s.stateProvider.GetState() + hostname := s.lookupHostnameFromState(d.ResourceType, d.HostID, d.ResourceID, state) + if hostname != "" { + d.Hostname = hostname + upgraded = true + log.Debug(). + Str("id", d.ID). + Str("hostname", hostname). + Msg("Populated missing hostname from state") + } + } + + _ = upgraded // Suppress unused variable warning if logging is disabled +} + +// lookupHostnameFromState finds the hostname/name for a resource from state +func (s *Service) lookupHostnameFromState(resourceType ResourceType, hostID, resourceID string, state StateSnapshot) string { + switch resourceType { + case ResourceTypeLXC: + for _, c := range state.Containers { + if fmt.Sprintf("%d", c.VMID) == resourceID && c.Node == hostID { + return c.Name + } + } + case ResourceTypeVM: + for _, vm := range state.VMs { + if fmt.Sprintf("%d", vm.VMID) == resourceID && vm.Node == hostID { + return vm.Name + } + } + case ResourceTypeDocker: + for _, host := range state.DockerHosts { + if host.AgentID == hostID || host.Hostname == hostID { + for _, c := range host.Containers { + if c.Name == resourceID { + return host.Hostname + } + } + } + } + } + return "" +} + +// UpdateNotes updates user notes for a discovery. +func (s *Service) UpdateNotes(id string, notes string, secrets map[string]string) error { + return s.store.UpdateNotes(id, notes, secrets) +} + +// DeleteDiscovery deletes a discovery. +func (s *Service) DeleteDiscovery(id string) error { + return s.store.Delete(id) +} + +// GetProgress returns the progress of an ongoing discovery. +func (s *Service) GetProgress(resourceID string) *DiscoveryProgress { + if s.scanner == nil { + return nil + } + return s.scanner.GetProgress(resourceID) +} + +// GetStatus returns the service status including fingerprint statistics. 
+func (s *Service) GetStatus() map[string]any { + s.mu.RLock() + defer s.mu.RUnlock() + + s.cacheMu.RLock() + cacheSize := len(s.analysisCache) + s.cacheMu.RUnlock() + + // Get fingerprint stats + fingerprintCount := 0 + var lastFingerprintScan time.Time + if s.store != nil { + fingerprintCount = s.store.GetFingerprintCount() + lastFingerprintScan = s.store.GetLastFingerprintScan() + } + + return map[string]any{ + "running": s.running, + "last_run": s.lastRun, + "interval": s.interval.String(), + "cache_size": cacheSize, + "ai_analyzer_set": s.aiAnalyzer != nil, + "scanner_set": s.scanner != nil, + "store_set": s.store != nil, + "deep_scan_timeout": s.deepScanTimeout.String(), + "max_discovery_age": s.maxDiscoveryAge.String(), + "fingerprint_count": fingerprintCount, + "last_fingerprint_scan": lastFingerprintScan, + } +} + +// GetMaxDiscoveryAge returns the current max discovery age (staleness threshold). +func (s *Service) GetMaxDiscoveryAge() time.Duration { + s.mu.RLock() + defer s.mu.RUnlock() + return s.maxDiscoveryAge +} + +// SetMaxDiscoveryAge updates the max discovery age (staleness threshold). +// Discoveries older than this duration will be re-run when requested. +func (s *Service) SetMaxDiscoveryAge(age time.Duration) { + s.mu.Lock() + defer s.mu.Unlock() + + // Enforce minimum of 1 day + if age < 24*time.Hour { + age = 24 * time.Hour + } + + s.maxDiscoveryAge = age + log.Info().Dur("max_discovery_age", age).Msg("Max discovery age updated") +} + +// ClearCache clears the AI analysis cache. +func (s *Service) ClearCache() { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + s.analysisCache = make(map[string]*analysisCacheEntry) +} + +// --- AI Chat Integration Methods --- + +// GetDiscoveryForAIChat returns discovery data for AI chat context. +// It will run discovery if needed (fingerprint changed or no data exists). +// This is the just-in-time discovery approach: only call AI when data is actually needed. 
+func (s *Service) GetDiscoveryForAIChat(ctx context.Context, resourceType ResourceType, hostID, resourceID string) (*ResourceDiscovery, error) { + // This is the same as DiscoverResource but without Force + return s.DiscoverResource(ctx, DiscoveryRequest{ + ResourceType: resourceType, + ResourceID: resourceID, + HostID: hostID, + Force: false, // Let fingerprint logic decide + }) +} + +// GetDiscoveriesForAIContext returns discoveries for multiple resources. +// Used when AI chat needs context about the infrastructure. +// Only runs discovery for resources that actually need it (fingerprint changed). +func (s *Service) GetDiscoveriesForAIContext(ctx context.Context, resourceIDs []string) ([]*ResourceDiscovery, error) { + var results []*ResourceDiscovery + for _, id := range resourceIDs { + resourceType, hostID, resourceID, err := ParseResourceID(id) + if err != nil { + log.Debug().Err(err).Str("id", id).Msg("Failed to parse resource ID for AI context") + continue + } + discovery, err := s.GetDiscoveryForAIChat(ctx, resourceType, hostID, resourceID) + if err != nil { + log.Debug().Err(err).Str("id", id).Msg("Failed to get discovery for AI context") + continue + } + if discovery != nil { + results = append(results, discovery) + } + } + return results, nil +} + +// GetChangedResourceCount returns the count of resources whose fingerprint has changed +// since their last discovery. +func (s *Service) GetChangedResourceCount() (int, error) { + if s.store == nil { + return 0, nil + } + changed, err := s.store.GetChangedResources() + if err != nil { + return 0, err + } + return len(changed), nil +} + +// GetStaleResourceCount returns the count of resources whose discovery is older +// than maxDiscoveryAge. 
+func (s *Service) GetStaleResourceCount() (int, error) { + if s.store == nil { + return 0, nil + } + stale, err := s.store.GetStaleResources(s.maxDiscoveryAge) + if err != nil { + return 0, err + } + return len(stale), nil +} diff --git a/internal/servicediscovery/service_test.go b/internal/servicediscovery/service_test.go new file mode 100644 index 000000000..b4fb0af1b --- /dev/null +++ b/internal/servicediscovery/service_test.go @@ -0,0 +1,797 @@ +package servicediscovery + +import ( + "context" + "os" + "path/filepath" + "strings" + "sync" + "testing" + "time" +) + +type stubAnalyzer struct { + mu sync.Mutex + calls int + response string +} + +func (s *stubAnalyzer) AnalyzeForDiscovery(ctx context.Context, prompt string) (string, error) { + s.mu.Lock() + s.calls++ + s.mu.Unlock() + return s.response, nil +} + +type errorAnalyzer struct{} + +func (errorAnalyzer) AnalyzeForDiscovery(ctx context.Context, prompt string) (string, error) { + return "", context.Canceled +} + +type stubStateProvider struct { + state StateSnapshot +} + +func (s stubStateProvider) GetState() StateSnapshot { + return s.state +} + +type panicStateProvider struct{} + +func (panicStateProvider) GetState() StateSnapshot { + panic("boom") +} + +func TestService_parseAIResponse_Markdown(t *testing.T) { + service := &Service{} + response := "```json\n{\n \"service_type\": \"nginx\",\n \"service_name\": \"Nginx\",\n \"service_version\": \"1.2\",\n \"category\": \"web_server\",\n \"cli_access\": \"docker exec {container} bash\",\n \"facts\": [{\"category\": \"version\", \"key\": \"nginx\", \"value\": \"1.2\", \"source\": \"cmd\", \"confidence\": 0.9}],\n \"config_paths\": [\"/etc/nginx/nginx.conf\"],\n \"data_paths\": [\"/var/www\"],\n \"ports\": [{\"port\": 80, \"protocol\": \"tcp\", \"process\": \"nginx\", \"address\": \"0.0.0.0\"}],\n \"confidence\": 0.9,\n \"reasoning\": \"image name\"\n}\n```" + + parsed := service.parseAIResponse(response) + if parsed == nil { + t.Fatalf("expected parsed 
response") + } + if parsed.ServiceType != "nginx" || parsed.ServiceName != "Nginx" { + t.Fatalf("unexpected parsed result: %#v", parsed) + } + if len(parsed.Facts) != 1 || parsed.Facts[0].DiscoveredAt.IsZero() { + t.Fatalf("expected fact timestamp set: %#v", parsed.Facts) + } + + if service.parseAIResponse("not json") != nil { + t.Fatalf("expected nil for invalid json") + } +} + +func TestService_analyzeDockerContainer_CacheAndPorts(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + service := NewService(store, nil, nil, Config{CacheExpiry: time.Hour}) + + analyzer := &stubAnalyzer{ + response: `{"service_type":"nginx","service_name":"Nginx","service_version":"1.2","category":"web_server","cli_access":"docker exec {container} nginx -v","facts":[],"config_paths":[],"data_paths":[],"ports":[],"confidence":0.9,"reasoning":"image"}`, + } + + container := DockerContainer{ + Name: "web", + Image: "nginx:latest", + Status: "running", + Ports: []DockerPort{ + {PublicPort: 8080, PrivatePort: 80, Protocol: "tcp"}, + }, + } + host := DockerHost{ + AgentID: "host1", + Hostname: "host1", + } + + first := service.analyzeDockerContainer(context.Background(), analyzer, container, host) + if first == nil { + t.Fatalf("expected discovery") + } + if !strings.Contains(first.CLIAccess, "web") { + t.Fatalf("expected cli access to include container name, got %s", first.CLIAccess) + } + if len(first.Ports) != 1 || first.Ports[0].Port != 80 || first.Ports[0].Address != ":8080" { + t.Fatalf("unexpected ports: %#v", first.Ports) + } + + second := service.analyzeDockerContainer(context.Background(), analyzer, container, host) + if second == nil { + t.Fatalf("expected cached discovery") + } + + analyzer.mu.Lock() + calls := analyzer.calls + analyzer.mu.Unlock() + if calls != 1 { + t.Fatalf("expected analyzer called once, got %d", calls) + } + + lowAnalyzer := &stubAnalyzer{ + response: 
`{"service_type":"unknown","service_name":"","service_version":"","category":"unknown","cli_access":"","facts":[],"config_paths":[],"data_paths":[],"ports":[],"confidence":0.4,"reasoning":""}`, + } + lowContainer := DockerContainer{Name: "mystery", Image: "unknown:latest"} + if got := service.analyzeDockerContainer(context.Background(), lowAnalyzer, lowContainer, host); got != nil { + t.Fatalf("expected low confidence discovery to be skipped") + } +} + +func TestService_DiscoverResource_RecentAndNoAnalyzer(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + service := NewService(store, nil, nil, DefaultConfig()) + + req := DiscoveryRequest{ + ResourceType: ResourceTypeDocker, + ResourceID: "nginx", + HostID: "host1", + Hostname: "host1", + } + discovery := &ResourceDiscovery{ + ID: MakeResourceID(req.ResourceType, req.HostID, req.ResourceID), + ResourceType: req.ResourceType, + ResourceID: req.ResourceID, + HostID: req.HostID, + Hostname: req.Hostname, + ServiceName: "Existing", + } + if err := store.Save(discovery); err != nil { + t.Fatalf("Save error: %v", err) + } + + found, err := service.DiscoverResource(context.Background(), req) + if err != nil { + t.Fatalf("DiscoverResource error: %v", err) + } + if found == nil || found.ServiceName != "Existing" { + t.Fatalf("unexpected discovery: %#v", found) + } + + _, err = service.DiscoverResource(context.Background(), DiscoveryRequest{ + ResourceType: ResourceTypeVM, + ResourceID: "101", + HostID: "node1", + Hostname: "node1", + Force: true, + }) + if err == nil || !strings.Contains(err.Error(), "AI analyzer") { + t.Fatalf("expected analyzer error, got %v", err) + } + + service.SetAIAnalyzer(errorAnalyzer{}) + _, err = service.DiscoverResource(context.Background(), DiscoveryRequest{ + ResourceType: ResourceTypeVM, + ResourceID: "102", + HostID: "node1", + Hostname: "node1", + Force: true, + }) + if err == nil || 
!strings.Contains(err.Error(), "AI analysis failed") { + t.Fatalf("expected analysis error, got %v", err) + } + + service.SetAIAnalyzer(&stubAnalyzer{response: "not json"}) + _, err = service.DiscoverResource(context.Background(), DiscoveryRequest{ + ResourceType: ResourceTypeVM, + ResourceID: "103", + HostID: "node1", + Hostname: "node1", + Force: true, + }) + if err == nil || !strings.Contains(err.Error(), "failed to parse") { + t.Fatalf("expected parse error, got %v", err) + } +} + +func TestService_getResourceMetadata(t *testing.T) { + state := StateSnapshot{ + VMs: []VM{ + {VMID: 101, Name: "vm1", Node: "node1", Status: "running"}, + }, + Containers: []Container{ + {VMID: 201, Name: "lxc1", Node: "node2", Status: "stopped"}, + }, + DockerHosts: []DockerHost{ + { + AgentID: "agent1", + Hostname: "dock1", + Containers: []DockerContainer{ + {Name: "redis", Image: "redis:latest", Status: "running", Labels: map[string]string{"tier": "cache"}}, + }, + }, + }, + } + + service := NewService(nil, nil, stubStateProvider{state: state}, DefaultConfig()) + + vmMeta := service.getResourceMetadata(DiscoveryRequest{ + ResourceType: ResourceTypeVM, + ResourceID: "101", + HostID: "node1", + }) + if vmMeta["name"] != "vm1" || vmMeta["vmid"] != 101 { + t.Fatalf("unexpected vm metadata: %#v", vmMeta) + } + + lxcMeta := service.getResourceMetadata(DiscoveryRequest{ + ResourceType: ResourceTypeLXC, + ResourceID: "201", + HostID: "node2", + }) + if lxcMeta["name"] != "lxc1" || lxcMeta["status"] != "stopped" { + t.Fatalf("unexpected lxc metadata: %#v", lxcMeta) + } + + dockerMeta := service.getResourceMetadata(DiscoveryRequest{ + ResourceType: ResourceTypeDocker, + ResourceID: "redis", + HostID: "agent1", + }) + if dockerMeta["image"] != "redis:latest" || dockerMeta["status"] != "running" { + t.Fatalf("unexpected docker metadata: %#v", dockerMeta) + } + + dockerByHost := service.getResourceMetadata(DiscoveryRequest{ + ResourceType: ResourceTypeDocker, + ResourceID: "redis", + HostID: 
"dock1", + }) + if dockerByHost["image"] != "redis:latest" { + t.Fatalf("unexpected docker hostname metadata: %#v", dockerByHost) + } +} + +func TestService_formatCLIAccessAndStatus(t *testing.T) { + service := NewService(nil, nil, nil, DefaultConfig()) + formatted := service.formatCLIAccess(ResourceTypeDocker, "redis", "") + // New format is instructional, should mention the container name and pulse_control + if !strings.Contains(formatted, "redis") || !strings.Contains(formatted, "docker exec") { + t.Fatalf("unexpected cli access: %s", formatted) + } + + service.analysisCache = map[string]*analysisCacheEntry{ + "nginx:latest": { + result: &AIAnalysisResponse{ServiceType: "nginx"}, + cachedAt: time.Now(), + }, + } + service.running = true + status := service.GetStatus() + if status["running"] != true || status["cache_size"] != 1 { + t.Fatalf("unexpected status: %#v", status) + } + + service.ClearCache() + if len(service.analysisCache) != 0 { + t.Fatalf("expected cache cleared") + } +} + +func TestService_DefaultsAndSetAnalyzer(t *testing.T) { + service := NewService(nil, nil, nil, Config{}) + if service.interval == 0 || service.cacheExpiry == 0 { + t.Fatalf("expected defaults for interval and cache expiry") + } + + analyzer := &stubAnalyzer{response: `{}`} + service.SetAIAnalyzer(analyzer) + if service.aiAnalyzer == nil { + t.Fatalf("expected analyzer set") + } + if service.GetProgress("missing") != nil { + t.Fatalf("expected nil progress without scanner") + } + if service.getResourceMetadata(DiscoveryRequest{}) != nil { + t.Fatalf("expected nil metadata without state provider") + } +} + +func TestService_FingerprintCollectionAndDiscoveryWrappers(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + state := StateSnapshot{ + DockerHosts: []DockerHost{ + { + AgentID: "host1", + Hostname: "host1", + Containers: []DockerContainer{ + {Name: "web", Image: "nginx:latest", Status: 
"running"}, + }, + }, + }, + } + service := NewService(store, nil, stubStateProvider{state: state}, DefaultConfig()) + service.SetAIAnalyzer(&stubAnalyzer{ + response: `{"service_type":"nginx","service_name":"Nginx","service_version":"1.2","category":"web_server","cli_access":"docker exec {container} nginx -v","facts":[],"config_paths":[],"data_paths":[],"ports":[],"confidence":0.9,"reasoning":"image"}`, + }) + + // First, collect fingerprints (no AI calls) + service.collectFingerprints(context.Background()) + + // Verify fingerprint was collected (key format is type:host:id) + fp, err := store.GetFingerprint("docker:host1:web") + if err != nil { + t.Fatalf("GetFingerprint error: %v", err) + } + if fp == nil { + t.Fatalf("expected fingerprint to be collected") + } + + // Now trigger on-demand discovery (this makes AI call) + id := MakeResourceID(ResourceTypeDocker, "host1", "web") + discovery, err := service.DiscoverResource(context.Background(), DiscoveryRequest{ + ResourceType: ResourceTypeDocker, + ResourceID: "web", + HostID: "host1", + Hostname: "host1", + }) + if err != nil { + t.Fatalf("DiscoverResource error: %v", err) + } + if discovery == nil { + t.Fatalf("expected discovery result") + } + + if got, err := service.GetDiscovery(id); err != nil || got == nil { + t.Fatalf("GetDiscovery error: %v", err) + } + if got, err := service.GetDiscoveryByResource(ResourceTypeDocker, "host1", "web"); err != nil || got == nil { + t.Fatalf("GetDiscoveryByResource error: %v", err) + } + + if list, err := service.ListDiscoveries(); err != nil || len(list) != 1 { + t.Fatalf("ListDiscoveries unexpected: %v len=%d", err, len(list)) + } + if list, err := service.ListDiscoveriesByType(ResourceTypeDocker); err != nil || len(list) != 1 { + t.Fatalf("ListDiscoveriesByType unexpected: %v len=%d", err, len(list)) + } + if list, err := service.ListDiscoveriesByHost("host1"); err != nil || len(list) != 1 { + t.Fatalf("ListDiscoveriesByHost unexpected: %v len=%d", err, len(list)) + } + 
+ if err := service.UpdateNotes(id, "note", map[string]string{"k": "v"}); err != nil { + t.Fatalf("UpdateNotes error: %v", err) + } + updated, err := service.GetDiscovery(id) + if err != nil || updated.UserNotes != "note" { + t.Fatalf("expected updated notes: %#v err=%v", updated, err) + } + + scanner := NewDeepScanner(&stubExecutor{}) + scanner.progress[id] = &DiscoveryProgress{ResourceID: id} + service.scanner = scanner + if service.GetProgress(id) == nil { + t.Fatalf("expected progress") + } + + if err := service.DeleteDiscovery(id); err != nil { + t.Fatalf("DeleteDiscovery error: %v", err) + } + + service.stateProvider = nil + service.collectFingerprints(context.Background()) +} + +func TestService_PromptsAndDiscoveryLoop(t *testing.T) { + service := NewService(nil, nil, nil, DefaultConfig()) + + container := DockerContainer{ + Name: "web", + Image: "nginx:latest", + Status: "running", + Ports: []DockerPort{ + {PublicPort: 8080, PrivatePort: 80, Protocol: "tcp"}, + }, + Labels: map[string]string{"app": "nginx"}, + Mounts: []DockerMount{{Destination: "/etc/nginx"}}, + } + host := DockerHost{Hostname: "host1"} + prompt := service.buildMetadataAnalysisPrompt(container, host) + if !strings.Contains(prompt, "\"ports\"") || !strings.Contains(prompt, "\"labels\"") || !strings.Contains(prompt, "\"mounts\"") { + t.Fatalf("unexpected metadata prompt: %s", prompt) + } + + longOutput := strings.Repeat("a", 2100) + deepPrompt := service.buildDeepAnalysisPrompt(AIAnalysisRequest{ + ResourceType: ResourceTypeDocker, + ResourceID: "web", + HostID: "host1", + Hostname: "host1", + Metadata: map[string]any{"image": "nginx"}, + CommandOutputs: map[string]string{ + "ps": longOutput, + }, + }) + if !strings.Contains(deepPrompt, "(truncated)") || !strings.Contains(deepPrompt, "Metadata:") { + t.Fatalf("unexpected deep prompt") + } + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + service.initialDelay = time.Millisecond + service.Start(ctx) + service.Start(ctx) 
+ service.Stop() + + service.stopCh = make(chan struct{}) + close(service.stopCh) + service.discoveryLoop(context.Background()) + + service.initialDelay = 0 + service.stopCh = make(chan struct{}) + close(service.stopCh) + service.discoveryLoop(context.Background()) +} + +func TestService_FingerprintLoop_StopAndCancel(t *testing.T) { + state := StateSnapshot{ + DockerHosts: []DockerHost{ + { + AgentID: "host1", + Hostname: "host1", + Containers: []DockerContainer{ + {Name: "web", Image: "nginx:latest", Status: "running"}, + }, + }, + }, + } + + runLoop := func(stopWithCancel bool) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + service := NewService(store, nil, stubStateProvider{state: state}, DefaultConfig()) + // Analyzer should NOT be called - background loop only collects fingerprints + analyzer := &stubAnalyzer{ + response: `{"service_type":"nginx","service_name":"Nginx","service_version":"1.2","category":"web_server","cli_access":"docker exec {container} nginx -v","facts":[],"config_paths":[],"data_paths":[],"ports":[],"confidence":0.9,"reasoning":"image"}`, + } + service.SetAIAnalyzer(analyzer) + service.initialDelay = time.Millisecond + service.interval = time.Millisecond + service.cacheExpiry = time.Nanosecond + + done := make(chan struct{}) + ctx, cancel := context.WithCancel(context.Background()) + go func() { + service.discoveryLoop(ctx) + close(done) + }() + + time.Sleep(5 * time.Millisecond) + if stopWithCancel { + cancel() + } else { + close(service.stopCh) + } + + select { + case <-done: + case <-time.After(50 * time.Millisecond): + t.Fatalf("discoveryLoop did not stop") + } + + // Verify fingerprints were collected (background loop does NOT make AI calls) + // Key format is type:host:id + fp, err := store.GetFingerprint("docker:host1:web") + if err != nil { + t.Fatalf("GetFingerprint error: %v", err) + } + if fp == nil { + t.Fatalf("expected fingerprint to be collected") + 
} + + // Verify NO AI calls were made in background loop + analyzer.mu.Lock() + calls := analyzer.calls + analyzer.mu.Unlock() + if calls > 0 { + t.Fatalf("expected no AI calls in background loop (fingerprint-only), got %d", calls) + } + } + + runLoop(false) + runLoop(true) +} + +func TestService_DiscoverDockerContainersSkips(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + service := NewService(store, nil, nil, DefaultConfig()) + service.discoverDockerContainers(context.Background(), []DockerHost{{AgentID: "host1"}}) + + service.SetAIAnalyzer(&stubAnalyzer{ + response: `{"service_type":"nginx","service_name":"Nginx","service_version":"1.2","category":"web_server","cli_access":"docker exec {container} nginx -v","facts":[],"config_paths":[],"data_paths":[],"ports":[],"confidence":0.9,"reasoning":"image"}`, + }) + + id := MakeResourceID(ResourceTypeDocker, "host1", "web") + if err := store.Save(&ResourceDiscovery{ID: id, ResourceType: ResourceTypeDocker}); err != nil { + t.Fatalf("Save error: %v", err) + } + service.cacheExpiry = time.Hour + service.discoverDockerContainers(context.Background(), []DockerHost{ + {AgentID: "host1", Containers: []DockerContainer{{Name: "web", Image: "nginx:latest"}}}, + }) + + badAnalyzer := &stubAnalyzer{response: "not json"} + if got := service.analyzeDockerContainer(context.Background(), badAnalyzer, DockerContainer{Name: "bad", Image: "bad"}, DockerHost{AgentID: "host1"}); got != nil { + t.Fatalf("expected nil for bad analysis") + } + + canceled, cancel := context.WithCancel(context.Background()) + cancel() + analyzer := &stubAnalyzer{response: `{"service_type":"nginx","service_name":"Nginx","service_version":"1.2","category":"web_server","cli_access":"docker exec {container} nginx -v","facts":[],"config_paths":[],"data_paths":[],"ports":[],"confidence":0.9,"reasoning":"image"}`} + service.SetAIAnalyzer(analyzer) + 
service.discoverDockerContainers(canceled, []DockerHost{ + {AgentID: "host1", Containers: []DockerContainer{{Name: "web2", Image: "nginx:latest"}}}, + }) + analyzer.mu.Lock() + calls := analyzer.calls + analyzer.mu.Unlock() + if calls != 0 { + t.Fatalf("expected analyzer not called on canceled context") + } + + errAnalyzer := errorAnalyzer{} + if got := service.analyzeDockerContainer(context.Background(), errAnalyzer, DockerContainer{Name: "err", Image: "err"}, DockerHost{AgentID: "host1"}); got != nil { + t.Fatalf("expected nil when analyzer returns error") + } + + storePath := filepath.Join(t.TempDir(), "file") + if err := os.WriteFile(storePath, []byte("x"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + service.store.dataDir = storePath + service.discoverDockerContainers(context.Background(), []DockerHost{ + {AgentID: "host1", Containers: []DockerContainer{{Name: "web3", Image: "nginx:latest"}}}, + }) +} + +func TestService_CollectFingerprintsRecover(t *testing.T) { + service := NewService(nil, nil, panicStateProvider{}, DefaultConfig()) + service.collectFingerprints(context.Background()) +} + +func TestService_DiscoverResource_SaveError(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + badPath := filepath.Join(t.TempDir(), "file") + if err := os.WriteFile(badPath, []byte("x"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + store.dataDir = badPath + + service := NewService(store, nil, nil, DefaultConfig()) + service.SetAIAnalyzer(&stubAnalyzer{ + response: `{"service_type":"nginx","service_name":"Nginx","service_version":"1.2","category":"web_server","cli_access":"docker exec {container} nginx -v","facts":[],"config_paths":[],"data_paths":[],"ports":[],"confidence":0.9,"reasoning":"image"}`, + }) + + _, err = service.DiscoverResource(context.Background(), DiscoveryRequest{ + ResourceType: ResourceTypeDocker, + ResourceID: "web", + HostID: 
"host1", + Hostname: "host1", + Force: true, + }) + if err == nil || !strings.Contains(err.Error(), "failed to save discovery") { + t.Fatalf("expected save error, got %v", err) + } +} + +func TestService_DiscoverResource_ScanError(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + scanner := NewDeepScanner(nil) + service := NewService(store, scanner, nil, DefaultConfig()) + service.SetAIAnalyzer(&stubAnalyzer{ + response: `{"service_type":"nginx","service_name":"Nginx","service_version":"1.2","category":"web_server","cli_access":"docker exec {container} nginx -v","facts":[],"config_paths":[],"data_paths":[],"ports":[],"confidence":0.9,"reasoning":"image"}`, + }) + + _, err = service.DiscoverResource(context.Background(), DiscoveryRequest{ + ResourceType: ResourceTypeDocker, + ResourceID: "web", + HostID: "host1", + Hostname: "host1", + Force: true, + }) + if err != nil { + t.Fatalf("expected scan error to be tolerated, got %v", err) + } +} + +func TestService_DiscoveryLoop_ContextDoneAtStart(t *testing.T) { + service := NewService(nil, nil, nil, DefaultConfig()) + service.initialDelay = time.Hour + service.stopCh = make(chan struct{}) + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + service.discoveryLoop(ctx) +} + +func TestService_DiscoverResource_WithScanResult(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + exec := &stubExecutor{ + agents: []ConnectedAgent{{AgentID: "host1", Hostname: "host1"}}, + } + scanner := NewDeepScanner(exec) + scanner.maxParallel = 1 + + state := StateSnapshot{ + DockerHosts: []DockerHost{ + { + AgentID: "host1", + Hostname: "host1", + Containers: []DockerContainer{ + {Name: "web", Image: "nginx:latest", Status: "running"}, + }, + }, + }, + } + + service := NewService(store, scanner, stubStateProvider{state: state}, DefaultConfig()) + 
service.SetAIAnalyzer(&stubAnalyzer{ + response: `{"service_type":"nginx","service_name":"Nginx","service_version":"1.2","category":"web_server","cli_access":"docker exec {container} nginx -v","facts":[],"config_paths":[],"data_paths":[],"ports":[{"port":80,"protocol":"tcp","process":"nginx","address":"0.0.0.0"}],"confidence":0.9,"reasoning":"image"}`, + }) + + existing := &ResourceDiscovery{ + ID: MakeResourceID(ResourceTypeDocker, "host1", "web"), + ResourceType: ResourceTypeDocker, + ResourceID: "web", + HostID: "host1", + Hostname: "host1", + UserNotes: "keep", + UserSecrets: map[string]string{"token": "secret"}, + DiscoveredAt: time.Now().Add(-2 * time.Hour), + } + if err := store.Save(existing); err != nil { + t.Fatalf("Save error: %v", err) + } + + found, err := service.DiscoverResource(context.Background(), DiscoveryRequest{ + ResourceType: ResourceTypeDocker, + ResourceID: "web", + HostID: "host1", + Hostname: "host1", + Force: true, + }) + if err != nil { + t.Fatalf("DiscoverResource error: %v", err) + } + if found.UserNotes != "keep" || found.UserSecrets["token"] != "secret" { + t.Fatalf("expected user fields preserved: %#v", found) + } + if len(found.RawCommandOutput) == 0 { + t.Fatalf("expected raw command output") + } + if found.DiscoveredAt.After(existing.DiscoveredAt) { + t.Fatalf("expected older discovered_at preserved") + } +} + +func TestParseDockerMounts(t *testing.T) { + tests := []struct { + name string + input string + expected []DockerBindMount + }{ + { + name: "empty input", + input: "", + expected: nil, + }, + { + name: "no_docker_mounts marker", + input: "no_docker_mounts", + expected: nil, + }, + { + name: "only done marker", + input: "docker_mounts_done", + expected: nil, + }, + { + name: "single container with bind mount", + input: "CONTAINER:homepage\n/home/user/homepage/config|/app/config|bind\ndocker_mounts_done", + expected: []DockerBindMount{ + {ContainerName: "homepage", Source: "/home/user/homepage/config", Destination: 
"/app/config", Type: "bind"}, + }, + }, + { + name: "single container with volume", + input: "CONTAINER:nginx\nnginx_data|/usr/share/nginx/html|volume\ndocker_mounts_done", + expected: []DockerBindMount{ + {ContainerName: "nginx", Source: "nginx_data", Destination: "/usr/share/nginx/html", Type: "volume"}, + }, + }, + { + name: "multiple containers", + input: "CONTAINER:homepage\n/home/user/config|/app/config|bind\nCONTAINER:watchtower\n/var/run/docker.sock|/var/run/docker.sock|bind\ndocker_mounts_done", + expected: []DockerBindMount{ + {ContainerName: "homepage", Source: "/home/user/config", Destination: "/app/config", Type: "bind"}, + {ContainerName: "watchtower", Source: "/var/run/docker.sock", Destination: "/var/run/docker.sock", Type: "bind"}, + }, + }, + { + name: "container with multiple mounts", + input: "CONTAINER:jellyfin\n/media/movies|/movies|bind\n/media/tv|/tv|bind\n/config/jellyfin|/config|bind\ndocker_mounts_done", + expected: []DockerBindMount{ + {ContainerName: "jellyfin", Source: "/media/movies", Destination: "/movies", Type: "bind"}, + {ContainerName: "jellyfin", Source: "/media/tv", Destination: "/tv", Type: "bind"}, + {ContainerName: "jellyfin", Source: "/config/jellyfin", Destination: "/config", Type: "bind"}, + }, + }, + { + name: "container with no mounts", + input: "CONTAINER:alpine\ndocker_mounts_done", + expected: nil, + }, + { + name: "filters out tmpfs", + input: "CONTAINER:app\n/data|/data|bind\n||tmpfs\ndocker_mounts_done", + expected: []DockerBindMount{ + {ContainerName: "app", Source: "/data", Destination: "/data", Type: "bind"}, + }, + }, + { + name: "mount without type defaults to included", + input: "CONTAINER:app\n/config|/app/config\ndocker_mounts_done", + expected: []DockerBindMount{ + {ContainerName: "app", Source: "/config", Destination: "/app/config", Type: ""}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := parseDockerMounts(tt.input) + if len(result) != len(tt.expected) { + 
t.Fatalf("expected %d mounts, got %d: %#v", len(tt.expected), len(result), result) + } + for i := range tt.expected { + if result[i].ContainerName != tt.expected[i].ContainerName { + t.Errorf("mount %d: expected container %q, got %q", i, tt.expected[i].ContainerName, result[i].ContainerName) + } + if result[i].Source != tt.expected[i].Source { + t.Errorf("mount %d: expected source %q, got %q", i, tt.expected[i].Source, result[i].Source) + } + if result[i].Destination != tt.expected[i].Destination { + t.Errorf("mount %d: expected destination %q, got %q", i, tt.expected[i].Destination, result[i].Destination) + } + if result[i].Type != tt.expected[i].Type { + t.Errorf("mount %d: expected type %q, got %q", i, tt.expected[i].Type, result[i].Type) + } + } + }) + } +} diff --git a/internal/servicediscovery/store.go b/internal/servicediscovery/store.go new file mode 100644 index 000000000..26b43b519 --- /dev/null +++ b/internal/servicediscovery/store.go @@ -0,0 +1,651 @@ +package servicediscovery + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/rcourtman/pulse-go-rewrite/internal/crypto" + "github.com/rs/zerolog/log" +) + +// CryptoManager interface for encryption/decryption. +type CryptoManager interface { + Encrypt(plaintext []byte) ([]byte, error) + Decrypt(ciphertext []byte) ([]byte, error) +} + +// Store provides encrypted per-resource storage for discovery data. 
+type Store struct { + mu sync.RWMutex + dataDir string + crypto CryptoManager + cache map[string]*ResourceDiscovery // In-memory cache + cacheTime map[string]time.Time // Cache timestamps + cacheTTL time.Duration + + // Fingerprint storage (in-memory with file persistence) + fingerprintDir string + fingerprints map[string]*ContainerFingerprint // resourceID -> fingerprint + fingerprintMu sync.RWMutex + lastFingerprintScan time.Time +} + +// For testing - allows injecting a mock crypto manager +var newCryptoManagerAt = crypto.NewCryptoManagerAt + +// For testing - allows injecting a mock marshaler. +var marshalDiscovery = json.Marshal + +// NewStore creates a new discovery store with automatic encryption. +func NewStore(dataDir string) (*Store, error) { + discoveryDir := filepath.Join(dataDir, "discovery") + if err := os.MkdirAll(discoveryDir, 0700); err != nil { + return nil, fmt.Errorf("failed to create discovery directory: %w", err) + } + + // Create fingerprint subdirectory + fingerprintDir := filepath.Join(discoveryDir, "fingerprints") + if err := os.MkdirAll(fingerprintDir, 0700); err != nil { + return nil, fmt.Errorf("failed to create fingerprint directory: %w", err) + } + + // Initialize crypto manager for encryption (uses same key as other Pulse secrets) + cryptoMgr, err := newCryptoManagerAt(dataDir) + if err != nil { + log.Warn().Err(err).Msg("Failed to initialize crypto for discovery store, data will be unencrypted") + } + + store := &Store{ + dataDir: discoveryDir, + fingerprintDir: fingerprintDir, + crypto: cryptoMgr, + cache: make(map[string]*ResourceDiscovery), + cacheTime: make(map[string]time.Time), + cacheTTL: 5 * time.Minute, + fingerprints: make(map[string]*ContainerFingerprint), + } + + // Load existing fingerprints from disk + store.loadFingerprints() + + return store, nil +} + +// getFilePath returns the file path for a resource ID. 
+func (s *Store) getFilePath(id string) string { + // Sanitize ID for filename: replace : with _ + safeID := strings.ReplaceAll(id, ":", "_") + safeID = strings.ReplaceAll(safeID, "/", "_") + return filepath.Join(s.dataDir, safeID+".enc") +} + +// Save persists a discovery to encrypted storage. +func (s *Store) Save(d *ResourceDiscovery) error { + s.mu.Lock() + defer s.mu.Unlock() + + if d.ID == "" { + return fmt.Errorf("discovery ID is required") + } + + // Update timestamp + d.UpdatedAt = time.Now() + if d.DiscoveredAt.IsZero() { + d.DiscoveredAt = d.UpdatedAt + } + + data, err := marshalDiscovery(d) + if err != nil { + return fmt.Errorf("failed to marshal discovery: %w", err) + } + + // Encrypt if crypto is available + if s.crypto != nil { + encrypted, err := s.crypto.Encrypt(data) + if err != nil { + return fmt.Errorf("failed to encrypt discovery: %w", err) + } + data = encrypted + } + + // Write atomically using tmp file + rename + filePath := s.getFilePath(d.ID) + tmpPath := filePath + ".tmp" + + if err := os.WriteFile(tmpPath, data, 0600); err != nil { + return fmt.Errorf("failed to write discovery file: %w", err) + } + + if err := os.Rename(tmpPath, filePath); err != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("failed to finalize discovery file: %w", err) + } + + // Update cache + s.cache[d.ID] = d + s.cacheTime[d.ID] = time.Now() + + log.Debug().Str("id", d.ID).Str("service", d.ServiceType).Msg("Discovery saved") + return nil +} + +// Get retrieves a discovery from storage. 
+func (s *Store) Get(id string) (*ResourceDiscovery, error) { + s.mu.RLock() + // Check cache first + if cached, ok := s.cache[id]; ok { + if cacheTime, hasTime := s.cacheTime[id]; hasTime { + if time.Since(cacheTime) < s.cacheTTL { + s.mu.RUnlock() + return cached, nil + } + } + } + s.mu.RUnlock() + + s.mu.Lock() + defer s.mu.Unlock() + + filePath := s.getFilePath(id) + data, err := os.ReadFile(filePath) + if err != nil { + if os.IsNotExist(err) { + return nil, nil // Not found is not an error + } + return nil, fmt.Errorf("failed to read discovery file: %w", err) + } + + // Decrypt if crypto is available + if s.crypto != nil { + decrypted, err := s.crypto.Decrypt(data) + if err != nil { + return nil, fmt.Errorf("failed to decrypt discovery: %w", err) + } + data = decrypted + } + + var discovery ResourceDiscovery + if err := json.Unmarshal(data, &discovery); err != nil { + return nil, fmt.Errorf("failed to unmarshal discovery: %w", err) + } + + // Update cache + s.cache[id] = &discovery + s.cacheTime[id] = time.Now() + + return &discovery, nil +} + +// GetByResource retrieves a discovery by resource type and ID. +func (s *Store) GetByResource(resourceType ResourceType, hostID, resourceID string) (*ResourceDiscovery, error) { + id := MakeResourceID(resourceType, hostID, resourceID) + return s.Get(id) +} + +// Delete removes a discovery from storage. +func (s *Store) Delete(id string) error { + s.mu.Lock() + defer s.mu.Unlock() + + filePath := s.getFilePath(id) + if err := os.Remove(filePath); err != nil { + if os.IsNotExist(err) { + return nil // Already deleted + } + return fmt.Errorf("failed to delete discovery file: %w", err) + } + + // Remove from cache + delete(s.cache, id) + delete(s.cacheTime, id) + + log.Debug().Str("id", id).Msg("Discovery deleted") + return nil +} + +// List returns all discoveries. 
+func (s *Store) List() ([]*ResourceDiscovery, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + entries, err := os.ReadDir(s.dataDir) + if err != nil { + if os.IsNotExist(err) { + return []*ResourceDiscovery{}, nil + } + return nil, fmt.Errorf("failed to list discovery directory: %w", err) + } + + var discoveries []*ResourceDiscovery + for _, entry := range entries { + // Skip tmp files first to avoid reading partial writes. + if strings.HasSuffix(entry.Name(), ".tmp") { + continue + } + if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".enc") { + continue + } + + data, err := os.ReadFile(filepath.Join(s.dataDir, entry.Name())) + if err != nil { + log.Warn().Err(err).Str("file", entry.Name()).Msg("Failed to read discovery file") + continue + } + + // Decrypt if crypto is available + if s.crypto != nil { + decrypted, err := s.crypto.Decrypt(data) + if err != nil { + log.Warn().Err(err).Str("file", entry.Name()).Msg("Failed to decrypt discovery") + continue + } + data = decrypted + } + + var discovery ResourceDiscovery + if err := json.Unmarshal(data, &discovery); err != nil { + log.Warn().Err(err).Str("file", entry.Name()).Msg("Failed to unmarshal discovery") + continue + } + + discoveries = append(discoveries, &discovery) + } + + return discoveries, nil +} + +// ListByType returns discoveries for a specific resource type. +func (s *Store) ListByType(resourceType ResourceType) ([]*ResourceDiscovery, error) { + all, err := s.List() + if err != nil { + return nil, err + } + + var filtered []*ResourceDiscovery + for _, d := range all { + if d.ResourceType == resourceType { + filtered = append(filtered, d) + } + } + return filtered, nil +} + +// ListByHost returns discoveries for a specific host. 
+func (s *Store) ListByHost(hostID string) ([]*ResourceDiscovery, error) { + all, err := s.List() + if err != nil { + return nil, err + } + + var filtered []*ResourceDiscovery + for _, d := range all { + if d.HostID == hostID { + filtered = append(filtered, d) + } + } + return filtered, nil +} + +// UpdateNotes updates just the user notes and secrets for a discovery. +func (s *Store) UpdateNotes(id string, notes string, secrets map[string]string) error { + discovery, err := s.Get(id) + if err != nil { + return err + } + if discovery == nil { + return fmt.Errorf("discovery not found: %s", id) + } + + discovery.UserNotes = notes + if secrets != nil { + discovery.UserSecrets = secrets + } + + return s.Save(discovery) +} + +// GetMultiple retrieves multiple discoveries by ID. +func (s *Store) GetMultiple(ids []string) ([]*ResourceDiscovery, error) { + var discoveries []*ResourceDiscovery + for _, id := range ids { + d, err := s.Get(id) + if err != nil { + log.Warn().Err(err).Str("id", id).Msg("Failed to get discovery") + continue + } + if d != nil { + discoveries = append(discoveries, d) + } + } + return discoveries, nil +} + +// ClearCache clears the in-memory cache. +func (s *Store) ClearCache() { + s.mu.Lock() + defer s.mu.Unlock() + s.cache = make(map[string]*ResourceDiscovery) + s.cacheTime = make(map[string]time.Time) +} + +// Exists checks if a discovery exists for the given ID. +func (s *Store) Exists(id string) bool { + s.mu.RLock() + if _, ok := s.cache[id]; ok { + s.mu.RUnlock() + return true + } + s.mu.RUnlock() + + filePath := s.getFilePath(id) + _, err := os.Stat(filePath) + return err == nil +} + +// GetAge returns how old the discovery is, or -1 if not found. +func (s *Store) GetAge(id string) time.Duration { + d, err := s.Get(id) + if err != nil || d == nil { + return -1 + } + return time.Since(d.UpdatedAt) +} + +// NeedsRefresh checks if a discovery needs to be refreshed. 
+func (s *Store) NeedsRefresh(id string, maxAge time.Duration) bool { + age := s.GetAge(id) + if age < 0 { + return true // Not found, needs discovery + } + return age > maxAge +} + +// --- Fingerprint Storage Methods --- + +// getFingerprintFilePath returns the file path for a fingerprint. +func (s *Store) getFingerprintFilePath(resourceID string) string { + // Sanitize ID for filename + safeID := strings.ReplaceAll(resourceID, ":", "_") + safeID = strings.ReplaceAll(safeID, "/", "_") + return filepath.Join(s.fingerprintDir, safeID+".json") +} + +// loadFingerprints loads all fingerprints from disk into memory. +func (s *Store) loadFingerprints() { + s.fingerprintMu.Lock() + defer s.fingerprintMu.Unlock() + + entries, err := os.ReadDir(s.fingerprintDir) + if err != nil { + if !os.IsNotExist(err) { + log.Warn().Err(err).Msg("Failed to read fingerprint directory") + } + return + } + + for _, entry := range entries { + if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".json") { + continue + } + + data, err := os.ReadFile(filepath.Join(s.fingerprintDir, entry.Name())) + if err != nil { + log.Warn().Err(err).Str("file", entry.Name()).Msg("Failed to read fingerprint file") + continue + } + + var fp ContainerFingerprint + if err := json.Unmarshal(data, &fp); err != nil { + log.Warn().Err(err).Str("file", entry.Name()).Msg("Failed to unmarshal fingerprint") + continue + } + + s.fingerprints[fp.ResourceID] = &fp + } + + log.Debug().Int("count", len(s.fingerprints)).Msg("Loaded fingerprints from disk") +} + +// SaveFingerprint stores a container fingerprint. 
+func (s *Store) SaveFingerprint(fp *ContainerFingerprint) error { + if fp == nil || fp.ResourceID == "" { + return fmt.Errorf("fingerprint or resource ID is required") + } + + s.fingerprintMu.Lock() + defer s.fingerprintMu.Unlock() + + // Update in-memory cache + s.fingerprints[fp.ResourceID] = fp + + // Persist to disk + data, err := json.Marshal(fp) + if err != nil { + return fmt.Errorf("failed to marshal fingerprint: %w", err) + } + + filePath := s.getFingerprintFilePath(fp.ResourceID) + tmpPath := filePath + ".tmp" + + if err := os.WriteFile(tmpPath, data, 0600); err != nil { + return fmt.Errorf("failed to write fingerprint file: %w", err) + } + + if err := os.Rename(tmpPath, filePath); err != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("failed to finalize fingerprint file: %w", err) + } + + return nil +} + +// GetFingerprint retrieves the last known fingerprint for a resource. +func (s *Store) GetFingerprint(resourceID string) (*ContainerFingerprint, error) { + s.fingerprintMu.RLock() + defer s.fingerprintMu.RUnlock() + + fp, ok := s.fingerprints[resourceID] + if !ok { + return nil, nil // Not found is not an error + } + return fp, nil +} + +// GetAllFingerprints returns all stored fingerprints. +func (s *Store) GetAllFingerprints() map[string]*ContainerFingerprint { + s.fingerprintMu.RLock() + defer s.fingerprintMu.RUnlock() + + result := make(map[string]*ContainerFingerprint, len(s.fingerprints)) + for k, v := range s.fingerprints { + result[k] = v + } + return result +} + +// GetChangedResources returns resource IDs where the fingerprint changed since last discovery. +// It compares the stored fingerprint hash against the discovery's fingerprint field. 
func (s *Store) GetChangedResources() ([]string, error) {
	// Snapshot the fingerprint map under its own lock so we do not hold
	// fingerprintMu while s.Get takes the store lock below.
	s.fingerprintMu.RLock()
	fingerprints := make(map[string]*ContainerFingerprint, len(s.fingerprints))
	for k, v := range s.fingerprints {
		fingerprints[k] = v
	}
	s.fingerprintMu.RUnlock()

	var changed []string
	for resourceID, fp := range fingerprints {
		// Build the full discovery ID
		// NOTE(review): the ID is always built with ResourceTypeDocker —
		// presumably fingerprints are only kept for plain docker containers;
		// confirm docker_vm/docker_lxc resources never land here, as their
		// discoveries would never match this ID.
		discoveryID := MakeResourceID(ResourceTypeDocker, fp.HostID, resourceID)

		// Get the discovery
		discovery, err := s.Get(discoveryID)
		if err != nil {
			// Unreadable discoveries are silently skipped, not reported.
			continue
		}

		// If no discovery exists, it needs discovery
		if discovery == nil {
			changed = append(changed, discoveryID)
			continue
		}

		// If fingerprint hash differs from discovery's stored fingerprint, it changed
		if discovery.Fingerprint != fp.Hash {
			changed = append(changed, discoveryID)
		}
	}

	return changed, nil
}

// GetStaleResources returns resources not discovered in maxAge duration.
// Staleness is measured from DiscoveredAt (set once, on first save), not
// UpdatedAt — so metadata-only saves do not make a resource look fresh.
func (s *Store) GetStaleResources(maxAge time.Duration) ([]string, error) {
	discoveries, err := s.List()
	if err != nil {
		return nil, err
	}

	var stale []string
	now := time.Now()
	for _, d := range discoveries {
		if now.Sub(d.DiscoveredAt) > maxAge {
			stale = append(stale, d.ID)
		}
	}

	return stale, nil
}

// SetLastFingerprintScan updates the timestamp of the last fingerprint scan.
func (s *Store) SetLastFingerprintScan(t time.Time) {
	s.fingerprintMu.Lock()
	defer s.fingerprintMu.Unlock()
	s.lastFingerprintScan = t
}

// GetLastFingerprintScan returns the timestamp of the last fingerprint scan.
func (s *Store) GetLastFingerprintScan() time.Time {
	s.fingerprintMu.RLock()
	defer s.fingerprintMu.RUnlock()
	return s.lastFingerprintScan
}

// GetFingerprintCount returns the number of stored fingerprints.
+func (s *Store) GetFingerprintCount() int { + s.fingerprintMu.RLock() + defer s.fingerprintMu.RUnlock() + return len(s.fingerprints) +} + +// CleanupOrphanedFingerprints removes fingerprints for resources that no longer exist. +// Pass in a set of current resource IDs (e.g., "docker:host1:nginx", "lxc:node1:101"). +// Returns the number of fingerprints removed. +func (s *Store) CleanupOrphanedFingerprints(currentResourceIDs map[string]bool) int { + s.fingerprintMu.Lock() + defer s.fingerprintMu.Unlock() + + removed := 0 + for fpID := range s.fingerprints { + if !currentResourceIDs[fpID] { + // Remove from memory + delete(s.fingerprints, fpID) + + // Remove from disk + filePath := s.getFingerprintFilePath(fpID) + if err := os.Remove(filePath); err != nil && !os.IsNotExist(err) { + log.Warn().Err(err).Str("id", fpID).Msg("Failed to remove orphaned fingerprint file") + } else { + log.Debug().Str("id", fpID).Msg("Removed orphaned fingerprint") + } + removed++ + } + } + + return removed +} + +// CleanupOrphanedDiscoveries removes discoveries for resources that no longer exist. +// Pass in a set of current resource IDs. +// Returns the number of discoveries removed. 
+func (s *Store) CleanupOrphanedDiscoveries(currentResourceIDs map[string]bool) int { + // List all discovery files + entries, err := os.ReadDir(s.dataDir) + if err != nil { + log.Warn().Err(err).Msg("Failed to read discovery directory for cleanup") + return 0 + } + + removed := 0 + for _, entry := range entries { + if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".enc") { + continue + } + + // Convert filename back to resource ID + // Filename format: type_host_id.enc (underscores replace colons and slashes) + baseName := strings.TrimSuffix(entry.Name(), ".enc") + resourceID := filenameToResourceID(baseName) + + if !currentResourceIDs[resourceID] { + filePath := filepath.Join(s.dataDir, entry.Name()) + if err := os.Remove(filePath); err != nil { + log.Warn().Err(err).Str("file", entry.Name()).Msg("Failed to remove orphaned discovery file") + } else { + log.Debug().Str("id", resourceID).Msg("Removed orphaned discovery") + removed++ + } + } + } + + return removed +} + +// filenameToResourceID converts a discovery filename back to a resource ID. +// Reverses the transformation done in getFilePath. 
+func filenameToResourceID(filename string) string { + // The filename uses underscores for colons and slashes + // We need to be smart about this - the format is type_host_resourceid + // First underscore separates type, rest could have underscores in host/resource names + + parts := strings.SplitN(filename, "_", 3) + if len(parts) < 3 { + return filename // Can't parse, return as-is + } + + resourceType := parts[0] + host := parts[1] + resourceID := parts[2] + + // For k8s, the resource ID might have been namespace/name which became namespace_name + // We convert back: k8s:cluster:namespace/name + if resourceType == "k8s" && strings.Contains(resourceID, "_") { + // Could be namespace_name, convert back to namespace/name + resourceID = strings.Replace(resourceID, "_", "/", 1) + } + + return resourceType + ":" + host + ":" + resourceID +} + +// ListDiscoveryIDs returns all discovery IDs currently stored. +func (s *Store) ListDiscoveryIDs() []string { + entries, err := os.ReadDir(s.dataDir) + if err != nil { + return nil + } + + var ids []string + for _, entry := range entries { + if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".enc") { + continue + } + baseName := strings.TrimSuffix(entry.Name(), ".enc") + ids = append(ids, filenameToResourceID(baseName)) + } + return ids +} diff --git a/internal/servicediscovery/store_test.go b/internal/servicediscovery/store_test.go new file mode 100644 index 000000000..374797af4 --- /dev/null +++ b/internal/servicediscovery/store_test.go @@ -0,0 +1,469 @@ +package servicediscovery + +import ( + "encoding/json" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/rcourtman/pulse-go-rewrite/internal/crypto" +) + +type fakeCrypto struct{} + +func (fakeCrypto) Encrypt(plaintext []byte) ([]byte, error) { + out := make([]byte, len(plaintext)) + for i := range plaintext { + out[i] = plaintext[len(plaintext)-1-i] + } + return out, nil +} + +func (fakeCrypto) Decrypt(ciphertext []byte) ([]byte, error) { + 
return fakeCrypto{}.Encrypt(ciphertext) +} + +type errorCrypto struct{} + +func (errorCrypto) Encrypt(plaintext []byte) ([]byte, error) { + return nil, os.ErrInvalid +} + +func (errorCrypto) Decrypt(ciphertext []byte) ([]byte, error) { + return nil, os.ErrInvalid +} + +func TestStore_SaveGetListAndNotes(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + d1 := &ResourceDiscovery{ + ID: MakeResourceID(ResourceTypeDocker, "host1", "nginx"), + ResourceType: ResourceTypeDocker, + ResourceID: "nginx", + HostID: "host1", + ServiceName: "Nginx", + } + if err := store.Save(d1); err != nil { + t.Fatalf("Save error: %v", err) + } + + got, err := store.Get(d1.ID) + if err != nil { + t.Fatalf("Get error: %v", err) + } + if got == nil || got.ServiceName != "Nginx" { + t.Fatalf("unexpected discovery: %#v", got) + } + if !store.Exists(d1.ID) { + t.Fatalf("expected discovery to exist") + } + + if err := store.UpdateNotes(d1.ID, "notes", map[string]string{"token": "abc"}); err != nil { + t.Fatalf("UpdateNotes error: %v", err) + } + updated, err := store.Get(d1.ID) + if err != nil { + t.Fatalf("Get updated error: %v", err) + } + if updated.UserNotes != "notes" || updated.UserSecrets["token"] != "abc" { + t.Fatalf("notes not updated: %#v", updated) + } + + d2 := &ResourceDiscovery{ + ID: MakeResourceID(ResourceTypeVM, "node1", "101"), + ResourceType: ResourceTypeVM, + ResourceID: "101", + HostID: "node1", + ServiceName: "VM", + } + if err := store.Save(d2); err != nil { + t.Fatalf("Save d2 error: %v", err) + } + + list, err := store.List() + if err != nil { + t.Fatalf("List error: %v", err) + } + if len(list) != 2 { + t.Fatalf("expected 2 discoveries, got %d", len(list)) + } + + byType, err := store.ListByType(ResourceTypeVM) + if err != nil { + t.Fatalf("ListByType error: %v", err) + } + if len(byType) != 1 || byType[0].ID != d2.ID { + t.Fatalf("unexpected ListByType: %#v", byType) + } + + byHost, 
err := store.ListByHost("host1") + if err != nil { + t.Fatalf("ListByHost error: %v", err) + } + if len(byHost) != 1 || byHost[0].ID != d1.ID { + t.Fatalf("unexpected ListByHost: %#v", byHost) + } + + summary := updated.ToSummary() + if summary.ID != d1.ID || !summary.HasUserNotes { + t.Fatalf("unexpected summary: %#v", summary) + } + + if err := store.Delete(d1.ID); err != nil { + t.Fatalf("Delete error: %v", err) + } + if store.Exists(d1.ID) { + t.Fatalf("expected discovery to be deleted") + } +} + +func TestStore_CryptoRoundTripAndPaths(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = fakeCrypto{} + + id := "docker:host1:app/name" + d := &ResourceDiscovery{ + ID: id, + ResourceType: ResourceTypeDocker, + ResourceID: "app/name", + HostID: "host1", + ServiceName: "App", + } + if err := store.Save(d); err != nil { + t.Fatalf("Save error: %v", err) + } + + path := store.getFilePath(id) + base := filepath.Base(path) + if strings.Contains(base, ":") || strings.Contains(base, "/") { + t.Fatalf("expected sanitized base filename, got %s", base) + } + + loaded, err := store.Get(id) + if err != nil { + t.Fatalf("Get error: %v", err) + } + if loaded == nil || loaded.ServiceName != "App" { + t.Fatalf("unexpected discovery: %#v", loaded) + } + + store.ClearCache() + if _, err := store.Get(id); err != nil { + t.Fatalf("Get with decrypt error: %v", err) + } + list, err := store.List() + if err != nil || len(list) != 1 { + t.Fatalf("List with decrypt error: %v len=%d", err, len(list)) + } +} + +func TestStore_NeedsRefreshAndGetMultiple(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + if !store.NeedsRefresh("missing", time.Minute) { + t.Fatalf("expected missing discovery to need refresh") + } + + d := &ResourceDiscovery{ + ID: MakeResourceID(ResourceTypeHost, "host1", "host1"), + ResourceType: ResourceTypeHost, 
+ ResourceID: "host1", + HostID: "host1", + ServiceName: "Host", + } + if err := store.Save(d); err != nil { + t.Fatalf("Save error: %v", err) + } + + path := store.getFilePath(d.ID) + data, err := os.ReadFile(path) + if err != nil { + t.Fatalf("ReadFile error: %v", err) + } + var saved ResourceDiscovery + if err := json.Unmarshal(data, &saved); err != nil { + t.Fatalf("Unmarshal error: %v", err) + } + saved.UpdatedAt = time.Now().Add(-2 * time.Hour) + data, err = json.Marshal(&saved) + if err != nil { + t.Fatalf("Marshal error: %v", err) + } + if err := os.WriteFile(path, data, 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + store.ClearCache() + if !store.NeedsRefresh(d.ID, time.Minute) { + t.Fatalf("expected old discovery to need refresh") + } + + ids := []string{d.ID, "missing"} + multi, err := store.GetMultiple(ids) + if err != nil { + t.Fatalf("GetMultiple error: %v", err) + } + if len(multi) != 1 || multi[0].ID != d.ID { + t.Fatalf("unexpected GetMultiple: %#v", multi) + } +} + +func TestStore_ErrorsAndListSkips(t *testing.T) { + dir := t.TempDir() + store, err := NewStore(dir) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + if err := store.Save(&ResourceDiscovery{}); err == nil { + t.Fatalf("expected error for empty ID") + } + + store.crypto = errorCrypto{} + if err := store.Save(&ResourceDiscovery{ID: "bad"}); err == nil { + t.Fatalf("expected encrypt error") + } + + store.crypto = nil + if _, err := store.Get("missing"); err != nil { + t.Fatalf("unexpected missing error: %v", err) + } + + d := &ResourceDiscovery{ + ID: MakeResourceID(ResourceTypeDocker, "host1", "web"), + ResourceType: ResourceTypeDocker, + ResourceID: "web", + HostID: "host1", + ServiceName: "Web", + UserSecrets: map[string]string{"token": "abc"}, + } + if err := store.Save(d); err != nil { + t.Fatalf("Save error: %v", err) + } + + // Corrupt file to force unmarshal error during List. 
+ badPath := filepath.Join(store.dataDir, "bad.enc") + if err := os.WriteFile(badPath, []byte("{bad"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + if err := os.WriteFile(filepath.Join(store.dataDir, "note.txt"), []byte("skip"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + if err := os.WriteFile(filepath.Join(store.dataDir, "skip.enc.tmp"), []byte("skip"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + if err := os.MkdirAll(filepath.Join(store.dataDir, "dir"), 0700); err != nil { + t.Fatalf("MkdirAll error: %v", err) + } + unreadable := filepath.Join(store.dataDir, "unreadable.enc") + if err := os.WriteFile(unreadable, []byte("nope"), 0000); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + list, err := store.List() + if err != nil { + t.Fatalf("List error: %v", err) + } + if len(list) != 1 { + t.Fatalf("expected 1 discovery, got %d", len(list)) + } + + store.crypto = errorCrypto{} + list, err = store.List() + if err != nil { + t.Fatalf("List with crypto error: %v", err) + } + if len(list) != 0 { + t.Fatalf("expected crypto errors to skip entries") + } + + store.crypto = errorCrypto{} + store.ClearCache() + if _, err := store.Get(d.ID); err == nil { + t.Fatalf("expected decrypt error") + } + + store.crypto = nil + if got, err := store.GetByResource(ResourceTypeDocker, "host1", "web"); err != nil || got == nil { + t.Fatalf("GetByResource error: %v", err) + } + + if err := store.UpdateNotes(d.ID, "notes-only", nil); err != nil { + t.Fatalf("UpdateNotes error: %v", err) + } + updated, err := store.Get(d.ID) + if err != nil || updated.UserSecrets == nil { + t.Fatalf("expected secrets to be preserved: %#v err=%v", updated, err) + } + + store.crypto = errorCrypto{} + store.ClearCache() + if err := store.UpdateNotes(d.ID, "notes", nil); err == nil { + t.Fatalf("expected update notes error with crypto failure") + } + if got, err := store.GetMultiple([]string{d.ID}); err != nil || len(got) != 0 { + 
t.Fatalf("expected GetMultiple to skip errors") + } + + if err := store.UpdateNotes("missing", "notes", nil); err == nil { + t.Fatalf("expected error for missing discovery") + } + + if err := store.Delete("missing"); err != nil { + t.Fatalf("unexpected delete error: %v", err) + } +} + +func TestStore_NewStoreError(t *testing.T) { + dir := t.TempDir() + file := filepath.Join(dir, "file") + if err := os.WriteFile(file, []byte("x"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + if _, err := NewStore(file); err == nil { + t.Fatalf("expected error for file data dir") + } +} + +func TestStore_NewStoreCryptoFailure(t *testing.T) { + orig := newCryptoManagerAt + newCryptoManagerAt = func(dataDir string) (*crypto.CryptoManager, error) { + manager, err := crypto.NewCryptoManagerAt(dataDir) + if err != nil { + return nil, err + } + return manager, os.ErrInvalid + } + t.Cleanup(func() { + newCryptoManagerAt = orig + }) + + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + if store.crypto == nil { + t.Fatalf("expected crypto manager despite init warning") + } +} + +func TestStore_SaveMarshalError(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + orig := marshalDiscovery + marshalDiscovery = func(any) ([]byte, error) { + return nil, os.ErrInvalid + } + t.Cleanup(func() { + marshalDiscovery = orig + }) + + if err := store.Save(&ResourceDiscovery{ID: "marshal"}); err == nil { + t.Fatalf("expected marshal error") + } +} + +func TestStore_SaveAndGetErrors(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + id := MakeResourceID(ResourceTypeDocker, "host1", "web") + filePath := store.getFilePath(id) + if err := os.MkdirAll(filePath, 0700); err != nil { + t.Fatalf("MkdirAll error: %v", err) + } + if err := store.Save(&ResourceDiscovery{ID: id}); 
err == nil { + t.Fatalf("expected rename error") + } + + tmpFile := filepath.Join(t.TempDir(), "file") + if err := os.WriteFile(tmpFile, []byte("x"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + store.dataDir = tmpFile + if err := store.Save(&ResourceDiscovery{ID: "bad"}); err == nil { + t.Fatalf("expected write error") + } + + store.dataDir = t.TempDir() + store.crypto = nil + badPath := store.getFilePath("bad") + if err := os.WriteFile(badPath, []byte("{bad"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + if _, err := store.Get("bad"); err == nil { + t.Fatalf("expected unmarshal error") + } +} + +func TestStore_ListErrors(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + store.dataDir = filepath.Join(t.TempDir(), "missing") + list, err := store.List() + if err != nil || len(list) != 0 { + t.Fatalf("expected empty list for missing dir") + } + + file := filepath.Join(t.TempDir(), "file") + if err := os.WriteFile(file, []byte("x"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + store.dataDir = file + if _, err := store.List(); err == nil { + t.Fatalf("expected list error for file path") + } + if _, err := store.ListByType(ResourceTypeDocker); err == nil { + t.Fatalf("expected list by type error") + } + if _, err := store.ListByHost("host1"); err == nil { + t.Fatalf("expected list by host error") + } +} + +func TestStore_DeleteError(t *testing.T) { + store, err := NewStore(t.TempDir()) + if err != nil { + t.Fatalf("NewStore error: %v", err) + } + store.crypto = nil + + id := MakeResourceID(ResourceTypeDocker, "host1", "dir") + filePath := store.getFilePath(id) + if err := os.MkdirAll(filePath, 0700); err != nil { + t.Fatalf("MkdirAll error: %v", err) + } + nested := filepath.Join(filePath, "nested") + if err := os.WriteFile(nested, []byte("x"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + if err := 
store.Delete(id); err == nil { + t.Fatalf("expected delete error for non-empty dir") + } +} diff --git a/internal/servicediscovery/tools_adapter.go b/internal/servicediscovery/tools_adapter.go new file mode 100644 index 000000000..143063a28 --- /dev/null +++ b/internal/servicediscovery/tools_adapter.go @@ -0,0 +1,226 @@ +package servicediscovery + +import ( + "context" + + "github.com/rcourtman/pulse-go-rewrite/internal/ai/tools" +) + +// ToolsAdapter wraps Service to implement tools.DiscoverySource +type ToolsAdapter struct { + service *Service +} + +// NewToolsAdapter creates a new adapter for the discovery service +func NewToolsAdapter(service *Service) *ToolsAdapter { + if service == nil { + return nil + } + return &ToolsAdapter{service: service} +} + +// GetDiscovery implements tools.DiscoverySource +func (a *ToolsAdapter) GetDiscovery(id string) (tools.DiscoverySourceData, error) { + discovery, err := a.service.GetDiscovery(id) + if err != nil { + return tools.DiscoverySourceData{}, err + } + if discovery == nil { + return tools.DiscoverySourceData{}, nil + } + return a.convertToSourceData(discovery), nil +} + +// GetDiscoveryByResource implements tools.DiscoverySource +func (a *ToolsAdapter) GetDiscoveryByResource(resourceType, hostID, resourceID string) (tools.DiscoverySourceData, error) { + discovery, err := a.service.GetDiscoveryByResource(ResourceType(resourceType), hostID, resourceID) + if err != nil { + return tools.DiscoverySourceData{}, err + } + if discovery == nil { + return tools.DiscoverySourceData{}, nil + } + return a.convertToSourceData(discovery), nil +} + +// ListDiscoveries implements tools.DiscoverySource +func (a *ToolsAdapter) ListDiscoveries() ([]tools.DiscoverySourceData, error) { + discoveries, err := a.service.ListDiscoveries() + if err != nil { + return nil, err + } + return a.convertList(discoveries), nil +} + +// ListDiscoveriesByType implements tools.DiscoverySource +func (a *ToolsAdapter) ListDiscoveriesByType(resourceType 
string) ([]tools.DiscoverySourceData, error) { + discoveries, err := a.service.ListDiscoveriesByType(ResourceType(resourceType)) + if err != nil { + return nil, err + } + return a.convertList(discoveries), nil +} + +// ListDiscoveriesByHost implements tools.DiscoverySource +func (a *ToolsAdapter) ListDiscoveriesByHost(hostID string) ([]tools.DiscoverySourceData, error) { + discoveries, err := a.service.ListDiscoveriesByHost(hostID) + if err != nil { + return nil, err + } + return a.convertList(discoveries), nil +} + +// FormatForAIContext implements tools.DiscoverySource +func (a *ToolsAdapter) FormatForAIContext(sourceData []tools.DiscoverySourceData) string { + // Convert back to ResourceDiscovery for formatting + discoveries := make([]*ResourceDiscovery, 0, len(sourceData)) + for _, sd := range sourceData { + discoveries = append(discoveries, a.convertFromSourceData(sd)) + } + return FormatForAIContext(discoveries) +} + +// TriggerDiscovery implements tools.DiscoverySource - initiates discovery for a resource +func (a *ToolsAdapter) TriggerDiscovery(ctx context.Context, resourceType, hostID, resourceID string) (tools.DiscoverySourceData, error) { + req := DiscoveryRequest{ + ResourceType: ResourceType(resourceType), + HostID: hostID, + ResourceID: resourceID, + Force: false, // Don't force if recently discovered + } + + discovery, err := a.service.DiscoverResource(ctx, req) + if err != nil { + return tools.DiscoverySourceData{}, err + } + if discovery == nil { + return tools.DiscoverySourceData{}, nil + } + return a.convertToSourceData(discovery), nil +} + +func (a *ToolsAdapter) convertToSourceData(d *ResourceDiscovery) tools.DiscoverySourceData { + facts := make([]tools.DiscoverySourceFact, 0, len(d.Facts)) + for _, f := range d.Facts { + facts = append(facts, tools.DiscoverySourceFact{ + Category: string(f.Category), + Key: f.Key, + Value: f.Value, + Source: f.Source, + Confidence: f.Confidence, + }) + } + + ports := make([]tools.DiscoverySourcePort, 0, 
len(d.Ports)) + for _, p := range d.Ports { + ports = append(ports, tools.DiscoverySourcePort{ + Port: p.Port, + Protocol: p.Protocol, + Process: p.Process, + Address: p.Address, + }) + } + + dockerMounts := make([]tools.DiscoverySourceDockerMount, 0, len(d.DockerMounts)) + for _, m := range d.DockerMounts { + dockerMounts = append(dockerMounts, tools.DiscoverySourceDockerMount{ + ContainerName: m.ContainerName, + Source: m.Source, + Destination: m.Destination, + Type: m.Type, + ReadOnly: m.ReadOnly, + }) + } + + return tools.DiscoverySourceData{ + ID: d.ID, + ResourceType: string(d.ResourceType), + ResourceID: d.ResourceID, + HostID: d.HostID, + Hostname: d.Hostname, + ServiceType: d.ServiceType, + ServiceName: d.ServiceName, + ServiceVersion: d.ServiceVersion, + Category: string(d.Category), + CLIAccess: d.CLIAccess, + Facts: facts, + ConfigPaths: d.ConfigPaths, + DataPaths: d.DataPaths, + LogPaths: d.LogPaths, + Ports: ports, + DockerMounts: dockerMounts, + UserNotes: d.UserNotes, + Confidence: d.Confidence, + AIReasoning: d.AIReasoning, + DiscoveredAt: d.DiscoveredAt, + UpdatedAt: d.UpdatedAt, + } +} + +func (a *ToolsAdapter) convertFromSourceData(sd tools.DiscoverySourceData) *ResourceDiscovery { + facts := make([]DiscoveryFact, 0, len(sd.Facts)) + for _, f := range sd.Facts { + facts = append(facts, DiscoveryFact{ + Category: FactCategory(f.Category), + Key: f.Key, + Value: f.Value, + Source: f.Source, + Confidence: f.Confidence, + }) + } + + ports := make([]PortInfo, 0, len(sd.Ports)) + for _, p := range sd.Ports { + ports = append(ports, PortInfo{ + Port: p.Port, + Protocol: p.Protocol, + Process: p.Process, + Address: p.Address, + }) + } + + dockerMounts := make([]DockerBindMount, 0, len(sd.DockerMounts)) + for _, m := range sd.DockerMounts { + dockerMounts = append(dockerMounts, DockerBindMount{ + ContainerName: m.ContainerName, + Source: m.Source, + Destination: m.Destination, + Type: m.Type, + ReadOnly: m.ReadOnly, + }) + } + + return 
&ResourceDiscovery{ + ID: sd.ID, + ResourceType: ResourceType(sd.ResourceType), + ResourceID: sd.ResourceID, + HostID: sd.HostID, + Hostname: sd.Hostname, + ServiceType: sd.ServiceType, + ServiceName: sd.ServiceName, + ServiceVersion: sd.ServiceVersion, + Category: ServiceCategory(sd.Category), + CLIAccess: sd.CLIAccess, + Facts: facts, + ConfigPaths: sd.ConfigPaths, + DataPaths: sd.DataPaths, + LogPaths: sd.LogPaths, + Ports: ports, + DockerMounts: dockerMounts, + UserNotes: sd.UserNotes, + Confidence: sd.Confidence, + AIReasoning: sd.AIReasoning, + DiscoveredAt: sd.DiscoveredAt, + UpdatedAt: sd.UpdatedAt, + } +} + +func (a *ToolsAdapter) convertList(discoveries []*ResourceDiscovery) []tools.DiscoverySourceData { + result := make([]tools.DiscoverySourceData, 0, len(discoveries)) + for _, d := range discoveries { + if d != nil { + result = append(result, a.convertToSourceData(d)) + } + } + return result +} diff --git a/internal/servicediscovery/types.go b/internal/servicediscovery/types.go new file mode 100644 index 000000000..7bc8b8362 --- /dev/null +++ b/internal/servicediscovery/types.go @@ -0,0 +1,298 @@ +// Package servicediscovery provides AI-powered infrastructure discovery capabilities. +// It discovers services, versions, configurations, and CLI access methods +// for VMs, LXCs, Docker containers, Kubernetes pods, and hosts. +package servicediscovery + +import ( + "fmt" + "time" +) + +// ResourceType identifies the type of infrastructure resource. +type ResourceType string + +const ( + ResourceTypeVM ResourceType = "vm" + ResourceTypeLXC ResourceType = "lxc" + ResourceTypeDocker ResourceType = "docker" + ResourceTypeK8s ResourceType = "k8s" + ResourceTypeHost ResourceType = "host" + ResourceTypeDockerVM ResourceType = "docker_vm" // Docker on a VM + ResourceTypeDockerLXC ResourceType = "docker_lxc" // Docker in an LXC +) + +// FactCategory categorizes discovery facts.
+type FactCategory string + +const ( + FactCategoryVersion FactCategory = "version" + FactCategoryConfig FactCategory = "config" + FactCategoryService FactCategory = "service" + FactCategoryPort FactCategory = "port" + FactCategoryHardware FactCategory = "hardware" + FactCategoryNetwork FactCategory = "network" + FactCategoryStorage FactCategory = "storage" + FactCategoryDependency FactCategory = "dependency" + FactCategorySecurity FactCategory = "security" +) + +// ServiceCategory categorizes the type of service discovered. +type ServiceCategory string + +const ( + CategoryDatabase ServiceCategory = "database" + CategoryWebServer ServiceCategory = "web_server" + CategoryCache ServiceCategory = "cache" + CategoryMessageQueue ServiceCategory = "message_queue" + CategoryMonitoring ServiceCategory = "monitoring" + CategoryBackup ServiceCategory = "backup" + CategoryNVR ServiceCategory = "nvr" + CategoryStorage ServiceCategory = "storage" + CategoryContainer ServiceCategory = "container" + CategoryVirtualizer ServiceCategory = "virtualizer" + CategoryNetwork ServiceCategory = "network" + CategorySecurity ServiceCategory = "security" + CategoryMedia ServiceCategory = "media" + CategoryHomeAuto ServiceCategory = "home_automation" + CategoryUnknown ServiceCategory = "unknown" +) + +// ResourceDiscovery is the main data model for discovered resource information. +type ResourceDiscovery struct { + // Identity + ID string `json:"id"` // Unique ID: "lxc:minipc:101" + ResourceType ResourceType `json:"resource_type"` // vm, lxc, docker, k8s, host + ResourceID string `json:"resource_id"` // 101, container-name, etc. 
+ HostID string `json:"host_id"` // Proxmox node name or host agent ID + Hostname string `json:"hostname"` // Human-readable host name + + // AI-discovered info + ServiceType string `json:"service_type"` // frigate, postgres, pbs + ServiceName string `json:"service_name"` // Human-readable name + ServiceVersion string `json:"service_version"` // v0.13.2 + Category ServiceCategory `json:"category"` // nvr, database, backup + CLIAccess string `json:"cli_access"` // pct exec 101 -- ... + + // Deep discovery facts + Facts []DiscoveryFact `json:"facts"` + ConfigPaths []string `json:"config_paths"` + DataPaths []string `json:"data_paths"` + LogPaths []string `json:"log_paths"` + Ports []PortInfo `json:"ports"` + DockerMounts []DockerBindMount `json:"docker_mounts,omitempty"` // Docker container bind mounts (source->dest) + + // User-added (also encrypted) + UserNotes string `json:"user_notes"` + UserSecrets map[string]string `json:"user_secrets"` // tokens, creds + + // Metadata + Confidence float64 `json:"confidence"` // 0-1 confidence score + AIReasoning string `json:"ai_reasoning"` // AI explanation + DiscoveredAt time.Time `json:"discovered_at"` // First discovery + UpdatedAt time.Time `json:"updated_at"` // Last update + ScanDuration int64 `json:"scan_duration"` // Scan duration in ms + + // Fingerprint tracking for just-in-time discovery + Fingerprint string `json:"fingerprint,omitempty"` // Hash when discovery was done + FingerprintedAt time.Time `json:"fingerprinted_at,omitempty"` // When fingerprint was captured + FingerprintSchemaVersion int `json:"fingerprint_schema_version,omitempty"` // Schema version when fingerprint was captured + CLIAccessVersion int `json:"cli_access_version,omitempty"` // Version of CLI access pattern format + + // Raw data for debugging/re-analysis + RawCommandOutput map[string]string `json:"raw_command_output,omitempty"` +} + +// DiscoveryFact represents a single discovered fact about a resource. 
+type DiscoveryFact struct { + Category FactCategory `json:"category"` // version, config, service, port + Key string `json:"key"` // e.g., "coral_tpu", "mqtt_broker" + Value string `json:"value"` // e.g., "/dev/apex_0", "mosquitto:1883" + Source string `json:"source"` // command that found this + Confidence float64 `json:"confidence"` // 0-1 confidence for this fact + DiscoveredAt time.Time `json:"discovered_at"` +} + +// PortInfo represents information about a listening port. +type PortInfo struct { + Port int `json:"port"` + Protocol string `json:"protocol"` // tcp, udp + Process string `json:"process"` // process name + Address string `json:"address"` // bind address +} + +// DockerBindMount represents a Docker bind mount with source and destination paths. +// This is critical for knowing where to actually edit files - the source path on the +// host filesystem, not the destination path inside the container. +type DockerBindMount struct { + ContainerName string `json:"container_name"` // Docker container name + Source string `json:"source"` // Host path (where to actually write files) + Destination string `json:"destination"` // Container path (what the service sees) + Type string `json:"type,omitempty"` // Mount type: bind, volume, tmpfs + ReadOnly bool `json:"read_only,omitempty"` // Whether mount is read-only +} + +// MakeResourceID creates a standardized resource ID. +func MakeResourceID(resourceType ResourceType, hostID, resourceID string) string { + return fmt.Sprintf("%s:%s:%s", resourceType, hostID, resourceID) +} + +// ParseResourceID parses a resource ID into its components. 
+func ParseResourceID(id string) (resourceType ResourceType, hostID, resourceID string, err error) { + var parts [3]string + count := 0 + start := 0 + for i, c := range id { + if c == ':' { + if count < 2 { + parts[count] = id[start:i] + count++ + start = i + 1 + } + } + } + if count == 2 { + parts[2] = id[start:] + return ResourceType(parts[0]), parts[1], parts[2], nil + } + return "", "", "", fmt.Errorf("invalid resource ID format: %s", id) +} + +// DiscoveryRequest represents a request to discover a resource. +type DiscoveryRequest struct { + ResourceType ResourceType `json:"resource_type"` + ResourceID string `json:"resource_id"` + HostID string `json:"host_id"` + Hostname string `json:"hostname"` + Force bool `json:"force"` // Force re-scan even if recent +} + +// DiscoveryStatus represents the status of a discovery scan. +type DiscoveryStatus string + +const ( + DiscoveryStatusPending DiscoveryStatus = "pending" + DiscoveryStatusRunning DiscoveryStatus = "running" + DiscoveryStatusCompleted DiscoveryStatus = "completed" + DiscoveryStatusFailed DiscoveryStatus = "failed" + DiscoveryStatusNotStarted DiscoveryStatus = "not_started" +) + +// DiscoveryProgress represents the progress of an ongoing discovery. +type DiscoveryProgress struct { + ResourceID string `json:"resource_id"` + Status DiscoveryStatus `json:"status"` + CurrentStep string `json:"current_step"` + CurrentCommand string `json:"current_command,omitempty"` + TotalSteps int `json:"total_steps"` + CompletedSteps int `json:"completed_steps"` + ElapsedMs int64 `json:"elapsed_ms,omitempty"` + PercentComplete float64 `json:"percent_complete,omitempty"` + StartedAt time.Time `json:"started_at"` + Error string `json:"error,omitempty"` +} + +// UpdateNotesRequest represents a request to update user notes. 
+type UpdateNotesRequest struct { + UserNotes string `json:"user_notes"` + UserSecrets map[string]string `json:"user_secrets,omitempty"` +} + +// DiscoverySummary provides a summary of discoveries for listing. +type DiscoverySummary struct { + ID string `json:"id"` + ResourceType ResourceType `json:"resource_type"` + ResourceID string `json:"resource_id"` + HostID string `json:"host_id"` + Hostname string `json:"hostname"` + ServiceType string `json:"service_type"` + ServiceName string `json:"service_name"` + ServiceVersion string `json:"service_version"` + Category ServiceCategory `json:"category"` + Confidence float64 `json:"confidence"` + HasUserNotes bool `json:"has_user_notes"` + UpdatedAt time.Time `json:"updated_at"` + Fingerprint string `json:"fingerprint,omitempty"` // Current fingerprint + NeedsDiscovery bool `json:"needs_discovery"` // True if fingerprint changed +} + +// ToSummary converts a full discovery to a summary. +func (d *ResourceDiscovery) ToSummary() DiscoverySummary { + return DiscoverySummary{ + ID: d.ID, + ResourceType: d.ResourceType, + ResourceID: d.ResourceID, + HostID: d.HostID, + Hostname: d.Hostname, + ServiceType: d.ServiceType, + ServiceName: d.ServiceName, + ServiceVersion: d.ServiceVersion, + Category: d.Category, + Confidence: d.Confidence, + HasUserNotes: d.UserNotes != "", + UpdatedAt: d.UpdatedAt, + Fingerprint: d.Fingerprint, + NeedsDiscovery: false, // Will be set by caller if fingerprint changed + } +} + +// AIAnalysisRequest is sent to the AI for analysis. +type AIAnalysisRequest struct { + ResourceType ResourceType `json:"resource_type"` + ResourceID string `json:"resource_id"` + HostID string `json:"host_id"` + Hostname string `json:"hostname"` + CommandOutputs map[string]string `json:"command_outputs"` + ExistingFacts []DiscoveryFact `json:"existing_facts,omitempty"` + Metadata map[string]any `json:"metadata,omitempty"` // Image, labels, etc. +} + +// AIAnalysisResponse is returned by the AI. 
+type AIAnalysisResponse struct { + ServiceType string `json:"service_type"` + ServiceName string `json:"service_name"` + ServiceVersion string `json:"service_version"` + Category ServiceCategory `json:"category"` + CLIAccess string `json:"cli_access"` + Facts []DiscoveryFact `json:"facts"` + ConfigPaths []string `json:"config_paths"` + DataPaths []string `json:"data_paths"` + LogPaths []string `json:"log_paths"` + Ports []PortInfo `json:"ports"` + Confidence float64 `json:"confidence"` + Reasoning string `json:"reasoning"` +} + +// FingerprintSchemaVersion is incremented when the fingerprint algorithm changes. +// This prevents mass rediscovery when we add new fields to the fingerprint hash. +// Old fingerprints with different schema versions are treated as "schema changed" +// rather than "container changed", allowing for more controlled migration. +// (The fingerprint itself is a ContainerFingerprint, defined below: key container +// metadata used for just-in-time discovery, so a scan runs only when something +// actually changed rather than on a fixed timer.) +const FingerprintSchemaVersion = 3 // v3: Removed IP addresses (DHCP churn caused false positives) + +// CLIAccessVersion is incremented when the CLI access pattern format changes. +// When a discovery has an older version, its CLIAccess field is regenerated +// to use the new instructional format.
+const CLIAccessVersion = 2 // v2: Changed from shell commands to pulse_control instructions + +type ContainerFingerprint struct { + ResourceID string `json:"resource_id"` + HostID string `json:"host_id"` + Hash string `json:"hash"` // SHA256 of metadata (truncated to 16 chars) + SchemaVersion int `json:"schema_version"` // Version of fingerprint algorithm + GeneratedAt time.Time `json:"generated_at"` + + // Components that went into the hash (for debugging) + ImageID string `json:"image_id,omitempty"` + ImageName string `json:"image_name,omitempty"` + Ports []string `json:"ports,omitempty"` + MountPaths []string `json:"mount_paths,omitempty"` + EnvKeys []string `json:"env_keys,omitempty"` // Keys only, not values (security) + CreatedAt string `json:"created_at,omitempty"` // Container creation time +} + +// IsSchemaOutdated returns true if this fingerprint was created with an older schema. +func (fp *ContainerFingerprint) IsSchemaOutdated() bool { + return fp.SchemaVersion < FingerprintSchemaVersion +} diff --git a/internal/servicediscovery/types_test.go b/internal/servicediscovery/types_test.go new file mode 100644 index 000000000..34646a9ac --- /dev/null +++ b/internal/servicediscovery/types_test.go @@ -0,0 +1,22 @@ +package servicediscovery + +import "testing" + +func TestResourceIDHelpers(t *testing.T) { + id := MakeResourceID(ResourceTypeDocker, "host1", "app") + if id != "docker:host1:app" { + t.Fatalf("unexpected id: %s", id) + } + + rt, host, res, err := ParseResourceID(id) + if err != nil { + t.Fatalf("ParseResourceID error: %v", err) + } + if rt != ResourceTypeDocker || host != "host1" || res != "app" { + t.Fatalf("unexpected parse result: %s %s %s", rt, host, res) + } + + if _, _, _, err := ParseResourceID("invalid"); err == nil { + t.Fatalf("expected parse error for invalid id") + } +} diff --git a/internal/websocket/hub_tenant_test.go b/internal/websocket/hub_tenant_test.go index 085e29a17..cd3e0ed21 100644 --- a/internal/websocket/hub_tenant_test.go 
+++ b/internal/websocket/hub_tenant_test.go @@ -83,6 +83,8 @@ func TestHub_Setters_Coverage(t *testing.T) { func TestHub_DispatchToTenantClients(t *testing.T) { // This tests the internal logic of iterating clients hub := NewHub(nil) + go hub.Run() + defer hub.Stop() // Create a mock client client := &Client{ @@ -93,11 +95,9 @@ func TestHub_DispatchToTenantClients(t *testing.T) { } // Manually register (simulating register channel) - hub.clients[client] = true hub.register <- client // Allow registration to process - go hub.Run() time.Sleep(50 * time.Millisecond) // Now broadcast to org1 (internal method)