@@ -4862,9 +4853,8 @@ function HistoryTab() {
{(alert) => (
{/* Timestamp */}
|
@@ -4882,17 +4872,16 @@ function HistoryTab() {
{/* Type */}
|
{alert.type}
@@ -4901,11 +4890,10 @@ function HistoryTab() {
{/* Severity */}
|
{alert.level}
@@ -4927,13 +4915,12 @@ function HistoryTab() {
{/* Status */}
|
{alert.status}
@@ -4943,6 +4930,31 @@ function HistoryTab() {
|
{alert.node || '—'}
|
+
+ {/* Actions */}
+
+
+
+
+ |
)}
diff --git a/frontend-modern/src/stores/metricsHistory.ts b/frontend-modern/src/stores/metricsHistory.ts
index 60f17ce38..83737b534 100644
--- a/frontend-modern/src/stores/metricsHistory.ts
+++ b/frontend-modern/src/stores/metricsHistory.ts
@@ -6,6 +6,8 @@
*/
import { logger } from '@/utils/logger';
+import { ChartsAPI, type ChartData, type TimeRange } from '@/api/charts';
+import { buildMetricKey } from '@/utils/metricsKeys';
export interface MetricSnapshot {
timestamp: number; // Unix timestamp in ms
@@ -218,6 +220,184 @@ function debouncedSave() {
}, 5000); // Save 5 seconds after last change
}
+// Track if we've already seeded from backend to avoid redundant fetches
+let hasSeededFromBackend = false;
+let seedingPromise: Promise<void> | null = null;
+
+/**
+ * Seed metrics history from backend historical data.
+ * This provides immediate trend data instead of waiting for 30s samples.
+ * Called automatically when switching to sparklines/trends view.
+ */
+export async function seedFromBackend(range: TimeRange = '1h'): Promise<void> {
+ // Don't re-fetch if we've already seeded
+ if (hasSeededFromBackend) {
+ return;
+ }
+
+ // If already seeding, wait for that request
+ if (seedingPromise) {
+ return seedingPromise;
+ }
+
+ seedingPromise = (async () => {
+ try {
+ logger.info('[MetricsHistory] Seeding from backend', { range });
+ const response = await ChartsAPI.getCharts(range);
+
+ // Get current state to determine guest types
+ // Import dynamically to avoid circular dependency
+ const { getGlobalWebSocketStore } = await import('./websocket-global');
+ const wsStore = getGlobalWebSocketStore();
+
+ // Wait a bit for WebSocket state to populate if it's empty
+ let state = wsStore?.state;
+ if (!state?.vms?.length && !state?.containers?.length) {
+ // Wait up to 2 seconds for state to populate
+ for (let i = 0; i < 4; i++) {
+ await new Promise(resolve => setTimeout(resolve, 500));
+ state = wsStore?.state;
+ if (state?.vms?.length || state?.containers?.length) break;
+ }
+ }
+
+ const now = Date.now();
+ const cutoff = now - MAX_AGE_MS;
+ let seededCount = 0;
+
+ // Helper to convert backend ChartData to our MetricSnapshot format
+ const processChartData = (resourceId: string, chartData: ChartData) => {
+ const cpuPoints = chartData.cpu || [];
+ const memPoints = chartData.memory || [];
+ const diskPoints = chartData.disk || [];
+
+ // If no data, skip
+ if (cpuPoints.length === 0 && memPoints.length === 0 && diskPoints.length === 0) {
+ return;
+ }
+
+ // Get or create ring buffer
+ let ring = metricsHistoryMap.get(resourceId);
+ if (!ring) {
+ ring = createRingBuffer();
+ metricsHistoryMap.set(resourceId, ring);
+ }
+
+ // Find all unique timestamps across all metrics
+ const timestampSet = new Set<number>();
+ cpuPoints.forEach(p => timestampSet.add(p.timestamp));
+ memPoints.forEach(p => timestampSet.add(p.timestamp));
+ diskPoints.forEach(p => timestampSet.add(p.timestamp));
+
+ // Create lookup maps for efficient access
+ const cpuMap = new Map<number, number>(cpuPoints.map(p => [p.timestamp, p.value]));
+ const memMap = new Map<number, number>(memPoints.map(p => [p.timestamp, p.value]));
+ const diskMap = new Map<number, number>(diskPoints.map(p => [p.timestamp, p.value]));
+
+ // Sort timestamps and create snapshots
+ const timestamps = Array.from(timestampSet).sort((a, b) => a - b);
+
+ for (const ts of timestamps) {
+ // Skip if too old
+ if (ts < cutoff) continue;
+
+ // Skip if we already have data around this timestamp (within 15s)
+ let skipDuplicate = false;
+ for (let i = 0; i < ring.size; i++) {
+ const idx = (ring.head + i) % MAX_POINTS;
+ const existing = ring.buffer[idx];
+ if (existing && Math.abs(existing.timestamp - ts) < 15000) {
+ skipDuplicate = true;
+ break;
+ }
+ }
+ if (skipDuplicate) continue;
+
+ const snapshot: MetricSnapshot = {
+ timestamp: ts,
+ cpu: Math.round((cpuMap.get(ts) ?? 0) * 10) / 10,
+ memory: Math.round((memMap.get(ts) ?? 0) * 10) / 10,
+ disk: Math.round((diskMap.get(ts) ?? 0) * 10) / 10,
+ };
+
+ pushToRingBuffer(ring, snapshot);
+ seededCount++;
+ }
+ };
+
+ // Process VMs and containers
+ if (response.data) {
+ // Build a map from ID -> type using WebSocket state
+ const guestTypeMap = new Map<string, 'vm' | 'container'>();
+ if (state?.vms) {
+ for (const vm of state.vms) {
+ if (vm.id) guestTypeMap.set(vm.id, 'vm');
+ }
+ }
+ if (state?.containers) {
+ for (const ct of state.containers) {
+ if (ct.id) guestTypeMap.set(ct.id, 'container');
+ }
+ }
+
+ const backendIds = Object.keys(response.data);
+ const stateIds = Array.from(guestTypeMap.keys());
+
+ // Debug: Find IDs in backend but not in state
+ const missingInState = backendIds.filter(id => !guestTypeMap.has(id));
+
+ console.log('[SPARKLINE DEBUG] Backend chart IDs:', backendIds);
+ console.log('[SPARKLINE DEBUG] State guest IDs:', stateIds);
+ console.log('[SPARKLINE DEBUG] Missing in state (will be wrong type):', missingInState);
+
+ for (const [id, chartData] of Object.entries(response.data)) {
+ // Look up the guest type from state, default to 'vm' if unknown
+ const guestType = guestTypeMap.get(id) ?? 'vm';
+ const resourceKey = buildMetricKey(guestType, id);
+ processChartData(resourceKey, chartData as ChartData);
+ }
+ }
+
+ // Process nodes
+ if (response.nodeData) {
+ for (const [id, chartData] of Object.entries(response.nodeData)) {
+ const resourceKey = buildMetricKey('node', id);
+ processChartData(resourceKey, chartData as ChartData);
+ }
+ }
+
+
+ hasSeededFromBackend = true;
+ logger.info('[MetricsHistory] Seeded from backend', { seededCount, totalResources: metricsHistoryMap.size });
+
+ // Save to localStorage
+ saveToLocalStorage();
+ } catch (error) {
+ logger.error('[MetricsHistory] Failed to seed from backend', { error });
+ // Don't throw - gracefully degrade to client-side sampling
+ } finally {
+ seedingPromise = null;
+ }
+ })();
+
+ return seedingPromise;
+}
+
+/**
+ * Force re-seed from backend (useful when range changes)
+ */
+export function resetSeedingState(): void {
+ hasSeededFromBackend = false;
+}
+
+/**
+ * Check if we have seeded from backend
+ */
+export function hasSeedData(): boolean {
+ return hasSeededFromBackend;
+}
+
+
/**
* Get metric history for a resource
*/
diff --git a/frontend-modern/src/stores/metricsViewMode.ts b/frontend-modern/src/stores/metricsViewMode.ts
index 4d0df7edd..df157e983 100644
--- a/frontend-modern/src/stores/metricsViewMode.ts
+++ b/frontend-modern/src/stores/metricsViewMode.ts
@@ -7,6 +7,7 @@
import { createSignal } from 'solid-js';
import { STORAGE_KEYS } from '@/utils/localStorage';
+import { seedFromBackend } from './metricsHistory';
export type MetricsViewMode = 'bars' | 'sparklines';
@@ -52,6 +53,14 @@ export function setMetricsViewModePreference(mode: MetricsViewMode): void {
console.warn('Failed to save metrics view mode preference', err);
}
}
+
+ // When switching to sparklines, seed historical data from backend
+ if (mode === 'sparklines') {
+ // Fire and forget - don't block the UI
+ seedFromBackend('1h').catch(() => {
+ // Errors are already logged in seedFromBackend
+ });
+ }
}
/**
diff --git a/frontend-modern/src/styles/animations.css b/frontend-modern/src/styles/animations.css
index 3668a5504..88e397a4e 100644
--- a/frontend-modern/src/styles/animations.css
+++ b/frontend-modern/src/styles/animations.css
@@ -210,3 +210,179 @@
.node-click {
animation: nodeClick 0.15s ease-out;
}
+
+/* AI Context row highlight with mergeable borders */
+/* Using purple-600 (147, 51, 234) for a true purple */
+
+/* Full border - single row or first/last of group */
+@keyframes ai-context-pulse-full {
+ 0%, 100% {
+ background-color: rgba(147, 51, 234, 0.08);
+ box-shadow:
+ inset 2px 0 0 0 rgba(147, 51, 234, 0.4),
+ inset -2px 0 0 0 rgba(147, 51, 234, 0.4),
+ inset 0 2px 0 0 rgba(147, 51, 234, 0.4),
+ inset 0 -2px 0 0 rgba(147, 51, 234, 0.4);
+ }
+ 50% {
+ background-color: rgba(147, 51, 234, 0.16);
+ box-shadow:
+ inset 2px 0 0 0 rgba(147, 51, 234, 0.8),
+ inset -2px 0 0 0 rgba(147, 51, 234, 0.8),
+ inset 0 2px 0 0 rgba(147, 51, 234, 0.8),
+ inset 0 -2px 0 0 rgba(147, 51, 234, 0.8);
+ }
+}
+
+/* No top border - middle or bottom of group */
+@keyframes ai-context-pulse-no-top {
+ 0%, 100% {
+ background-color: rgba(147, 51, 234, 0.08);
+ box-shadow:
+ inset 2px 0 0 0 rgba(147, 51, 234, 0.4),
+ inset -2px 0 0 0 rgba(147, 51, 234, 0.4),
+ inset 0 -2px 0 0 rgba(147, 51, 234, 0.4);
+ }
+ 50% {
+ background-color: rgba(147, 51, 234, 0.16);
+ box-shadow:
+ inset 2px 0 0 0 rgba(147, 51, 234, 0.8),
+ inset -2px 0 0 0 rgba(147, 51, 234, 0.8),
+ inset 0 -2px 0 0 rgba(147, 51, 234, 0.8);
+ }
+}
+
+/* No bottom border - top of group */
+@keyframes ai-context-pulse-no-bottom {
+ 0%, 100% {
+ background-color: rgba(147, 51, 234, 0.08);
+ box-shadow:
+ inset 2px 0 0 0 rgba(147, 51, 234, 0.4),
+ inset -2px 0 0 0 rgba(147, 51, 234, 0.4),
+ inset 0 2px 0 0 rgba(147, 51, 234, 0.4);
+ }
+ 50% {
+ background-color: rgba(147, 51, 234, 0.16);
+ box-shadow:
+ inset 2px 0 0 0 rgba(147, 51, 234, 0.8),
+ inset -2px 0 0 0 rgba(147, 51, 234, 0.8),
+ inset 0 2px 0 0 rgba(147, 51, 234, 0.8);
+ }
+}
+
+/* Side borders only - middle of group */
+@keyframes ai-context-pulse-sides {
+ 0%, 100% {
+ background-color: rgba(147, 51, 234, 0.08);
+ box-shadow:
+ inset 2px 0 0 0 rgba(147, 51, 234, 0.4),
+ inset -2px 0 0 0 rgba(147, 51, 234, 0.4);
+ }
+ 50% {
+ background-color: rgba(147, 51, 234, 0.16);
+ box-shadow:
+ inset 2px 0 0 0 rgba(147, 51, 234, 0.8),
+ inset -2px 0 0 0 rgba(147, 51, 234, 0.8);
+ }
+}
+
+.ai-context-row {
+ animation: ai-context-pulse-full 2s ease-in-out infinite;
+}
+
+.ai-context-row.ai-context-no-top {
+ animation: ai-context-pulse-no-top 2s ease-in-out infinite;
+}
+
+.ai-context-row.ai-context-no-bottom {
+ animation: ai-context-pulse-no-bottom 2s ease-in-out infinite;
+}
+
+.ai-context-row.ai-context-no-top.ai-context-no-bottom {
+ animation: ai-context-pulse-sides 2s ease-in-out infinite;
+}
+
+/* Dark mode - using purple-400 (167, 139, 250) */
+@keyframes ai-context-pulse-full-dark {
+ 0%, 100% {
+ background-color: rgba(147, 51, 234, 0.12);
+ box-shadow:
+ inset 2px 0 0 0 rgba(167, 139, 250, 0.5),
+ inset -2px 0 0 0 rgba(167, 139, 250, 0.5),
+ inset 0 2px 0 0 rgba(167, 139, 250, 0.5),
+ inset 0 -2px 0 0 rgba(167, 139, 250, 0.5);
+ }
+ 50% {
+ background-color: rgba(147, 51, 234, 0.24);
+ box-shadow:
+ inset 2px 0 0 0 rgba(167, 139, 250, 0.9),
+ inset -2px 0 0 0 rgba(167, 139, 250, 0.9),
+ inset 0 2px 0 0 rgba(167, 139, 250, 0.9),
+ inset 0 -2px 0 0 rgba(167, 139, 250, 0.9);
+ }
+}
+
+@keyframes ai-context-pulse-no-top-dark {
+ 0%, 100% {
+ background-color: rgba(147, 51, 234, 0.12);
+ box-shadow:
+ inset 2px 0 0 0 rgba(167, 139, 250, 0.5),
+ inset -2px 0 0 0 rgba(167, 139, 250, 0.5),
+ inset 0 -2px 0 0 rgba(167, 139, 250, 0.5);
+ }
+ 50% {
+ background-color: rgba(147, 51, 234, 0.24);
+ box-shadow:
+ inset 2px 0 0 0 rgba(167, 139, 250, 0.9),
+ inset -2px 0 0 0 rgba(167, 139, 250, 0.9),
+ inset 0 -2px 0 0 rgba(167, 139, 250, 0.9);
+ }
+}
+
+@keyframes ai-context-pulse-no-bottom-dark {
+ 0%, 100% {
+ background-color: rgba(147, 51, 234, 0.12);
+ box-shadow:
+ inset 2px 0 0 0 rgba(167, 139, 250, 0.5),
+ inset -2px 0 0 0 rgba(167, 139, 250, 0.5),
+ inset 0 2px 0 0 rgba(167, 139, 250, 0.5);
+ }
+ 50% {
+ background-color: rgba(147, 51, 234, 0.24);
+ box-shadow:
+ inset 2px 0 0 0 rgba(167, 139, 250, 0.9),
+ inset -2px 0 0 0 rgba(167, 139, 250, 0.9),
+ inset 0 2px 0 0 rgba(167, 139, 250, 0.9);
+ }
+}
+
+@keyframes ai-context-pulse-sides-dark {
+ 0%, 100% {
+ background-color: rgba(147, 51, 234, 0.12);
+ box-shadow:
+ inset 2px 0 0 0 rgba(167, 139, 250, 0.5),
+ inset -2px 0 0 0 rgba(167, 139, 250, 0.5);
+ }
+ 50% {
+ background-color: rgba(147, 51, 234, 0.24);
+ box-shadow:
+ inset 2px 0 0 0 rgba(167, 139, 250, 0.9),
+ inset -2px 0 0 0 rgba(167, 139, 250, 0.9);
+ }
+}
+
+.dark .ai-context-row {
+ animation: ai-context-pulse-full-dark 2s ease-in-out infinite;
+}
+
+.dark .ai-context-row.ai-context-no-top {
+ animation: ai-context-pulse-no-top-dark 2s ease-in-out infinite;
+}
+
+.dark .ai-context-row.ai-context-no-bottom {
+ animation: ai-context-pulse-no-bottom-dark 2s ease-in-out infinite;
+}
+
+.dark .ai-context-row.ai-context-no-top.ai-context-no-bottom {
+ animation: ai-context-pulse-sides-dark 2s ease-in-out infinite;
+}
diff --git a/frontend-modern/src/types/ai.ts b/frontend-modern/src/types/ai.ts
index 543d9e699..fa602e5f2 100644
--- a/frontend-modern/src/types/ai.ts
+++ b/frontend-modern/src/types/ai.ts
@@ -1,6 +1,6 @@
// AI feature types
-export type AIProvider = 'anthropic' | 'openai' | 'ollama';
+export type AIProvider = 'anthropic' | 'openai' | 'ollama' | 'deepseek';
export interface AISettings {
enabled: boolean;
@@ -34,6 +34,7 @@ export const DEFAULT_MODELS: Record<AIProvider, string> = {
anthropic: 'claude-opus-4-5-20251101',
openai: 'gpt-4o',
ollama: 'llama3',
+ deepseek: 'deepseek-reasoner',
};
// Provider display names
@@ -41,6 +42,7 @@ export const PROVIDER_NAMES: Record<AIProvider, string> = {
anthropic: 'Anthropic',
openai: 'OpenAI',
ollama: 'Ollama',
+ deepseek: 'DeepSeek',
};
// Provider descriptions
@@ -48,6 +50,7 @@ export const PROVIDER_DESCRIPTIONS: Record<AIProvider, string> = {
anthropic: 'Claude models from Anthropic',
openai: 'GPT models from OpenAI',
ollama: 'Local models via Ollama',
+ deepseek: 'DeepSeek reasoning models',
};
// Conversation history for multi-turn chats
@@ -82,7 +85,7 @@ export interface AIExecuteResponse {
}
// Streaming event types
-export type AIStreamEventType = 'tool_start' | 'tool_end' | 'content' | 'done' | 'error' | 'complete' | 'approval_needed';
+export type AIStreamEventType = 'tool_start' | 'tool_end' | 'content' | 'thinking' | 'done' | 'error' | 'complete' | 'approval_needed' | 'processing';
export interface AIStreamToolStartData {
name: string;
@@ -101,8 +104,10 @@ export interface AIStreamApprovalNeededData {
tool_id: string;
tool_name: string;
run_on_host: boolean;
+ target_host?: string; // Explicit host to route the command to
}
+
export interface AIStreamEvent {
type: AIStreamEventType;
data?: string | AIStreamToolStartData | AIStreamToolEndData | AIStreamCompleteData | AIStreamApprovalNeededData;
diff --git a/frontend-modern/src/utils/apiClient.ts b/frontend-modern/src/utils/apiClient.ts
index eb0514f90..089e0e7b0 100644
--- a/frontend-modern/src/utils/apiClient.ts
+++ b/frontend-modern/src/utils/apiClient.ts
@@ -181,6 +181,28 @@ class ApiClient {
return false;
}
+ // Ensure CSRF token is available by making a GET request if needed
+ // The backend issues CSRF cookies on GET requests to /api/* endpoints
+ private async ensureCSRFToken(): Promise<string | null> {
+ try {
+ // Make a simple GET request to trigger CSRF cookie issuance
+ const response = await fetch('/api/health', {
+ method: 'GET',
+ credentials: 'include',
+ });
+
+ // The response should have set the pulse_csrf cookie
+ if (response.ok) {
+ // Small delay to ensure cookie is set
+ await new Promise(resolve => setTimeout(resolve, 10));
+ return this.loadCSRFToken();
+ }
+ } catch (err) {
+ logger.warn('Failed to fetch CSRF token', err);
+ }
+ return null;
+ }
+
// Main fetch wrapper that adds authentication
async fetch(url: string, options: FetchOptions = {}): Promise<Response> {
const { skipAuth = false, headers = {}, ...fetchOptions } = options;
@@ -206,7 +228,12 @@ class ApiClient {
// Add CSRF token for state-changing requests
const method = (fetchOptions.method || 'GET').toUpperCase();
if (method !== 'GET' && method !== 'HEAD' && method !== 'OPTIONS') {
- const token = this.loadCSRFToken();
+ // Try to get CSRF token, or fetch one if missing
+ let token = this.loadCSRFToken();
+ if (!token) {
+ // No CSRF token available - try to get one by making a GET request
+ token = await this.ensureCSRFToken();
+ }
if (token) {
finalHeaders['X-CSRF-Token'] = token;
}
@@ -223,7 +250,7 @@ class ApiClient {
// If we get a 401 on an API call (not during initial auth check), redirect to login
// Skip redirect for specific auth-check endpoints to avoid loops
- if (response.status === 401 && !url.includes('/api/security/status') && !url.includes('/api/state')) {
+ if (response.status === 401 && !url.includes('/api/security/status') && !url.includes('/api/state') && !url.includes('/api/settings/ai')) {
logger.warn('Authentication expired - redirecting to login');
// Clear auth and redirect to login
if (typeof window !== 'undefined') {
@@ -234,13 +261,15 @@ class ApiClient {
return response;
}
- // Handle CSRF token failures
+ // Handle CSRF token failures - the 403 response should have set a new CSRF cookie
if (response.status === 403) {
- const csrfHeader = response.headers.get('X-CSRF-Token');
- let refreshedToken: string | null = null;
- if (csrfHeader) {
- refreshedToken = csrfHeader;
- } else {
+ // First try the response header (backend sends new token in X-CSRF-Token header)
+ let refreshedToken = response.headers.get('X-CSRF-Token');
+
+ // If not in header, reload from cookie (backend also sets pulse_csrf cookie on 403)
+ if (!refreshedToken) {
+ // Force reload from cookie - the 403 response just set it
+ this.csrfToken = null;
refreshedToken = this.loadCSRFToken();
}
diff --git a/frontend-modern/vite.config.ts b/frontend-modern/vite.config.ts
index e8ce005c5..c5b7eb3e7 100644
--- a/frontend-modern/vite.config.ts
+++ b/frontend-modern/vite.config.ts
@@ -70,20 +70,25 @@ export default defineConfig({
});
},
},
+ // SSE endpoint for AI chat streaming
'/api/ai/execute/stream': {
target: backendUrl,
changeOrigin: true,
// SSE requires special handling to prevent proxy timeouts
- // Set timeout to 10 minutes (600000ms) for long-running AI requests
- timeout: 600000,
- proxyTimeout: 600000,
+ // Set timeout to 0 to completely disable
+ timeout: 0,
+ proxyTimeout: 0,
configure: (proxy, _options) => {
+ // Completely disable http-proxy internal timeouts
+ proxy.options.timeout = 0;
+ proxy.options.proxyTimeout = 0;
+
// Set proxy-level timeouts
proxy.on('proxyReq', (proxyReq, req, res) => {
// Disable socket timeouts for SSE
req.socket.setTimeout(0);
req.socket.setNoDelay(true);
- req.socket.setKeepAlive(true);
+ req.socket.setKeepAlive(true, 30000);
// Also set on the proxy request
proxyReq.socket?.setTimeout(0);
});
@@ -91,7 +96,7 @@ export default defineConfig({
// Disable response socket timeout
res.socket?.setTimeout(0);
res.socket?.setNoDelay(true);
- res.socket?.setKeepAlive(true);
+ res.socket?.setKeepAlive(true, 30000);
// Also disable on proxy response socket
proxyRes.socket?.setTimeout(0);
});
@@ -100,6 +105,34 @@ export default defineConfig({
});
},
},
+ // SSE endpoint for AI alert investigation (one-click investigate from alerts page)
+ '/api/ai/investigate-alert': {
+ target: backendUrl,
+ changeOrigin: true,
+ // SSE requires special handling to prevent proxy timeouts
+ timeout: 0,
+ proxyTimeout: 0,
+ configure: (proxy, _options) => {
+ proxy.options.timeout = 0;
+ proxy.options.proxyTimeout = 0;
+
+ proxy.on('proxyReq', (proxyReq, req, res) => {
+ req.socket.setTimeout(0);
+ req.socket.setNoDelay(true);
+ req.socket.setKeepAlive(true, 30000);
+ proxyReq.socket?.setTimeout(0);
+ });
+ proxy.on('proxyRes', (proxyRes, req, res) => {
+ res.socket?.setTimeout(0);
+ res.socket?.setNoDelay(true);
+ res.socket?.setKeepAlive(true, 30000);
+ proxyRes.socket?.setTimeout(0);
+ });
+ proxy.on('error', (err, req, res) => {
+ console.error('[SSE Proxy Error - Investigate Alert]', err.message);
+ });
+ },
+ },
'/api/agent/ws': {
target: backendWsUrl,
ws: true,
diff --git a/go.mod b/go.mod
index 8fe201ae1..2fdd72927 100644
--- a/go.mod
+++ b/go.mod
@@ -40,6 +40,7 @@ require (
github.com/distribution/reference v0.6.0 // indirect
github.com/docker/go-connections v0.6.0 // indirect
github.com/docker/go-units v0.5.0 // indirect
+ github.com/dustin/go-humanize v1.0.1 // indirect
github.com/ebitengine/purego v0.9.1 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-jose/go-jose/v4 v4.1.3 // indirect
@@ -58,6 +59,7 @@ require (
github.com/moby/term v0.5.2 // indirect
github.com/morikuni/aec v1.0.0 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+ github.com/ncruces/go-strftime v0.1.9 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
@@ -65,6 +67,7 @@ require (
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
github.com/prometheus/common v0.67.4 // indirect
github.com/prometheus/procfs v0.19.2 // indirect
+ github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/spf13/pflag v1.0.10 // indirect
github.com/tklauser/go-sysconf v0.3.16 // indirect
github.com/tklauser/numcpus v0.11.0 // indirect
@@ -76,7 +79,12 @@ require (
go.opentelemetry.io/otel/metric v1.38.0 // indirect
go.opentelemetry.io/otel/trace v1.38.0 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
+ golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect
google.golang.org/grpc v1.75.1 // indirect
google.golang.org/protobuf v1.36.10 // indirect
gotest.tools/v3 v3.5.2 // indirect
+ modernc.org/libc v1.66.10 // indirect
+ modernc.org/mathutil v1.7.1 // indirect
+ modernc.org/memory v1.11.0 // indirect
+ modernc.org/sqlite v1.40.1 // indirect
)
diff --git a/go.sum b/go.sum
index 39ca4b27c..fcd51f985 100644
--- a/go.sum
+++ b/go.sum
@@ -28,6 +28,8 @@ github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pM
github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE=
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
+github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
+github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/ebitengine/purego v0.9.1 h1:a/k2f2HQU3Pi399RPW1MOaZyhKJL9w/xFpKAg4q1s0A=
github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
@@ -88,6 +90,8 @@ github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
+github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4=
+github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/oklog/ulid/v2 v2.1.1 h1:suPZ4ARWLOJLegGFiZZ1dFAkqzhMjL3J1TzI+5wHz8s=
github.com/oklog/ulid/v2 v2.1.1/go.mod h1:rcEKHmBBKfef9DhnvX7y1HZBYxjXb0cP5ExxNsTT1QQ=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
@@ -109,6 +113,8 @@ github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+L
github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI=
github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws=
github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw=
+github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
+github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 h1:18kd+8ZUlt/ARXhljq+14TwAoKa61q6dX8jtwOf6DH8=
@@ -162,6 +168,8 @@ go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q=
golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4=
+golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o=
+golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8=
golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo=
@@ -200,3 +208,11 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q=
gotest.tools/v3 v3.5.2/go.mod h1:LtdLGcnqToBH83WByAAi/wiwSFCArdFIUV/xxN4pcjA=
+modernc.org/libc v1.66.10 h1:yZkb3YeLx4oynyR+iUsXsybsX4Ubx7MQlSYEw4yj59A=
+modernc.org/libc v1.66.10/go.mod h1:8vGSEwvoUoltr4dlywvHqjtAqHBaw0j1jI7iFBTAr2I=
+modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
+modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
+modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
+modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
+modernc.org/sqlite v1.40.1 h1:VfuXcxcUWWKRBuP8+BR9L7VnmusMgBNNnBYGEe9w/iY=
+modernc.org/sqlite v1.40.1/go.mod h1:9fjQZ0mB1LLP0GYrp39oOJXx/I2sxEnZtzCmEQIKvGE=
diff --git a/internal/ai/alert_adapter.go b/internal/ai/alert_adapter.go
new file mode 100644
index 000000000..b1c8a39ad
--- /dev/null
+++ b/internal/ai/alert_adapter.go
@@ -0,0 +1,216 @@
+package ai
+
+import (
+ "fmt"
+ "time"
+
+ "github.com/rcourtman/pulse-go-rewrite/internal/alerts"
+ "github.com/rcourtman/pulse-go-rewrite/internal/models"
+)
+
+// AlertManagerAdapter adapts the alerts.Manager to the AI's AlertProvider interface
+type AlertManagerAdapter struct {
+ manager *alerts.Manager
+}
+
+// NewAlertManagerAdapter creates a new adapter for the alert manager
+func NewAlertManagerAdapter(manager *alerts.Manager) *AlertManagerAdapter {
+ return &AlertManagerAdapter{manager: manager}
+}
+
+// GetActiveAlerts returns all currently active alerts
+func (a *AlertManagerAdapter) GetActiveAlerts() []AlertInfo {
+ if a.manager == nil {
+ return nil
+ }
+
+ activeAlerts := a.manager.GetActiveAlerts()
+ result := make([]AlertInfo, 0, len(activeAlerts))
+
+ for _, alert := range activeAlerts {
+ result = append(result, convertAlertFromManager(&alert))
+ }
+
+ return result
+}
+
+// GetRecentlyResolved returns alerts resolved in the last N minutes
+func (a *AlertManagerAdapter) GetRecentlyResolved(minutes int) []ResolvedAlertInfo {
+ if a.manager == nil {
+ return nil
+ }
+
+ resolvedAlerts := a.manager.GetRecentlyResolved()
+ cutoff := time.Now().Add(-time.Duration(minutes) * time.Minute)
+ result := make([]ResolvedAlertInfo, 0)
+
+ for _, resolved := range resolvedAlerts {
+ if resolved.ResolvedTime.After(cutoff) {
+ info := ResolvedAlertInfo{
+ AlertInfo: convertAlertFromModels(&resolved.Alert),
+ ResolvedTime: resolved.ResolvedTime,
+ Duration: formatDuration(resolved.ResolvedTime.Sub(resolved.StartTime)),
+ }
+ result = append(result, info)
+ }
+ }
+
+ return result
+}
+
+// GetAlertsByResource returns active alerts for a specific resource
+func (a *AlertManagerAdapter) GetAlertsByResource(resourceID string) []AlertInfo {
+ if a.manager == nil {
+ return nil
+ }
+
+ activeAlerts := a.manager.GetActiveAlerts()
+ result := make([]AlertInfo, 0)
+
+ for _, alert := range activeAlerts {
+ if alert.ResourceID == resourceID {
+ result = append(result, convertAlertFromManager(&alert))
+ }
+ }
+
+ return result
+}
+
+// GetAlertHistory returns historical alerts for a resource
+func (a *AlertManagerAdapter) GetAlertHistory(resourceID string, limit int) []ResolvedAlertInfo {
+ if a.manager == nil {
+ return nil
+ }
+
+ // Get from recently resolved and filter by resource
+ resolvedAlerts := a.manager.GetRecentlyResolved()
+ result := make([]ResolvedAlertInfo, 0)
+
+ for _, resolved := range resolvedAlerts {
+ if resolved.ResourceID == resourceID {
+ info := ResolvedAlertInfo{
+ AlertInfo: convertAlertFromModels(&resolved.Alert),
+ ResolvedTime: resolved.ResolvedTime,
+ Duration: formatDuration(resolved.ResolvedTime.Sub(resolved.StartTime)),
+ }
+ result = append(result, info)
+ if len(result) >= limit {
+ break
+ }
+ }
+ }
+
+ return result
+}
+
+// convertAlertFromManager converts an alerts.Alert to AI's AlertInfo
+func convertAlertFromManager(alert *alerts.Alert) AlertInfo {
+ if alert == nil {
+ return AlertInfo{}
+ }
+
+ resourceType := inferResourceType(alert.Type, alert.Metadata)
+
+ return AlertInfo{
+ ID: alert.ID,
+ Type: alert.Type,
+ Level: string(alert.Level),
+ ResourceID: alert.ResourceID,
+ ResourceName: alert.ResourceName,
+ ResourceType: resourceType,
+ Node: alert.Node,
+ Instance: alert.Instance,
+ Message: alert.Message,
+ Value: alert.Value,
+ Threshold: alert.Threshold,
+ StartTime: alert.StartTime,
+ Duration: formatDuration(time.Since(alert.StartTime)),
+ Acknowledged: alert.Acknowledged,
+ }
+}
+
+// convertAlertFromModels converts a models.Alert to AI's AlertInfo
+func convertAlertFromModels(alert *models.Alert) AlertInfo {
+ if alert == nil {
+ return AlertInfo{}
+ }
+
+ resourceType := inferResourceType(alert.Type, nil)
+
+ return AlertInfo{
+ ID: alert.ID,
+ Type: alert.Type,
+ Level: alert.Level,
+ ResourceID: alert.ResourceID,
+ ResourceName: alert.ResourceName,
+ ResourceType: resourceType,
+ Node: alert.Node,
+ Instance: alert.Instance,
+ Message: alert.Message,
+ Value: alert.Value,
+ Threshold: alert.Threshold,
+ StartTime: alert.StartTime,
+ Duration: formatDuration(time.Since(alert.StartTime)),
+ Acknowledged: alert.Acknowledged,
+ }
+}
+
+// inferResourceType infers resource type from alert type
+func inferResourceType(alertType string, metadata map[string]interface{}) string {
+ if metadata != nil {
+ if rt, ok := metadata["resourceType"].(string); ok {
+ return rt
+ }
+ }
+
+ switch {
+ case alertType == "node_offline" || alertType == "node_cpu" || alertType == "node_memory" || alertType == "node_temperature":
+ return "node"
+ case alertType == "storage_usage" || alertType == "storage":
+ return "storage"
+ case alertType == "docker_cpu" || alertType == "docker_memory" || alertType == "docker_restart" || alertType == "docker_offline":
+ return "docker"
+ case alertType == "host_cpu" || alertType == "host_memory" || alertType == "host_offline" || alertType == "host_disk":
+ return "host"
+ case alertType == "pmg" || alertType == "pmg_queue" || alertType == "pmg_quarantine":
+ return "pmg"
+ case alertType == "backup" || alertType == "backup_missing":
+ return "backup"
+ case alertType == "snapshot" || alertType == "snapshot_age":
+ return "snapshot"
+ default:
+ return "guest"
+ }
+}
+
+// formatDuration returns a human-readable duration string
+func formatDuration(d time.Duration) string {
+ if d < time.Minute {
+ return "< 1 min"
+ } else if d < time.Hour {
+ mins := int(d.Minutes())
+ if mins == 1 {
+ return "1 min"
+ }
+ return fmt.Sprintf("%d mins", mins)
+ } else if d < 24*time.Hour {
+ hours := int(d.Hours())
+ mins := int(d.Minutes()) % 60
+ if mins > 0 {
+ return fmt.Sprintf("%dh %dm", hours, mins)
+ }
+ if hours == 1 {
+ return "1 hour"
+ }
+ return fmt.Sprintf("%d hours", hours)
+ }
+ days := int(d.Hours() / 24)
+ hours := int(d.Hours()) % 24
+ if hours > 0 {
+ return fmt.Sprintf("%dd %dh", days, hours)
+ }
+ if days == 1 {
+ return "1 day"
+ }
+ return fmt.Sprintf("%d days", days)
+}
diff --git a/internal/ai/alert_provider.go b/internal/ai/alert_provider.go
new file mode 100644
index 000000000..3034b7cdc
--- /dev/null
+++ b/internal/ai/alert_provider.go
@@ -0,0 +1,237 @@
+// Package ai provides AI-powered infrastructure investigation and remediation.
+package ai
+
+import (
+ "fmt"
+ "strings"
+ "time"
+)
+
// AlertInfo contains information about an alert for AI context.
// Value and Threshold are rendered with a "%" suffix by formatAlertForAI,
// so they are expected to be percent-scaled metrics
// (NOTE(review): confirm for non-metric alert types such as "offline").
type AlertInfo struct {
	ID           string    `json:"id"`
	Type         string    `json:"type"`          // cpu, memory, disk, offline, etc.
	Level        string    `json:"level"`         // warning, critical
	ResourceID   string    `json:"resource_id"`   // unique resource identifier
	ResourceName string    `json:"resource_name"` // human-readable name
	ResourceType string    `json:"resource_type"` // guest, node, storage, docker, etc.
	Node         string    `json:"node"`          // PVE node (if applicable)
	Instance     string    `json:"instance"`      // Proxmox instance name
	Message      string    `json:"message"`       // Alert description
	Value        float64   `json:"value"`         // Current metric value
	Threshold    float64   `json:"threshold"`     // Threshold that was exceeded
	StartTime    time.Time `json:"start_time"`    // When alert started
	Duration     string    `json:"duration"`      // Human-readable duration (pre-formatted by the producer)
	Acknowledged bool      `json:"acknowledged"`  // Whether alert has been acked
}
+
// ResolvedAlertInfo contains information about a recently resolved alert.
// Note: the outer Duration field shadows the embedded AlertInfo.Duration in
// Go field access, and its JSON key is "total_duration" rather than the
// embedded "duration".
type ResolvedAlertInfo struct {
	AlertInfo
	ResolvedTime time.Time `json:"resolved_time"` // when the alert cleared
	Duration     string    `json:"total_duration"` // How long the alert lasted
}
+
// AlertProvider provides access to the current alert state.
// Implementations are read from the AI service's context builders, which may
// run concurrently with monitoring updates — NOTE(review): implementations
// are presumably expected to be safe for concurrent use; confirm.
type AlertProvider interface {
	// GetActiveAlerts returns all currently active alerts
	GetActiveAlerts() []AlertInfo

	// GetRecentlyResolved returns alerts resolved in the last N minutes
	GetRecentlyResolved(minutes int) []ResolvedAlertInfo

	// GetAlertsByResource returns active alerts for a specific resource
	GetAlertsByResource(resourceID string) []AlertInfo

	// GetAlertHistory returns historical alerts for a resource
	GetAlertHistory(resourceID string, limit int) []ResolvedAlertInfo
}
+
// SetAlertProvider sets the alert provider used when building AI context.
// Takes the service's write lock so it is safe to call while context
// builders are reading the provider under RLock.
func (s *Service) SetAlertProvider(ap AlertProvider) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.alertProvider = ap
}
+
// buildAlertContext generates a markdown "Alert Status" section for the AI
// prompt from the configured AlertProvider: active alerts grouped by
// severity, followed by up to five recently resolved ones. Returns "" when
// no provider is set or there is nothing to report.
func (s *Service) buildAlertContext() string {
	// Snapshot the provider under the read lock; the provider's own methods
	// are called outside the lock.
	s.mu.RLock()
	ap := s.alertProvider
	s.mu.RUnlock()

	if ap == nil {
		return ""
	}

	activeAlerts := ap.GetActiveAlerts()
	recentlyResolved := ap.GetRecentlyResolved(30) // Last 30 minutes

	// Nothing active and nothing recently resolved: omit the section entirely.
	if len(activeAlerts) == 0 && len(recentlyResolved) == 0 {
		return ""
	}

	var sections []string
	sections = append(sections, "\n## Alert Status")

	// Active alerts
	if len(activeAlerts) > 0 {
		sections = append(sections, "\n### Active Alerts")
		sections = append(sections, fmt.Sprintf("There are **%d active alert(s)** that may need attention:\n", len(activeAlerts)))

		// Group by severity: anything not "critical" is treated as a warning.
		var critical, warning []AlertInfo
		for _, a := range activeAlerts {
			if a.Level == "critical" {
				critical = append(critical, a)
			} else {
				warning = append(warning, a)
			}
		}

		if len(critical) > 0 {
			sections = append(sections, "**Critical:**")
			for _, a := range critical {
				sections = append(sections, formatAlertForAI(a))
			}
		}

		if len(warning) > 0 {
			sections = append(sections, "**Warning:**")
			for _, a := range warning {
				sections = append(sections, formatAlertForAI(a))
			}
		}
	} else {
		sections = append(sections, "\n### No Active Alerts")
		sections = append(sections, "All systems are operating within normal thresholds.")
	}

	// Recently resolved
	if len(recentlyResolved) > 0 {
		sections = append(sections, fmt.Sprintf("\n### Recently Resolved (%d)", len(recentlyResolved)))
		sections = append(sections, "These alerts were resolved in the last 30 minutes:")
		// Show up to 5 most recent.
		// NOTE(review): assumes the provider returns newest-first ordering —
		// confirm against the AlertProvider implementation.
		limit := 5
		if len(recentlyResolved) < limit {
			limit = len(recentlyResolved)
		}
		for i := 0; i < limit; i++ {
			a := recentlyResolved[i]
			sections = append(sections, fmt.Sprintf("- **%s** on %s: %s (lasted %s, resolved %s ago)",
				a.Type, a.ResourceName, a.Message, a.Duration,
				formatTimeAgo(a.ResolvedTime)))
		}
		if len(recentlyResolved) > limit {
			sections = append(sections, fmt.Sprintf("  ... and %d more", len(recentlyResolved)-limit))
		}
	}

	return strings.Join(sections, "\n")
}
+
+// buildTargetAlertContext builds alert context for a specific target
+func (s *Service) buildTargetAlertContext(resourceID string) string {
+ s.mu.RLock()
+ ap := s.alertProvider
+ s.mu.RUnlock()
+
+ if ap == nil || resourceID == "" {
+ return ""
+ }
+
+ alerts := ap.GetAlertsByResource(resourceID)
+ if len(alerts) == 0 {
+ return ""
+ }
+
+ var lines []string
+ lines = append(lines, "\n### Active Alerts for This Resource")
+ for _, a := range alerts {
+ lines = append(lines, formatAlertForAI(a))
+ }
+
+ return strings.Join(lines, "\n")
+}
+
+// formatAlertForAI formats an alert for inclusion in AI context
+func formatAlertForAI(a AlertInfo) string {
+ ackedNote := ""
+ if a.Acknowledged {
+ ackedNote = " [ACKNOWLEDGED]"
+ }
+
+ nodeInfo := ""
+ if a.Node != "" {
+ nodeInfo = fmt.Sprintf(" on node %s", a.Node)
+ }
+
+ return fmt.Sprintf("- **%s** %s: %s (current: %.1f%%, threshold: %.1f%%) - active for %s%s%s",
+ strings.ToUpper(a.Level), a.Type, a.ResourceName,
+ a.Value, a.Threshold, a.Duration, nodeInfo, ackedNote)
+}
+
+// formatTimeAgo returns a human-readable time-ago string
+func formatTimeAgo(t time.Time) string {
+ d := time.Since(t)
+ if d < time.Minute {
+ return "just now"
+ } else if d < time.Hour {
+ mins := int(d.Minutes())
+ if mins == 1 {
+ return "1 minute"
+ }
+ return fmt.Sprintf("%d minutes", mins)
+ } else if d < 24*time.Hour {
+ hours := int(d.Hours())
+ if hours == 1 {
+ return "1 hour"
+ }
+ return fmt.Sprintf("%d hours", hours)
+ }
+ days := int(d.Hours() / 24)
+ if days == 1 {
+ return "1 day"
+ }
+ return fmt.Sprintf("%d days", days)
+}
+
// AlertInvestigationRequest represents a request to investigate an alert.
// It carries everything needed to build a focused investigation prompt;
// see GenerateAlertInvestigationPrompt.
type AlertInvestigationRequest struct {
	AlertID      string  `json:"alert_id"`
	ResourceID   string  `json:"resource_id"`
	ResourceName string  `json:"resource_name"`
	ResourceType string  `json:"resource_type"` // guest, node, storage, docker
	AlertType    string  `json:"alert_type"`    // cpu, memory, disk, offline, etc.
	Level        string  `json:"level"`         // warning, critical
	Value        float64 `json:"value"`         // current metric value (rendered as a percentage)
	Threshold    float64 `json:"threshold"`     // threshold that was exceeded (rendered as a percentage)
	Message      string  `json:"message"`       // alert description text
	Duration     string  `json:"duration"`      // How long the alert has been active
	Node         string  `json:"node,omitempty"`
	VMID         int     `json:"vmid,omitempty"` // Proxmox VM/CT ID when the resource is a guest
}
+
+// GenerateAlertInvestigationPrompt creates a focused prompt for alert investigation
+func GenerateAlertInvestigationPrompt(req AlertInvestigationRequest) string {
+ var prompt strings.Builder
+
+ prompt.WriteString(fmt.Sprintf("Investigate this %s alert:\n\n", strings.ToUpper(req.Level)))
+ prompt.WriteString(fmt.Sprintf("**Resource:** %s (%s)\n", req.ResourceName, req.ResourceType))
+ prompt.WriteString(fmt.Sprintf("**Alert Type:** %s\n", req.AlertType))
+ prompt.WriteString(fmt.Sprintf("**Current Value:** %.1f%%\n", req.Value))
+ prompt.WriteString(fmt.Sprintf("**Threshold:** %.1f%%\n", req.Threshold))
+ prompt.WriteString(fmt.Sprintf("**Duration:** %s\n", req.Duration))
+
+ if req.Node != "" {
+ prompt.WriteString(fmt.Sprintf("**Node:** %s\n", req.Node))
+ }
+
+ prompt.WriteString("\n**Action Required:**\n")
+ prompt.WriteString("1. Identify the root cause of this alert\n")
+ prompt.WriteString("2. Check related metrics and system state\n")
+ prompt.WriteString("3. Suggest specific remediation steps\n")
+ prompt.WriteString("4. If safe, execute diagnostic commands to gather more info\n")
+
+ return prompt.String()
+}
diff --git a/internal/ai/config.go b/internal/ai/config.go
index af3229003..f5d898ed0 100644
--- a/internal/ai/config.go
+++ b/internal/ai/config.go
@@ -11,6 +11,7 @@ const (
ProviderAnthropic = config.AIProviderAnthropic
ProviderOpenAI = config.AIProviderOpenAI
ProviderOllama = config.AIProviderOllama
+ ProviderDeepSeek = config.AIProviderDeepSeek
)
// NewDefaultConfig returns a new AI config with sensible defaults
diff --git a/internal/ai/knowledge/store.go b/internal/ai/knowledge/store.go
new file mode 100644
index 000000000..c72379b81
--- /dev/null
+++ b/internal/ai/knowledge/store.go
@@ -0,0 +1,421 @@
+// Package knowledge provides persistent storage for AI-learned information about guests
+package knowledge
+
+import (
+ "encoding/json"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/rcourtman/pulse-go-rewrite/internal/crypto"
+ "github.com/rs/zerolog/log"
+)
+
// Note represents a single piece of learned information about a guest.
// Notes are uniquely identified per guest by ID; SaveNote treats the
// (Category, Title) pair as the upsert key.
type Note struct {
	ID        string    `json:"id"`       // store-assigned, unique within one guest
	Category  string    `json:"category"` // "service", "path", "credential", "config", "learning", "history"
	Title     string    `json:"title"`
	Content   string    `json:"content"`
	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
+
// GuestKnowledge represents all knowledge about a specific guest.
// One instance maps to one file on disk (see Store.guestFilePath).
type GuestKnowledge struct {
	GuestID   string    `json:"guest_id"`
	GuestName string    `json:"guest_name"`
	GuestType string    `json:"guest_type"` // "vm", "container", "node", "host"
	Notes     []Note    `json:"notes"`
	UpdatedAt time.Time `json:"updated_at"` // last time any note changed
}
+
// Store manages persistent knowledge storage with optional encryption.
// All cache access is guarded by mu; crypto may be nil, in which case
// files are written as plain JSON (see NewStore).
type Store struct {
	dataDir string                     // <dataDir>/knowledge directory holding one file per guest
	mu      sync.RWMutex               // guards cache and file writes
	cache   map[string]*GuestKnowledge // in-memory copy, lazily loaded from disk
	crypto  *crypto.CryptoManager      // nil when encryption is unavailable
}
+
// NewStore creates a new knowledge store rooted at <dataDir>/knowledge.
// Encryption uses the same key material as other Pulse secrets; if the
// crypto manager cannot be initialised the store still works, but data is
// written unencrypted (a warning is logged).
func NewStore(dataDir string) (*Store, error) {
	knowledgeDir := filepath.Join(dataDir, "knowledge")
	// 0700: knowledge may contain credentials, keep it owner-only.
	if err := os.MkdirAll(knowledgeDir, 0700); err != nil {
		return nil, fmt.Errorf("failed to create knowledge directory: %w", err)
	}

	// Initialize crypto manager for encryption (uses same key as other Pulse secrets)
	cryptoMgr, err := crypto.NewCryptoManagerAt(dataDir)
	if err != nil {
		// Best-effort: continue unencrypted rather than failing startup.
		// NOTE(review): assumes NewCryptoManagerAt returns nil on error — confirm.
		log.Warn().Err(err).Msg("Failed to initialize crypto for knowledge store, data will be unencrypted")
	}

	return &Store{
		dataDir: knowledgeDir,
		cache:   make(map[string]*GuestKnowledge),
		crypto:  cryptoMgr,
	}, nil
}
+
+// guestFilePath returns the file path for a guest's knowledge
+func (s *Store) guestFilePath(guestID string) string {
+ // Sanitize guest ID for filesystem
+ safeID := filepath.Base(guestID) // Prevent path traversal
+ // Use .enc extension for encrypted files
+ if s.crypto != nil {
+ return filepath.Join(s.dataDir, safeID+".enc")
+ }
+ return filepath.Join(s.dataDir, safeID+".json")
+}
+
// GetKnowledge retrieves knowledge for a guest, loading and caching it from
// disk on first access. Missing files yield an empty (non-nil) record.
// Handles three on-disk layouts: encrypted ".enc", plain JSON written into
// an ".enc" file before encryption existed, and legacy ".json" files.
// NOTE(review): the returned pointer is the live cache entry; callers must
// treat it as read-only since SaveNote mutates it under the store lock.
func (s *Store) GetKnowledge(guestID string) (*GuestKnowledge, error) {
	// Fast path: already cached.
	s.mu.RLock()
	if cached, ok := s.cache[guestID]; ok {
		s.mu.RUnlock()
		return cached, nil
	}
	s.mu.RUnlock()

	// Load from disk
	s.mu.Lock()
	defer s.mu.Unlock()

	// Double-check after acquiring write lock
	if cached, ok := s.cache[guestID]; ok {
		return cached, nil
	}

	filePath := s.guestFilePath(guestID)
	data, err := os.ReadFile(filePath)
	if os.IsNotExist(err) {
		// Try legacy unencrypted file
		legacyPath := filepath.Join(s.dataDir, filepath.Base(guestID)+".json")
		data, err = os.ReadFile(legacyPath)
		if os.IsNotExist(err) {
			// No knowledge yet, return empty
			knowledge := &GuestKnowledge{
				GuestID: guestID,
				Notes:   []Note{},
			}
			s.cache[guestID] = knowledge
			return knowledge, nil
		}
		if err != nil {
			return nil, fmt.Errorf("failed to read knowledge file: %w", err)
		}
		// Legacy file found - will be encrypted on next save
		log.Info().Str("guest_id", guestID).Msg("Found unencrypted knowledge file, will encrypt on next save")
	} else if err != nil {
		return nil, fmt.Errorf("failed to read knowledge file: %w", err)
	}

	// Decrypt if crypto is available and file is encrypted
	if s.crypto != nil && filepath.Ext(filePath) == ".enc" {
		decrypted, err := s.crypto.Decrypt(data)
		if err != nil {
			// Try as plain JSON (migration case: an .enc file written while
			// encryption was unavailable).
			var knowledge GuestKnowledge
			if jsonErr := json.Unmarshal(data, &knowledge); jsonErr == nil {
				log.Info().Str("guest_id", guestID).Msg("Loaded unencrypted knowledge (will encrypt on next save)")
				s.cache[guestID] = &knowledge
				return &knowledge, nil
			}
			return nil, fmt.Errorf("failed to decrypt knowledge: %w", err)
		}
		data = decrypted
	}

	var knowledge GuestKnowledge
	if err := json.Unmarshal(data, &knowledge); err != nil {
		return nil, fmt.Errorf("failed to parse knowledge file: %w", err)
	}

	s.cache[guestID] = &knowledge
	return &knowledge, nil
}
+
+// SaveNote adds or updates a note for a guest
+func (s *Store) SaveNote(guestID, guestName, guestType, category, title, content string) error {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // Get or create knowledge
+ knowledge, ok := s.cache[guestID]
+ if !ok {
+ // Try to load from disk first
+ knowledge = &GuestKnowledge{
+ GuestID: guestID,
+ GuestName: guestName,
+ GuestType: guestType,
+ Notes: []Note{},
+ }
+
+ // Check for existing file
+ filePath := s.guestFilePath(guestID)
+ if data, err := os.ReadFile(filePath); err == nil {
+ // Decrypt if needed
+ if s.crypto != nil && filepath.Ext(filePath) == ".enc" {
+ if decrypted, err := s.crypto.Decrypt(data); err == nil {
+ data = decrypted
+ }
+ }
+ if err := json.Unmarshal(data, &knowledge); err != nil {
+ log.Warn().Err(err).Str("guest_id", guestID).Msg("Failed to parse existing knowledge, starting fresh")
+ }
+ }
+ s.cache[guestID] = knowledge
+ }
+
+ // Update guest info if provided
+ if guestName != "" {
+ knowledge.GuestName = guestName
+ }
+ if guestType != "" {
+ knowledge.GuestType = guestType
+ }
+
+ now := time.Now()
+
+ // Check if note with same title exists in category
+ found := false
+ for i, note := range knowledge.Notes {
+ if note.Category == category && note.Title == title {
+ // Update existing note
+ knowledge.Notes[i].Content = content
+ knowledge.Notes[i].UpdatedAt = now
+ found = true
+ break
+ }
+ }
+
+ if !found {
+ // Add new note
+ note := Note{
+ ID: fmt.Sprintf("%s-%d", category, len(knowledge.Notes)+1),
+ Category: category,
+ Title: title,
+ Content: content,
+ CreatedAt: now,
+ UpdatedAt: now,
+ }
+ knowledge.Notes = append(knowledge.Notes, note)
+ }
+
+ knowledge.UpdatedAt = now
+
+ // Save to disk (encrypted)
+ return s.saveToFile(guestID, knowledge)
+}
+
+// DeleteNote removes a note
+func (s *Store) DeleteNote(guestID, noteID string) error {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ knowledge, ok := s.cache[guestID]
+ if !ok {
+ return fmt.Errorf("guest not found: %s", guestID)
+ }
+
+ // Find and remove note
+ for i, note := range knowledge.Notes {
+ if note.ID == noteID {
+ knowledge.Notes = append(knowledge.Notes[:i], knowledge.Notes[i+1:]...)
+ knowledge.UpdatedAt = time.Now()
+ return s.saveToFile(guestID, knowledge)
+ }
+ }
+
+ return fmt.Errorf("note not found: %s", noteID)
+}
+
+// GetNotesByCategory returns notes filtered by category
+func (s *Store) GetNotesByCategory(guestID, category string) ([]Note, error) {
+ knowledge, err := s.GetKnowledge(guestID)
+ if err != nil {
+ return nil, err
+ }
+
+ var notes []Note
+ for _, note := range knowledge.Notes {
+ if category == "" || note.Category == category {
+ notes = append(notes, note)
+ }
+ }
+ return notes, nil
+}
+
+// FormatForContext formats knowledge for injection into AI context
+func (s *Store) FormatForContext(guestID string) string {
+ knowledge, err := s.GetKnowledge(guestID)
+ if err != nil {
+ log.Warn().Err(err).Str("guest_id", guestID).Msg("Failed to load guest knowledge")
+ return ""
+ }
+
+ if len(knowledge.Notes) == 0 {
+ return ""
+ }
+
+ // Group notes by category
+ byCategory := make(map[string][]Note)
+ for _, note := range knowledge.Notes {
+ byCategory[note.Category] = append(byCategory[note.Category], note)
+ }
+
+ // Build formatted output with guidance on using this knowledge
+ var result string
+ result = fmt.Sprintf("\n## Previously Learned Information about %s\n", knowledge.GuestName)
+ result += "**If relevant to the current task, use this saved information directly instead of rediscovering it.**\n"
+
+ categoryOrder := []string{"credential", "service", "path", "config", "learning", "history"}
+ categoryNames := map[string]string{
+ "credential": "Credentials",
+ "service": "Services",
+ "path": "Important Paths",
+ "config": "Configuration",
+ "learning": "Learnings",
+ "history": "Session History",
+ }
+
+ for _, cat := range categoryOrder {
+ notes, ok := byCategory[cat]
+ if !ok || len(notes) == 0 {
+ continue
+ }
+
+ result += fmt.Sprintf("\n### %s\n", categoryNames[cat])
+ for _, note := range notes {
+ result += fmt.Sprintf("- **%s**: %s\n", note.Title, note.Content)
+ }
+ }
+
+ return result
+}
+
+// saveToFile persists knowledge to disk with encryption
+func (s *Store) saveToFile(guestID string, knowledge *GuestKnowledge) error {
+ data, err := json.MarshalIndent(knowledge, "", " ")
+ if err != nil {
+ return fmt.Errorf("failed to marshal knowledge: %w", err)
+ }
+
+ // Encrypt if crypto manager is available
+ if s.crypto != nil {
+ encrypted, err := s.crypto.Encrypt(data)
+ if err != nil {
+ return fmt.Errorf("failed to encrypt knowledge: %w", err)
+ }
+ data = encrypted
+ }
+
+ filePath := s.guestFilePath(guestID)
+ if err := os.WriteFile(filePath, data, 0600); err != nil {
+ return fmt.Errorf("failed to write knowledge file: %w", err)
+ }
+
+ // Remove legacy unencrypted file if it exists
+ if s.crypto != nil {
+ legacyPath := filepath.Join(s.dataDir, filepath.Base(guestID)+".json")
+ if _, err := os.Stat(legacyPath); err == nil {
+ os.Remove(legacyPath)
+ log.Info().Str("guest_id", guestID).Msg("Removed legacy unencrypted knowledge file")
+ }
+ }
+
+ log.Debug().
+ Str("guest_id", guestID).
+ Int("notes", len(knowledge.Notes)).
+ Bool("encrypted", s.crypto != nil).
+ Msg("Saved guest knowledge")
+
+ return nil
+}
+
+// ListGuests returns all guests that have knowledge stored
+func (s *Store) ListGuests() ([]string, error) {
+ files, err := os.ReadDir(s.dataDir)
+ if err != nil {
+ return nil, fmt.Errorf("failed to read knowledge directory: %w", err)
+ }
+
+ var guests []string
+ for _, file := range files {
+ ext := filepath.Ext(file.Name())
+ if ext == ".json" || ext == ".enc" {
+ guestID := file.Name()[:len(file.Name())-len(ext)]
+ guests = append(guests, guestID)
+ }
+ }
+ return guests, nil
+}
+
+// FormatAllForContext returns a summary of all saved knowledge across all guests
+// This is used when no specific target is selected to give the AI full context
+func (s *Store) FormatAllForContext() string {
+ guests, err := s.ListGuests()
+ if err != nil || len(guests) == 0 {
+ return ""
+ }
+
+ var sections []string
+ totalNotes := 0
+
+ for _, guestID := range guests {
+ knowledge, err := s.GetKnowledge(guestID)
+ if err != nil || len(knowledge.Notes) == 0 {
+ continue
+ }
+
+ totalNotes += len(knowledge.Notes)
+
+ // Build a summary for this guest
+ guestName := knowledge.GuestName
+ if guestName == "" {
+ guestName = guestID
+ }
+
+ // Group notes by category
+ byCategory := make(map[string][]Note)
+ for _, note := range knowledge.Notes {
+ byCategory[note.Category] = append(byCategory[note.Category], note)
+ }
+
+ var guestSection string
+ guestSection = fmt.Sprintf("\n### %s (%s)", guestName, knowledge.GuestType)
+
+ categoryOrder := []string{"credential", "service", "path", "config", "learning"}
+ for _, cat := range categoryOrder {
+ notes, ok := byCategory[cat]
+ if !ok || len(notes) == 0 {
+ continue
+ }
+ for _, note := range notes {
+ // Mask credentials in the summary
+ content := note.Content
+ if cat == "credential" && len(content) > 6 {
+ content = content[:2] + "****" + content[len(content)-2:]
+ }
+ guestSection += fmt.Sprintf("\n- **%s**: %s", note.Title, content)
+ }
+ }
+
+ sections = append(sections, guestSection)
+ }
+
+ if len(sections) == 0 {
+ return ""
+ }
+
+ result := fmt.Sprintf("\n\n## Saved Knowledge (%d notes across %d guests)\n", totalNotes, len(sections))
+ result += "This is information learned from previous sessions. Use it to avoid rediscovery.\n"
+ result += strings.Join(sections, "\n")
+
+ return result
+}
+
diff --git a/internal/ai/providers/anthropic.go b/internal/ai/providers/anthropic.go
index f3b8c744b..710b15893 100644
--- a/internal/ai/providers/anthropic.go
+++ b/internal/ai/providers/anthropic.go
@@ -32,7 +32,8 @@ func NewAnthropicClient(apiKey, model string) *AnthropicClient {
apiKey: apiKey,
model: model,
client: &http.Client{
- Timeout: 120 * time.Second, // LLM responses can take a while
+ // 5 minutes - Opus and other large models can take a very long time
+ Timeout: 300 * time.Second,
},
}
}
diff --git a/internal/ai/providers/factory.go b/internal/ai/providers/factory.go
index a789d24de..183a8aaed 100644
--- a/internal/ai/providers/factory.go
+++ b/internal/ai/providers/factory.go
@@ -32,6 +32,13 @@ func NewFromConfig(cfg *config.AIConfig) (Provider, error) {
case config.AIProviderOllama:
return NewOllamaClient(cfg.GetModel(), cfg.GetBaseURL()), nil
+ case config.AIProviderDeepSeek:
+ if cfg.APIKey == "" {
+ return nil, fmt.Errorf("DeepSeek API key is required")
+ }
+ // DeepSeek uses OpenAI-compatible API
+ return NewOpenAIClient(cfg.APIKey, cfg.GetModel(), cfg.GetBaseURL()), nil
+
default:
return nil, fmt.Errorf("unknown provider: %s", cfg.Provider)
}
diff --git a/internal/ai/providers/openai.go b/internal/ai/providers/openai.go
index 514f06ccc..ab4a537a7 100644
--- a/internal/ai/providers/openai.go
+++ b/internal/ai/providers/openai.go
@@ -7,14 +7,20 @@ import (
"fmt"
"io"
"net/http"
+ "strings"
"time"
+
+ "github.com/rs/zerolog/log"
)
const (
- openaiAPIURL = "https://api.openai.com/v1/chat/completions"
+ openaiAPIURL = "https://api.openai.com/v1/chat/completions"
+ openaiMaxRetries = 3
+ openaiInitialBackoff = 2 * time.Second
)
// OpenAIClient implements the Provider interface for OpenAI's API
+// Also works with OpenAI-compatible APIs like DeepSeek
type OpenAIClient struct {
apiKey string
model string
@@ -32,7 +38,8 @@ func NewOpenAIClient(apiKey, model, baseURL string) *OpenAIClient {
model: model,
baseURL: baseURL,
client: &http.Client{
- Timeout: 120 * time.Second,
+ // 5 minutes timeout - DeepSeek reasoning models can take a long time
+ Timeout: 300 * time.Second,
},
}
}
@@ -44,15 +51,52 @@ func (c *OpenAIClient) Name() string {
// openaiRequest is the request body for the OpenAI API
type openaiRequest struct {
- Model string `json:"model"`
- Messages []openaiMessage `json:"messages"`
- MaxTokens int `json:"max_tokens,omitempty"`
- Temperature float64 `json:"temperature,omitempty"`
+ Model string `json:"model"`
+ Messages []openaiMessage `json:"messages"`
+ MaxTokens int `json:"max_tokens,omitempty"`
+ Temperature float64 `json:"temperature,omitempty"`
+ Tools []openaiTool `json:"tools,omitempty"`
+ ToolChoice interface{} `json:"tool_choice,omitempty"` // "auto", "none", or specific tool
+}
+
+// deepseekRequest extends openaiRequest with DeepSeek-specific fields
+type deepseekRequest struct {
+ Model string `json:"model"`
+ Messages []openaiMessage `json:"messages"`
+ MaxTokens int `json:"max_tokens,omitempty"`
+ Tools []openaiTool `json:"tools,omitempty"`
+ ToolChoice interface{} `json:"tool_choice,omitempty"`
+}
+
+// openaiTool represents a function tool in OpenAI format
+type openaiTool struct {
+ Type string `json:"type"` // always "function"
+ Function openaiFunction `json:"function"`
+}
+
+type openaiFunction struct {
+ Name string `json:"name"`
+ Description string `json:"description,omitempty"`
+ Parameters map[string]interface{} `json:"parameters,omitempty"`
}
type openaiMessage struct {
- Role string `json:"role"`
- Content string `json:"content"`
+ Role string `json:"role"`
+ Content interface{} `json:"content,omitempty"` // string or null for tool calls
+ ReasoningContent string `json:"reasoning_content,omitempty"` // DeepSeek thinking mode
+ ToolCalls []openaiToolCall `json:"tool_calls,omitempty"` // For assistant messages with tool calls
+ ToolCallID string `json:"tool_call_id,omitempty"` // For tool response messages
+}
+
+type openaiToolCall struct {
+ ID string `json:"id"`
+ Type string `json:"type"` // always "function"
+ Function openaiToolFunction `json:"function"`
+}
+
+type openaiToolFunction struct {
+ Name string `json:"name"`
+ Arguments string `json:"arguments"` // JSON string of arguments
}
// openaiResponse is the response from the OpenAI API
@@ -67,8 +111,15 @@ type openaiResponse struct {
type openaiChoice struct {
Index int `json:"index"`
- Message openaiMessage `json:"message"`
- FinishReason string `json:"finish_reason"`
+ Message openaiRespMsg `json:"message"`
+ FinishReason string `json:"finish_reason"` // "stop", "tool_calls", etc.
+}
+
+type openaiRespMsg struct {
+ Role string `json:"role"`
+ Content string `json:"content,omitempty"`
+ ReasoningContent string `json:"reasoning_content,omitempty"` // DeepSeek thinking mode
+ ToolCalls []openaiToolCall `json:"tool_calls,omitempty"`
}
type openaiUsage struct {
@@ -87,6 +138,16 @@ type openaiErrorDetail struct {
Code string `json:"code"`
}
// isDeepSeek returns true if this client is configured for DeepSeek
// (detected from the configured base URL).
// NOTE(review): plain substring match also fires for any host containing
// "deepseek.com" — acceptable if base URLs are trusted config; confirm.
func (c *OpenAIClient) isDeepSeek() bool {
	return strings.Contains(c.baseURL, "deepseek.com")
}
+
// isDeepSeekReasoner returns true if using DeepSeek's reasoning model
// (model name contains "reasoner"), which has different request semantics
// (no temperature, emits reasoning_content).
func (c *OpenAIClient) isDeepSeekReasoner() bool {
	return c.isDeepSeek() && strings.Contains(c.model, "reasoner")
}
+
// Chat sends a chat request to the OpenAI API
func (c *OpenAIClient) Chat(ctx context.Context, req ChatRequest) (*ChatResponse, error) {
// Convert messages to OpenAI format
@@ -101,10 +162,45 @@ func (c *OpenAIClient) Chat(ctx context.Context, req ChatRequest) (*ChatResponse
}
for _, m := range req.Messages {
- messages = append(messages, openaiMessage{
- Role: m.Role,
- Content: m.Content,
- })
+ msg := openaiMessage{
+ Role: m.Role,
+ }
+
+ // Handle tool calls in assistant messages
+ if len(m.ToolCalls) > 0 {
+ msg.Content = nil // Content is null when there are tool calls
+ if m.Content != "" {
+ msg.Content = m.Content
+ }
+ // For DeepSeek reasoner, include reasoning_content if present
+ if c.isDeepSeekReasoner() && m.ReasoningContent != "" {
+ msg.ReasoningContent = m.ReasoningContent
+ }
+ for _, tc := range m.ToolCalls {
+ argsJSON, _ := json.Marshal(tc.Input)
+ msg.ToolCalls = append(msg.ToolCalls, openaiToolCall{
+ ID: tc.ID,
+ Type: "function",
+ Function: openaiToolFunction{
+ Name: tc.Name,
+ Arguments: string(argsJSON),
+ },
+ })
+ }
+ } else if m.ToolResult != nil {
+ // This is a tool result message
+ msg.Role = "tool"
+ msg.Content = m.ToolResult.Content
+ msg.ToolCallID = m.ToolResult.ToolUseID
+ } else {
+ msg.Content = m.Content
+ // For assistant messages with reasoning content (DeepSeek)
+ if c.isDeepSeekReasoner() && m.ReasoningContent != "" {
+ msg.ReasoningContent = m.ReasoningContent
+ }
+ }
+
+ messages = append(messages, msg)
}
// Use provided model or fall back to client default
@@ -113,6 +209,7 @@ func (c *OpenAIClient) Chat(ctx context.Context, req ChatRequest) (*ChatResponse
model = c.model
}
+ // Build request
openaiReq := openaiRequest{
Model: model,
Messages: messages,
@@ -122,40 +219,114 @@ func (c *OpenAIClient) Chat(ctx context.Context, req ChatRequest) (*ChatResponse
openaiReq.MaxTokens = req.MaxTokens
}
- if req.Temperature > 0 {
+ // DeepSeek reasoner doesn't support temperature
+ if req.Temperature > 0 && !c.isDeepSeekReasoner() {
openaiReq.Temperature = req.Temperature
}
+ // Convert tools to OpenAI format
+ if len(req.Tools) > 0 {
+ for _, t := range req.Tools {
+ // Skip non-function tools (like web_search)
+ if t.Type != "" && t.Type != "function" {
+ continue
+ }
+ openaiReq.Tools = append(openaiReq.Tools, openaiTool{
+ Type: "function",
+ Function: openaiFunction{
+ Name: t.Name,
+ Description: t.Description,
+ Parameters: t.InputSchema,
+ },
+ })
+ }
+ if len(openaiReq.Tools) > 0 {
+ openaiReq.ToolChoice = "auto"
+ }
+ }
+
body, err := json.Marshal(openaiReq)
if err != nil {
return nil, fmt.Errorf("failed to marshal request: %w", err)
}
- httpReq, err := http.NewRequestWithContext(ctx, "POST", c.baseURL, bytes.NewReader(body))
- if err != nil {
- return nil, fmt.Errorf("failed to create request: %w", err)
- }
+ // Retry loop for transient errors (connection resets, 429, 5xx)
+ var respBody []byte
+ var lastErr error
- httpReq.Header.Set("Content-Type", "application/json")
- httpReq.Header.Set("Authorization", "Bearer "+c.apiKey)
+ for attempt := 0; attempt <= openaiMaxRetries; attempt++ {
+ if attempt > 0 {
+ // Exponential backoff: 2s, 4s, 8s
+ backoff := openaiInitialBackoff * time.Duration(1<<(attempt-1))
+ log.Warn().
+ Int("attempt", attempt).
+ Dur("backoff", backoff).
+ Str("last_error", lastErr.Error()).
+ Msg("Retrying OpenAI/DeepSeek API request after transient error")
- resp, err := c.client.Do(httpReq)
- if err != nil {
- return nil, fmt.Errorf("request failed: %w", err)
- }
- defer resp.Body.Close()
-
- respBody, err := io.ReadAll(resp.Body)
- if err != nil {
- return nil, fmt.Errorf("failed to read response: %w", err)
- }
-
- if resp.StatusCode != http.StatusOK {
- var errResp openaiError
- if err := json.Unmarshal(respBody, &errResp); err == nil && errResp.Error.Message != "" {
- return nil, fmt.Errorf("API error (%d): %s", resp.StatusCode, errResp.Error.Message)
+ select {
+ case <-ctx.Done():
+ return nil, ctx.Err()
+ case <-time.After(backoff):
+ }
}
- return nil, fmt.Errorf("API error (%d): %s", resp.StatusCode, string(respBody))
+
+ httpReq, err := http.NewRequestWithContext(ctx, "POST", c.baseURL, bytes.NewReader(body))
+ if err != nil {
+ return nil, fmt.Errorf("failed to create request: %w", err)
+ }
+
+ httpReq.Header.Set("Content-Type", "application/json")
+ httpReq.Header.Set("Authorization", "Bearer "+c.apiKey)
+
+ resp, err := c.client.Do(httpReq)
+ if err != nil {
+ // Check if this is a retryable connection error
+ errStr := err.Error()
+ if strings.Contains(errStr, "connection reset") ||
+ strings.Contains(errStr, "connection refused") ||
+ strings.Contains(errStr, "EOF") ||
+ strings.Contains(errStr, "timeout") {
+ lastErr = fmt.Errorf("connection error: %w", err)
+ continue
+ }
+ return nil, fmt.Errorf("request failed: %w", err)
+ }
+
+ respBody, err = io.ReadAll(resp.Body)
+ resp.Body.Close()
+ if err != nil {
+ lastErr = fmt.Errorf("failed to read response: %w", err)
+ continue
+ }
+
+ // Check for retryable HTTP errors
+ if resp.StatusCode == 429 || resp.StatusCode == 502 || resp.StatusCode == 503 || resp.StatusCode == 504 {
+ var errResp openaiError
+ errMsg := string(respBody)
+ if err := json.Unmarshal(respBody, &errResp); err == nil && errResp.Error.Message != "" {
+ errMsg = errResp.Error.Message
+ }
+ lastErr = fmt.Errorf("API error (%d): %s", resp.StatusCode, errMsg)
+ continue
+ }
+
+ // Non-retryable error
+ if resp.StatusCode != http.StatusOK {
+ var errResp openaiError
+ if err := json.Unmarshal(respBody, &errResp); err == nil && errResp.Error.Message != "" {
+ return nil, fmt.Errorf("API error (%d): %s", resp.StatusCode, errResp.Error.Message)
+ }
+ return nil, fmt.Errorf("API error (%d): %s", resp.StatusCode, string(respBody))
+ }
+
+ // Success - break out of retry loop
+ lastErr = nil
+ break
+ }
+
+ if lastErr != nil {
+ return nil, fmt.Errorf("request failed after %d retries: %w", openaiMaxRetries, lastErr)
}
var openaiResp openaiResponse
@@ -167,13 +338,33 @@ func (c *OpenAIClient) Chat(ctx context.Context, req ChatRequest) (*ChatResponse
return nil, fmt.Errorf("no response choices returned")
}
- return &ChatResponse{
- Content: openaiResp.Choices[0].Message.Content,
- Model: openaiResp.Model,
- StopReason: openaiResp.Choices[0].FinishReason,
- InputTokens: openaiResp.Usage.PromptTokens,
- OutputTokens: openaiResp.Usage.CompletionTokens,
- }, nil
+ choice := openaiResp.Choices[0]
+ result := &ChatResponse{
+ Content: choice.Message.Content,
+ ReasoningContent: choice.Message.ReasoningContent, // DeepSeek thinking mode
+ Model: openaiResp.Model,
+ StopReason: choice.FinishReason,
+ InputTokens: openaiResp.Usage.PromptTokens,
+ OutputTokens: openaiResp.Usage.CompletionTokens,
+ }
+
+ // Convert tool calls from OpenAI format to our format
+ if len(choice.Message.ToolCalls) > 0 {
+ result.StopReason = "tool_use" // Normalize to match Anthropic's format
+ for _, tc := range choice.Message.ToolCalls {
+ var input map[string]interface{}
+ if err := json.Unmarshal([]byte(tc.Function.Arguments), &input); err != nil {
+ input = map[string]interface{}{"raw": tc.Function.Arguments}
+ }
+ result.ToolCalls = append(result.ToolCalls, ToolCall{
+ ID: tc.ID,
+ Name: tc.Function.Name,
+ Input: input,
+ })
+ }
+ }
+
+ return result, nil
}
// TestConnection validates the API key by making a minimal request
diff --git a/internal/ai/providers/provider.go b/internal/ai/providers/provider.go
index c858bf14c..b61e9ec80 100644
--- a/internal/ai/providers/provider.go
+++ b/internal/ai/providers/provider.go
@@ -7,10 +7,11 @@ import (
// Message represents a chat message
type Message struct {
- Role string `json:"role"` // "user", "assistant", "system"
- Content string `json:"content"` // Text content (simple case)
- ToolCalls []ToolCall `json:"tool_calls,omitempty"` // For assistant messages with tool calls
- ToolResult *ToolResult `json:"tool_result,omitempty"` // For user messages with tool results
+ Role string `json:"role"` // "user", "assistant", "system"
+ Content string `json:"content"` // Text content (simple case)
+ ReasoningContent string `json:"reasoning_content,omitempty"` // DeepSeek thinking mode
+ ToolCalls []ToolCall `json:"tool_calls,omitempty"` // For assistant messages with tool calls
+ ToolResult *ToolResult `json:"tool_result,omitempty"` // For user messages with tool results
}
// ToolCall represents a tool invocation from the AI
@@ -48,12 +49,13 @@ type ChatRequest struct {
// ChatResponse represents a response from the AI provider
type ChatResponse struct {
- Content string `json:"content"`
- Model string `json:"model"`
- StopReason string `json:"stop_reason,omitempty"` // "end_turn", "tool_use"
- ToolCalls []ToolCall `json:"tool_calls,omitempty"` // Tool invocations
- InputTokens int `json:"input_tokens,omitempty"`
- OutputTokens int `json:"output_tokens,omitempty"`
+ Content string `json:"content"`
+ ReasoningContent string `json:"reasoning_content,omitempty"` // DeepSeek thinking mode
+ Model string `json:"model"`
+ StopReason string `json:"stop_reason,omitempty"` // "end_turn", "tool_use"
+ ToolCalls []ToolCall `json:"tool_calls,omitempty"` // Tool invocations
+ InputTokens int `json:"input_tokens,omitempty"`
+ OutputTokens int `json:"output_tokens,omitempty"`
}
// Provider defines the interface for AI providers
diff --git a/internal/ai/routing.go b/internal/ai/routing.go
new file mode 100644
index 000000000..840927679
--- /dev/null
+++ b/internal/ai/routing.go
@@ -0,0 +1,354 @@
+// Package ai provides AI-powered diagnostic and command execution capabilities.
+// This file contains the robust agent routing logic for executing commands on the correct host.
+package ai
+
+import (
+ "fmt"
+ "strconv"
+ "strings"
+
+ "github.com/rcourtman/pulse-go-rewrite/internal/agentexec"
+ "github.com/rcourtman/pulse-go-rewrite/internal/config"
+ "github.com/rs/zerolog/log"
+)
+
+// RoutingResult contains the result of agent routing
+type RoutingResult struct {
+ AgentID string // ID of the selected agent
+ AgentHostname string // Hostname of the selected agent
+ TargetNode string // The node we're trying to reach
+ TargetVMID string // The VMID (for container/VM targets)
+ RoutingMethod string // How we determined the route (for debugging)
+ ClusterPeer bool // True if routing via a cluster peer
+ Warnings []string // Any warnings encountered during routing
+}
+
+// RoutingError represents a routing failure with actionable information
+type RoutingError struct {
+ TargetNode string
+ TargetVMID int
+ AvailableAgents []string
+ Reason string
+ Suggestion string
+}
+
+func (e *RoutingError) Error() string {
+ if e.Suggestion != "" {
+ return fmt.Sprintf("%s. %s", e.Reason, e.Suggestion)
+ }
+ return e.Reason
+}
+
+// routeToAgent determines which agent should execute a command.
+// This is the authoritative routing function that should be used for all command execution.
+//
+// Routing priority:
+// 1. VMID lookup from state (most reliable for pct/qm commands)
+// 2. Explicit "node" field in context
+// 3. Explicit "guest_node" field in context
+// 4. "hostname" field for host targets
+// 5. VMID extracted from target ID (last resort)
+//
+// Agent matching is EXACT only - no substring matching to prevent false positives.
+// If no direct match, cluster peer routing is attempted.
+// If all else fails, returns an explicit error rather than silently using wrong agent.
+func (s *Service) routeToAgent(req ExecuteRequest, command string, agents []agentexec.ConnectedAgent) (*RoutingResult, error) {
+ result := &RoutingResult{}
+
+ if len(agents) == 0 {
+ return nil, &RoutingError{
+ Reason: "No agents are connected to Pulse",
+ Suggestion: "Install pulse-agent on at least one host",
+ }
+ }
+
+ // Build a map of available agents for quick lookup and error messages
+ agentMap := make(map[string]agentexec.ConnectedAgent) // lowercase hostname -> agent
+ var agentHostnames []string
+ for _, agent := range agents {
+ hostname := strings.TrimSpace(strings.ToLower(agent.Hostname))
+ agentMap[hostname] = agent
+ agentHostnames = append(agentHostnames, agent.Hostname)
+ }
+
+ // Step 1: Try VMID-based routing (most authoritative for pct/qm commands)
+ if vmid, requiresOwnerNode, found := extractVMIDFromCommand(command); found && requiresOwnerNode {
+ targetInstance := ""
+ if inst, ok := req.Context["instance"].(string); ok {
+ targetInstance = inst
+ }
+
+ guests := s.lookupGuestsByVMID(vmid, targetInstance)
+
+ if len(guests) == 0 {
+ result.Warnings = append(result.Warnings,
+ fmt.Sprintf("VMID %d not found in Pulse state - routing based on context", vmid))
+ } else if len(guests) == 1 {
+ result.TargetNode = strings.ToLower(guests[0].Node)
+ result.RoutingMethod = "vmid_lookup"
+ log.Info().
+ Int("vmid", vmid).
+ Str("node", guests[0].Node).
+ Str("guest_name", guests[0].Name).
+ Msg("Routed command via VMID state lookup")
+ } else {
+ // Multiple matches - try to disambiguate
+ if targetInstance != "" {
+ for _, g := range guests {
+ if strings.EqualFold(g.Instance, targetInstance) {
+ result.TargetNode = strings.ToLower(g.Node)
+ result.RoutingMethod = "vmid_lookup_with_instance"
+ log.Info().
+ Int("vmid", vmid).
+ Str("node", g.Node).
+ Str("instance", g.Instance).
+ Msg("Resolved VMID collision using instance")
+ break
+ }
+ }
+ }
+ if result.TargetNode == "" {
+ // Return explicit error for VMID collision
+ var locations []string
+ for _, g := range guests {
+ locations = append(locations, fmt.Sprintf("%s on %s (%s)", g.Name, g.Node, g.Instance))
+ }
+ return nil, &RoutingError{
+ TargetVMID: vmid,
+ AvailableAgents: agentHostnames,
+ Reason: fmt.Sprintf("VMID %d exists on multiple nodes: %s",
+ vmid, strings.Join(locations, ", ")),
+ Suggestion: "Specify the instance/cluster in your query to disambiguate",
+ }
+ }
+ }
+ }
+
+ // Step 2: Try context-based routing (explicit node information)
+ if result.TargetNode == "" {
+ if node, ok := req.Context["node"].(string); ok && node != "" {
+ result.TargetNode = strings.ToLower(node)
+ result.RoutingMethod = "context_node"
+ log.Debug().
+ Str("node", node).
+ Str("command", command).
+ Msg("Routing via explicit 'node' in context")
+ } else if node, ok := req.Context["guest_node"].(string); ok && node != "" {
+ result.TargetNode = strings.ToLower(node)
+ result.RoutingMethod = "context_guest_node"
+ log.Debug().
+ Str("guest_node", node).
+ Str("command", command).
+ Msg("Routing via 'guest_node' in context")
+ } else if req.TargetType == "host" {
+ if hostname, ok := req.Context["hostname"].(string); ok && hostname != "" {
+ result.TargetNode = strings.ToLower(hostname)
+ result.RoutingMethod = "context_hostname"
+ log.Debug().
+ Str("hostname", hostname).
+ Str("command", command).
+ Msg("Routing via 'hostname' in context")
+ } else {
+ // For host target type with no node info, log a warning
+ // This is a common source of routing issues
+ log.Warn().
+ Str("target_type", req.TargetType).
+ Str("target_id", req.TargetID).
+ Str("command", command).
+ Msg("Host command with no node/hostname in context - may route to wrong agent")
+ result.Warnings = append(result.Warnings,
+ "No target host specified in context. Use target_host parameter for reliable routing.")
+ }
+ }
+ }
+
+
+ // Step 3: Extract VMID from target ID and look up in state
+ if result.TargetNode == "" && req.TargetID != "" {
+ if vmid := extractVMIDFromTargetID(req.TargetID); vmid > 0 {
+ result.TargetVMID = strconv.Itoa(vmid)
+
+ // Try instance from context
+ targetInstance := ""
+ if inst, ok := req.Context["instance"].(string); ok {
+ targetInstance = inst
+ }
+
+ guests := s.lookupGuestsByVMID(vmid, targetInstance)
+ if len(guests) == 1 {
+ result.TargetNode = strings.ToLower(guests[0].Node)
+ result.RoutingMethod = "target_id_vmid_lookup"
+ log.Debug().
+ Int("vmid", vmid).
+ Str("node", guests[0].Node).
+ Str("target_id", req.TargetID).
+ Msg("Resolved node from target ID VMID lookup")
+ }
+ }
+ }
+
+ // Step 4: Try to find exact matching agent
+ if result.TargetNode != "" {
+ targetNodeClean := strings.TrimSpace(strings.ToLower(result.TargetNode))
+
+ // EXACT match only - no substring matching
+ if agent, exists := agentMap[targetNodeClean]; exists {
+ result.AgentID = agent.AgentID
+ result.AgentHostname = agent.Hostname
+ log.Debug().
+ Str("target_node", result.TargetNode).
+ Str("agent", agent.Hostname).
+ Str("method", result.RoutingMethod).
+ Msg("Exact agent match found")
+ return result, nil
+ }
+
+ // Try cluster peer routing
+ if peerAgentID := s.findClusterPeerAgent(targetNodeClean, agents); peerAgentID != "" {
+ for _, agent := range agents {
+ if agent.AgentID == peerAgentID {
+ result.AgentID = peerAgentID
+ result.AgentHostname = agent.Hostname
+ result.ClusterPeer = true
+ log.Info().
+ Str("target_node", result.TargetNode).
+ Str("peer_agent", agent.Hostname).
+ Msg("Routing via cluster peer agent")
+ return result, nil
+ }
+ }
+ }
+
+ // No agent available for this node
+ return nil, &RoutingError{
+ TargetNode: result.TargetNode,
+ AvailableAgents: agentHostnames,
+ Reason: fmt.Sprintf("No agent connected to node %q", result.TargetNode),
+ Suggestion: fmt.Sprintf("Install pulse-agent on %q, or ensure it's in a cluster with %s",
+ result.TargetNode, strings.Join(agentHostnames, ", ")),
+ }
+ }
+
+ // Step 5: No target node determined - for host commands with no context, use first agent
+ if req.TargetType == "host" && len(agents) == 1 {
+ result.AgentID = agents[0].AgentID
+ result.AgentHostname = agents[0].Hostname
+ result.RoutingMethod = "single_agent_fallback"
+ result.Warnings = append(result.Warnings,
+ fmt.Sprintf("No target node specified, using the only connected agent (%s). For multi-agent setups, specify target_host.", agents[0].Hostname))
+ log.Info().
+ Str("agent", agents[0].Hostname).
+ Str("target_type", req.TargetType).
+ Msg("Routing via single-agent fallback")
+ return result, nil
+ }
+
+ // Cannot determine where to route
+ // Provide actionable error with available agents listed
+ log.Error().
+ Str("target_type", req.TargetType).
+ Str("target_id", req.TargetID).
+ Strs("available_agents", agentHostnames).
+ Msg("Routing failed - cannot determine target agent")
+
+ return nil, &RoutingError{
+ AvailableAgents: agentHostnames,
+ Reason: "Cannot determine which agent should execute this command",
+ Suggestion: fmt.Sprintf("Use target_host parameter with one of: %s. Or specify VMID in the command for pct/qm commands.",
+ strings.Join(agentHostnames, ", ")),
+ }
+
+}
+
+// extractVMIDFromTargetID extracts a numeric VMID from various target ID formats.
+// Handles formats like:
+// - "delly-minipc-106" -> 106
+// - "minipc-106" -> 106
+// - "106" -> 106
+// - "lxc-106" -> 106
+// - "vm-106" -> 106
+func extractVMIDFromTargetID(targetID string) int {
+ if targetID == "" {
+ return 0
+ }
+
+ // Try parsing the whole thing as a number first
+ if vmid, err := strconv.Atoi(targetID); err == nil && vmid > 0 {
+ return vmid
+ }
+
+ // Split by hyphen and take the last numeric part
+ parts := strings.Split(targetID, "-")
+ for i := len(parts) - 1; i >= 0; i-- {
+ if vmid, err := strconv.Atoi(parts[i]); err == nil && vmid > 0 {
+ return vmid
+ }
+ }
+
+ return 0
+}
+
+// findClusterPeerAgent finds an agent that can execute commands for a node in the same cluster.
+// For PVE clusters, any node can execute pvesh/vzdump commands, but pct exec/qm guest exec
+// require the agent to be on the specific node.
+func (s *Service) findClusterPeerAgent(targetNode string, agents []agentexec.ConnectedAgent) string {
+ // Check for nil persistence
+ if s.persistence == nil {
+ return ""
+ }
+
+ // Load nodes config to check cluster membership
+ nodesConfig, err := s.persistence.LoadNodesConfig()
+ if err != nil || nodesConfig == nil {
+ return ""
+ }
+
+ // Find which cluster the target node belongs to
+ var targetCluster string
+ var clusterEndpoints []config.ClusterEndpoint
+
+ for _, pve := range nodesConfig.PVEInstances {
+ if strings.EqualFold(pve.Name, targetNode) {
+ if pve.IsCluster && pve.ClusterName != "" {
+ targetCluster = pve.ClusterName
+ clusterEndpoints = pve.ClusterEndpoints
+ }
+ break
+ }
+ // Also check cluster endpoints
+ for _, ep := range pve.ClusterEndpoints {
+ if strings.EqualFold(ep.NodeName, targetNode) {
+ if pve.IsCluster && pve.ClusterName != "" {
+ targetCluster = pve.ClusterName
+ clusterEndpoints = pve.ClusterEndpoints
+ }
+ break
+ }
+ }
+ }
+
+ if targetCluster == "" {
+ return ""
+ }
+
+ // Build list of cluster member nodes
+ clusterNodes := make(map[string]bool)
+ for _, ep := range clusterEndpoints {
+ clusterNodes[strings.ToLower(ep.NodeName)] = true
+ }
+
+ // Find an agent on any cluster member
+ for _, agent := range agents {
+ agentHostname := strings.ToLower(agent.Hostname)
+ if clusterNodes[agentHostname] {
+ log.Debug().
+ Str("target_node", targetNode).
+ Str("cluster", targetCluster).
+ Str("peer_agent", agent.Hostname).
+ Msg("Found cluster peer agent")
+ return agent.AgentID
+ }
+ }
+
+ return ""
+}
diff --git a/internal/ai/routing_test.go b/internal/ai/routing_test.go
new file mode 100644
index 000000000..188238723
--- /dev/null
+++ b/internal/ai/routing_test.go
@@ -0,0 +1,276 @@
+package ai
+
+import (
+ "testing"
+
+ "github.com/rcourtman/pulse-go-rewrite/internal/agentexec"
+)
+
+func TestExtractVMIDFromTargetID(t *testing.T) {
+ tests := []struct {
+ name string
+ targetID string
+ want int
+ }{
+ // Standard formats
+ {"plain vmid", "106", 106},
+ {"node-vmid", "minipc-106", 106},
+ {"instance-node-vmid", "delly-minipc-106", 106},
+
+ // Edge cases with hyphenated names
+ {"hyphenated-node-vmid", "pve-node-01-106", 106},
+ {"hyphenated-instance-node-vmid", "my-cluster-pve-node-01-106", 106},
+
+ // Type prefixes
+ {"lxc prefix", "lxc-106", 106},
+ {"vm prefix", "vm-106", 106},
+ {"ct prefix", "ct-106", 106},
+
+ // Non-numeric - should return 0
+ {"non-numeric", "mycontainer", 0},
+ {"no-vmid", "node-name", 0},
+ {"empty", "", 0},
+
+ // Large VMIDs (Proxmox uses up to 999999999)
+ {"large vmid", "node-999999", 999999},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := extractVMIDFromTargetID(tt.targetID)
+ if got != tt.want {
+ t.Errorf("extractVMIDFromTargetID(%q) = %d, want %d", tt.targetID, got, tt.want)
+ }
+ })
+ }
+}
+
+func TestRoutingError(t *testing.T) {
+ t.Run("with suggestion", func(t *testing.T) {
+ err := &RoutingError{
+ TargetNode: "minipc",
+ AvailableAgents: []string{"delly", "pimox"},
+ Reason: "No agent connected to node \"minipc\"",
+ Suggestion: "Install pulse-agent on minipc",
+ }
+
+ want := "No agent connected to node \"minipc\". Install pulse-agent on minipc"
+ if err.Error() != want {
+ t.Errorf("Error() = %q, want %q", err.Error(), want)
+ }
+ })
+
+ t.Run("without suggestion", func(t *testing.T) {
+ err := &RoutingError{
+ Reason: "No agents connected",
+ }
+
+ want := "No agents connected"
+ if err.Error() != want {
+ t.Errorf("Error() = %q, want %q", err.Error(), want)
+ }
+ })
+}
+
+func TestRouteToAgent_NoAgents(t *testing.T) {
+ s := &Service{}
+
+ req := ExecuteRequest{
+ TargetType: "container",
+ TargetID: "minipc-106",
+ }
+
+ _, err := s.routeToAgent(req, "pct exec 106 -- hostname", nil)
+ if err == nil {
+ t.Error("expected error for no agents, got nil")
+ }
+
+ routingErr, ok := err.(*RoutingError)
+ if !ok {
+ t.Fatalf("expected RoutingError, got %T", err)
+ }
+
+ if routingErr.Suggestion == "" {
+ t.Error("expected suggestion in error")
+ }
+}
+
+func TestRouteToAgent_ExactMatch(t *testing.T) {
+ s := &Service{}
+
+ agents := []agentexec.ConnectedAgent{
+ {AgentID: "agent-1", Hostname: "delly"},
+ {AgentID: "agent-2", Hostname: "minipc"},
+ {AgentID: "agent-3", Hostname: "pimox"},
+ }
+
+ tests := []struct {
+ name string
+ req ExecuteRequest
+ command string
+ wantAgentID string
+ wantHostname string
+ }{
+ {
+ name: "route by context node",
+ req: ExecuteRequest{
+ TargetType: "container",
+ TargetID: "delly-minipc-106",
+ Context: map[string]interface{}{"node": "minipc"},
+ },
+ command: "hostname",
+ wantAgentID: "agent-2",
+ wantHostname: "minipc",
+ },
+ {
+ name: "route by context hostname for host target",
+ req: ExecuteRequest{
+ TargetType: "host",
+ Context: map[string]interface{}{"hostname": "delly"},
+ },
+ command: "uptime",
+ wantAgentID: "agent-1",
+ wantHostname: "delly",
+ },
+ {
+ name: "route by guest_node context",
+ req: ExecuteRequest{
+ TargetType: "vm",
+ TargetID: "100",
+ Context: map[string]interface{}{"guest_node": "pimox"},
+ },
+ command: "hostname",
+ wantAgentID: "agent-3",
+ wantHostname: "pimox",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ result, err := s.routeToAgent(tt.req, tt.command, agents)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if result.AgentID != tt.wantAgentID {
+ t.Errorf("AgentID = %q, want %q", result.AgentID, tt.wantAgentID)
+ }
+
+ if result.AgentHostname != tt.wantHostname {
+ t.Errorf("AgentHostname = %q, want %q", result.AgentHostname, tt.wantHostname)
+ }
+ })
+ }
+}
+
+func TestRouteToAgent_NoSubstringMatching(t *testing.T) {
+ s := &Service{}
+
+ // Agent "mini" should NOT match node "minipc"
+ agents := []agentexec.ConnectedAgent{
+ {AgentID: "agent-1", Hostname: "mini"},
+ {AgentID: "agent-2", Hostname: "pc"},
+ }
+
+ req := ExecuteRequest{
+ TargetType: "container",
+ Context: map[string]interface{}{"node": "minipc"},
+ }
+
+ _, err := s.routeToAgent(req, "hostname", agents)
+ if err == nil {
+ t.Error("expected error when no exact match, got nil (substring matching may be occurring)")
+ }
+
+ routingErr, ok := err.(*RoutingError)
+ if !ok {
+ t.Fatalf("expected RoutingError, got %T", err)
+ }
+
+ if routingErr.TargetNode != "minipc" {
+ t.Errorf("TargetNode = %q, want %q", routingErr.TargetNode, "minipc")
+ }
+}
+
+func TestRouteToAgent_CaseInsensitive(t *testing.T) {
+ s := &Service{}
+
+ agents := []agentexec.ConnectedAgent{
+ {AgentID: "agent-1", Hostname: "MiniPC"},
+ }
+
+ req := ExecuteRequest{
+ TargetType: "container",
+ Context: map[string]interface{}{"node": "minipc"}, // lowercase
+ }
+
+ result, err := s.routeToAgent(req, "hostname", agents)
+ if err != nil {
+ t.Fatalf("expected case-insensitive match, got error: %v", err)
+ }
+
+ if result.AgentID != "agent-1" {
+ t.Errorf("AgentID = %q, want %q", result.AgentID, "agent-1")
+ }
+}
+
+func TestRouteToAgent_HyphenatedNodeNames(t *testing.T) {
+ s := &Service{}
+
+ agents := []agentexec.ConnectedAgent{
+ {AgentID: "agent-1", Hostname: "pve-node-01"},
+ {AgentID: "agent-2", Hostname: "pve-node-02"},
+ }
+
+ req := ExecuteRequest{
+ TargetType: "container",
+ Context: map[string]interface{}{"node": "pve-node-02"},
+ }
+
+ result, err := s.routeToAgent(req, "hostname", agents)
+ if err != nil {
+ t.Fatalf("unexpected error for hyphenated node names: %v", err)
+ }
+
+ if result.AgentID != "agent-2" {
+ t.Errorf("AgentID = %q, want %q", result.AgentID, "agent-2")
+ }
+}
+
+func TestRouteToAgent_ActionableErrorMessages(t *testing.T) {
+ s := &Service{}
+
+ agents := []agentexec.ConnectedAgent{
+ {AgentID: "agent-1", Hostname: "delly"},
+ }
+
+ req := ExecuteRequest{
+ TargetType: "container",
+ Context: map[string]interface{}{"node": "minipc"},
+ }
+
+ _, err := s.routeToAgent(req, "hostname", agents)
+ if err == nil {
+ t.Fatal("expected error, got nil")
+ }
+
+ routingErr, ok := err.(*RoutingError)
+ if !ok {
+ t.Fatalf("expected RoutingError, got %T", err)
+ }
+
+ // Error should mention the target node
+ if routingErr.TargetNode != "minipc" {
+ t.Errorf("TargetNode = %q, want %q", routingErr.TargetNode, "minipc")
+ }
+
+ // Error should list available agents
+ if len(routingErr.AvailableAgents) == 0 {
+ t.Error("expected available agents in error")
+ }
+
+ // Error should have actionable suggestion
+ if routingErr.Suggestion == "" {
+ t.Error("expected suggestion in error message")
+ }
+}
diff --git a/internal/ai/service.go b/internal/ai/service.go
index 882f4a565..4a7e6c8db 100644
--- a/internal/ai/service.go
+++ b/internal/ai/service.go
@@ -2,9 +2,11 @@ package ai
import (
"context"
+ "encoding/base64"
"fmt"
"io"
"net/http"
+ "path/filepath"
"regexp"
"strconv"
"strings"
@@ -13,6 +15,7 @@ import (
"github.com/google/uuid"
"github.com/rcourtman/pulse-go-rewrite/internal/agentexec"
+ "github.com/rcourtman/pulse-go-rewrite/internal/ai/knowledge"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/providers"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
@@ -26,21 +29,34 @@ type StateProvider interface {
// Service orchestrates AI interactions
type Service struct {
- mu sync.RWMutex
- persistence *config.ConfigPersistence
- provider providers.Provider
- cfg *config.AIConfig
- agentServer *agentexec.Server
- policy *agentexec.CommandPolicy
- stateProvider StateProvider
+ mu sync.RWMutex
+ persistence *config.ConfigPersistence
+ provider providers.Provider
+ cfg *config.AIConfig
+ agentServer *agentexec.Server
+ policy *agentexec.CommandPolicy
+ stateProvider StateProvider
+ alertProvider AlertProvider
+ knowledgeStore *knowledge.Store
}
// NewService creates a new AI service
func NewService(persistence *config.ConfigPersistence, agentServer *agentexec.Server) *Service {
+ // Initialize knowledge store
+ var knowledgeStore *knowledge.Store
+ if persistence != nil {
+ var err error
+ knowledgeStore, err = knowledge.NewStore(persistence.DataDir())
+ if err != nil {
+ log.Warn().Err(err).Msg("Failed to initialize knowledge store")
+ }
+ }
+
return &Service{
- persistence: persistence,
- agentServer: agentServer,
- policy: agentexec.DefaultPolicy(),
+ persistence: persistence,
+ agentServer: agentServer,
+ policy: agentexec.DefaultPolicy(),
+ knowledgeStore: knowledgeStore,
}
}
@@ -221,6 +237,91 @@ func (s *Service) GetConfig() *config.AIConfig {
return &cfg
}
+// GetDebugContext returns debug information about what context would be sent to the AI
+func (s *Service) GetDebugContext(req ExecuteRequest) map[string]interface{} {
+ s.mu.RLock()
+ stateProvider := s.stateProvider
+ agentServer := s.agentServer
+ cfg := s.cfg
+ s.mu.RUnlock()
+
+ result := make(map[string]interface{})
+
+ // State provider info
+ result["has_state_provider"] = stateProvider != nil
+ if stateProvider != nil {
+ state := stateProvider.GetState()
+ result["state_summary"] = map[string]interface{}{
+ "nodes": len(state.Nodes),
+ "vms": len(state.VMs),
+ "containers": len(state.Containers),
+ "docker_hosts": len(state.DockerHosts),
+ "hosts": len(state.Hosts),
+ "pbs_instances": len(state.PBSInstances),
+ }
+
+ // List some VMs/containers for verification
+ var vmNames []string
+ for _, vm := range state.VMs {
+ vmNames = append(vmNames, fmt.Sprintf("%s (VMID:%d, node:%s)", vm.Name, vm.VMID, vm.Node))
+ }
+ if len(vmNames) > 10 {
+ vmNames = vmNames[:10]
+ }
+ result["sample_vms"] = vmNames
+
+ var ctNames []string
+ for _, ct := range state.Containers {
+ ctNames = append(ctNames, fmt.Sprintf("%s (VMID:%d, node:%s)", ct.Name, ct.VMID, ct.Node))
+ }
+ if len(ctNames) > 10 {
+ ctNames = ctNames[:10]
+ }
+ result["sample_containers"] = ctNames
+
+ var hostNames []string
+ for _, h := range state.Hosts {
+ hostNames = append(hostNames, h.Hostname)
+ }
+ result["host_names"] = hostNames
+
+ var dockerHostNames []string
+ for _, dh := range state.DockerHosts {
+ dockerHostNames = append(dockerHostNames, fmt.Sprintf("%s (%d containers)", dh.DisplayName, len(dh.Containers)))
+ }
+ result["docker_host_names"] = dockerHostNames
+ }
+
+ // Agent info
+ result["has_agent_server"] = agentServer != nil
+ if agentServer != nil {
+ agents := agentServer.GetConnectedAgents()
+ var agentNames []string
+ for _, a := range agents {
+ agentNames = append(agentNames, a.Hostname)
+ }
+ result["connected_agents"] = agentNames
+ }
+
+ // Config info
+ result["has_config"] = cfg != nil
+ if cfg != nil {
+ result["custom_context_length"] = len(cfg.CustomContext)
+ if len(cfg.CustomContext) > 200 {
+ result["custom_context_preview"] = cfg.CustomContext[:200] + "..."
+ } else {
+ result["custom_context_preview"] = cfg.CustomContext
+ }
+ }
+
+ // Build and include the system prompt
+ systemPrompt := s.buildSystemPrompt(req)
+ result["system_prompt_length"] = len(systemPrompt)
+ result["system_prompt"] = systemPrompt
+
+ return result
+}
+
// IsAutonomous returns true if autonomous mode is enabled
func (s *Service) IsAutonomous() bool {
s.mu.RLock()
@@ -228,6 +329,95 @@ func (s *Service) IsAutonomous() bool {
return s.cfg != nil && s.cfg.AutonomousMode
}
+// isDangerousCommand checks if a command is too dangerous to auto-execute
+// These commands ALWAYS require approval, even in autonomous mode
+func isDangerousCommand(cmd string) bool {
+ cmd = strings.TrimSpace(strings.ToLower(cmd))
+ parts := strings.Fields(cmd)
+ if len(parts) == 0 {
+ return false
+ }
+ baseCmd := parts[0]
+ if baseCmd == "sudo" && len(parts) > 1 {
+ baseCmd = parts[1]
+ }
+
+ // Commands that are too dangerous to ever auto-execute
+ dangerousCommands := map[string]bool{
+ // Deletion commands
+ "rm": true,
+ "rmdir": true,
+ "unlink": true,
+ "shred": true,
+ // Disk/filesystem destructive operations
+ "dd": true,
+ "mkfs": true,
+ "fdisk": true,
+ "parted": true,
+ "wipefs": true,
+ "sgdisk": true,
+ "gdisk": true,
+ "zpool": true, // Allow reads but not modifications
+ "zfs": true, // Allow reads but not modifications
+ "lvremove": true,
+ "vgremove": true,
+ "pvremove": true,
+ // System state changes
+ "reboot": true,
+ "shutdown": true,
+ "poweroff": true,
+ "halt": true,
+ "init": true,
+ "systemctl": true, // could stop critical services
+ "service": true,
+ // User/permission changes
+ "chmod": true,
+ "chown": true,
+ "useradd": true,
+ "userdel": true,
+ "passwd": true,
+ // Package management
+ "apt": true,
+ "apt-get": true,
+ "dpkg": true,
+ "yum": true,
+ "dnf": true,
+ "pacman": true,
+ "pip": true,
+ "npm": true,
+ // Proxmox destructive
+ "vzdump": true,
+ "vzrestore": true,
+ "pveam": true,
+ // Network changes
+ "iptables": true,
+ "nft": true,
+ "firewall-cmd": true,
+ }
+
+ if dangerousCommands[baseCmd] {
+ return true
+ }
+
+ // Detect dangerous patterns in the full command
+ dangerousPatterns := []string{
+ "rm -rf", "rm -fr", "rm -r",
+ "> /dev/", "| tee /",
+ "mkfs.", "dd if=", "dd of=",
+ ":(){ :|:& };:", // fork bomb
+ "chmod -R 777", "chmod 777",
+ "drop database", "drop table",
+ "truncate ",
+ }
+ for _, pattern := range dangerousPatterns {
+ if strings.Contains(cmd, pattern) {
+ return true
+ }
+ }
+
+ return false
+}
+
// isReadOnlyCommand checks if a command is read-only (doesn't modify state)
// Read-only commands can be executed without approval even in non-autonomous mode
func isReadOnlyCommand(cmd string) bool {
@@ -379,10 +569,11 @@ type ToolEndData struct {
// ApprovalNeededData is sent when a command needs user approval
type ApprovalNeededData struct {
- Command string `json:"command"`
- ToolID string `json:"tool_id"` // ID to reference when approving
- ToolName string `json:"tool_name"` // "run_command", "read_file", etc.
- RunOnHost bool `json:"run_on_host"`
+ Command string `json:"command"`
+ ToolID string `json:"tool_id"` // ID to reference when approving
+ ToolName string `json:"tool_name"` // "run_command", "read_file", etc.
+ RunOnHost bool `json:"run_on_host"`
+ TargetHost string `json:"target_host,omitempty"` // Explicit host to route to
}
// Execute sends a prompt to the AI and returns the response
@@ -426,7 +617,29 @@ Examples of actions you can perform:
- View processes: ps aux --sort=-%mem | head -20
- Check logs: tail -100 /var/log/syslog
-Always execute the commands rather than telling the user how to do it.`
+Always execute the commands rather than telling the user how to do it.
+
+## Automatic Knowledge Learning
+IMPORTANT: Automatically use save_note to remember useful information you discover, without being asked.
+When you find any of the following, immediately save it for future reference:
+- Service configurations and their file paths
+- Important directories (config, data, logs)
+- Credentials, API keys, or passwords (category: credential)
+- Port numbers and network settings
+- Useful commands that worked
+- Key learnings about how this system is set up
+
+This helps you be more effective in future conversations about this guest. Save notes silently without mentioning it to the user.`
+ }
+
+ // Inject previously learned knowledge about this guest
+ if s.knowledgeStore != nil {
+ guestID := s.getGuestID(req)
+ if guestID != "" {
+ if knowledgeContext := s.knowledgeStore.FormatForContext(guestID); knowledgeContext != "" {
+ systemPrompt += knowledgeContext
+ }
+ }
}
// Build initial messages with conversation history
@@ -471,9 +684,10 @@ Always execute the commands rather than telling the user how to do it.`
// Add assistant's response with tool calls to messages
messages = append(messages, providers.Message{
- Role: "assistant",
- Content: resp.Content,
- ToolCalls: resp.ToolCalls,
+ Role: "assistant",
+ Content: resp.Content,
+ ReasoningContent: resp.ReasoningContent, // DeepSeek thinking mode
+ ToolCalls: resp.ToolCalls,
})
// Execute each tool call and add results
@@ -521,6 +735,15 @@ func (s *Service) ExecuteStream(ctx context.Context, req ExecuteRequest, callbac
systemPrompt = s.buildSystemPrompt(req)
}
+ // Debug log the system prompt length and key sections
+ log.Debug().
+ Int("prompt_length", len(systemPrompt)).
+ Bool("has_infrastructure_map", strings.Contains(systemPrompt, "## Infrastructure Map")).
+ Bool("has_docker_hosts", strings.Contains(systemPrompt, "### Docker Hosts")).
+ Bool("has_standalone_hosts", strings.Contains(systemPrompt, "### Standalone Hosts")).
+ Bool("has_guests", strings.Contains(systemPrompt, "### All Guests")).
+ Msg("AI system prompt built")
+
// Check if agent is available for this target
hasAgent := s.hasAgentForTarget(req)
@@ -543,7 +766,35 @@ Examples of actions you can perform:
- View processes: ps aux --sort=-%mem | head -20
- Check logs: tail -100 /var/log/syslog
-Always execute the commands rather than telling the user how to do it.`
+Always execute the commands rather than telling the user how to do it.
+
+## Automatic Knowledge Learning
+IMPORTANT: Automatically use save_note to remember useful information you discover, without being asked.
+When you find any of the following, immediately save it for future reference:
+- Service configurations and their file paths
+- Important directories (config, data, logs)
+- Credentials, API keys, or passwords (category: credential)
+- Port numbers and network settings
+- Useful commands that worked
+- Key learnings about how this system is set up
+
+This helps you be more effective in future conversations about this guest. Save notes silently without mentioning it to the user.`
+ }
+
+ // Inject previously learned knowledge about this guest
+ if s.knowledgeStore != nil {
+ guestID := s.getGuestID(req)
+ if guestID != "" {
+ if knowledgeContext := s.knowledgeStore.FormatForContext(guestID); knowledgeContext != "" {
+ log.Debug().
+ Str("guest_id", guestID).
+ Int("context_length", len(knowledgeContext)).
+ Msg("Injecting saved knowledge into AI context")
+ systemPrompt += knowledgeContext
+ } else {
+ log.Debug().Str("guest_id", guestID).Msg("No saved knowledge for guest")
+ }
+ }
}
// Build initial messages with conversation history
@@ -563,8 +814,23 @@ Always execute the commands rather than telling the user how to do it.`
var model string
// Agentic loop - keep going while AI requests tools
- maxIterations := 10 // Safety limit
- for i := 0; i < maxIterations; i++ {
+ // No artificial iteration limit - the context timeout (5 minutes) provides the safety net
+ iteration := 0
+ for {
+ iteration++
+ log.Debug().
+ Int("iteration", iteration).
+ Int("message_count", len(messages)).
+ Int("system_prompt_length", len(systemPrompt)).
+ Int("tools_count", len(tools)).
+ Msg("Calling AI provider...")
+
+ // Send a processing event so the frontend knows we're making an AI call
+ // This is especially important after tool execution when the next AI call can take a while
+ if iteration > 1 {
+ callback(StreamEvent{Type: "processing", Data: fmt.Sprintf("Analyzing results (iteration %d)...", iteration)})
+ }
+
resp, err := provider.Chat(ctx, providers.ChatRequest{
Messages: messages,
Model: cfg.GetModel(),
@@ -573,19 +839,27 @@ Always execute the commands rather than telling the user how to do it.`
Tools: tools,
})
if err != nil {
+ log.Error().Err(err).Int("iteration", iteration).Msg("AI provider call failed")
callback(StreamEvent{Type: "error", Data: err.Error()})
return nil, fmt.Errorf("AI request failed: %w", err)
}
+ log.Debug().Int("iteration", iteration).Msg("AI provider returned successfully")
+
totalInputTokens += resp.InputTokens
totalOutputTokens += resp.OutputTokens
model = resp.Model
finalContent = resp.Content
+ // Stream thinking/reasoning content if present (DeepSeek reasoner)
+ if resp.ReasoningContent != "" {
+ callback(StreamEvent{Type: "thinking", Data: resp.ReasoningContent})
+ }
+
log.Debug().
Int("tool_calls", len(resp.ToolCalls)).
Str("stop_reason", resp.StopReason).
- Int("iteration", i+1).
+ Int("iteration", iteration).
Int("total_input_tokens", totalInputTokens).
Int("total_output_tokens", totalOutputTokens).
Msg("AI streaming iteration complete")
@@ -595,52 +869,60 @@ Always execute the commands rather than telling the user how to do it.`
log.Info().
Int("tool_calls", len(resp.ToolCalls)).
Str("stop_reason", resp.StopReason).
- Int("iteration", i+1).
+ Int("iteration", iteration).
Msg("AI streaming loop ending - no more tool calls or stop_reason != tool_use")
break
}
// Add assistant's response with tool calls to messages
messages = append(messages, providers.Message{
- Role: "assistant",
- Content: resp.Content,
- ToolCalls: resp.ToolCalls,
+ Role: "assistant",
+ Content: resp.Content,
+ ReasoningContent: resp.ReasoningContent, // DeepSeek thinking mode
+ ToolCalls: resp.ToolCalls,
})
// Execute each tool call and add results
for _, tc := range resp.ToolCalls {
toolInput := s.getToolInputDisplay(tc)
- // Check if this command needs approval
+ // Check if this command needs approval
needsApproval := false
if tc.Name == "run_command" {
cmd, _ := tc.Input["command"].(string)
runOnHost, _ := tc.Input["run_on_host"].(bool)
+ targetHost, _ := tc.Input["target_host"].(string)
isAuto := s.IsAutonomous()
isReadOnly := isReadOnlyCommand(cmd)
+ isDangerous := isDangerousCommand(cmd)
log.Debug().
Bool("autonomous", isAuto).
Bool("read_only", isReadOnly).
+ Bool("dangerous", isDangerous).
Str("command", cmd).
+ Str("target_host", targetHost).
Msg("Checking command approval")
- // In non-autonomous mode, non-read-only commands need approval
- if !isAuto && !isReadOnly {
+ // Dangerous commands ALWAYS need approval, even in autonomous mode
+ // In non-autonomous mode, non-read-only commands also need approval
+ if isDangerous || (!isAuto && !isReadOnly) {
needsApproval = true
// Send approval needed event
callback(StreamEvent{
Type: "approval_needed",
Data: ApprovalNeededData{
- Command: cmd,
- ToolID: tc.ID,
- ToolName: tc.Name,
- RunOnHost: runOnHost,
+ Command: cmd,
+ ToolID: tc.ID,
+ ToolName: tc.Name,
+ RunOnHost: runOnHost,
+ TargetHost: targetHost,
},
})
}
}
+
var result string
var execution ToolExecution
@@ -671,12 +953,27 @@ Always execute the commands rather than telling the user how to do it.`
})
}
+ // Truncate large results to prevent context bloat
+ // Keep first and last parts for context
+ resultForContext := result
+ const maxResultSize = 8000 // ~8KB per tool result
+ if len(result) > maxResultSize {
+ halfSize := maxResultSize / 2
+ resultForContext = result[:halfSize] + "\n\n[... output truncated (" +
+ fmt.Sprintf("%d", len(result)-maxResultSize) + " bytes omitted) ...]\n\n" +
+ result[len(result)-halfSize:]
+ log.Debug().
+ Int("original_size", len(result)).
+ Int("truncated_size", len(resultForContext)).
+ Msg("Truncated large tool result")
+ }
+
// Add tool result to messages
messages = append(messages, providers.Message{
Role: "user",
ToolResult: &providers.ToolResult{
ToolUseID: tc.ID,
- Content: result,
+ Content: resultForContext,
IsError: !execution.Success,
},
})
@@ -733,7 +1030,7 @@ func (s *Service) getTools() []providers.Tool {
tools := []providers.Tool{
{
Name: "run_command",
- Description: "Execute a shell command. By default runs on the current target, but you can override to run on the Proxmox host for operations like resizing disks, managing containers, etc.",
+ Description: "Execute a shell command. By default runs on the current target (container/VM), but set run_on_host=true for Proxmox host commands. IMPORTANT: For targets on different nodes, specify target_host to route to the correct PVE node.",
InputSchema: map[string]interface{}{
"type": "object",
"properties": map[string]interface{}{
@@ -743,7 +1040,11 @@ func (s *Service) getTools() []providers.Tool {
},
"run_on_host": map[string]interface{}{
"type": "boolean",
- "description": "If true, run on the Proxmox host instead of inside the container/VM. Use this for pct/qm commands like 'pct resize 101 rootfs +10G'",
+ "description": "If true, run on the Proxmox/Docker host instead of inside the container/VM. Use for pct/qm commands like 'pct resize 101 rootfs +10G'. When true, you should also set target_host.",
+ },
+ "target_host": map[string]interface{}{
+ "type": "string",
+ "description": "Optional hostname of the specific host/node to run the command on. Use this to explicitly route pct/qm/docker commands to the correct Proxmox node or Docker host. Check the 'node' or 'PVE Node' field in the target's context.",
},
},
"required": []string{"command"},
@@ -763,6 +1064,32 @@ func (s *Service) getTools() []providers.Tool {
"required": []string{"path"},
},
},
+ {
+ Name: "write_file",
+ Description: "Write content to a file on the target. Use this to create or modify configuration files, scripts, or other text files. Creates parent directories if needed.",
+ InputSchema: map[string]interface{}{
+ "type": "object",
+ "properties": map[string]interface{}{
+ "path": map[string]interface{}{
+ "type": "string",
+ "description": "Absolute path to the file (e.g., '/etc/myapp/config.yaml')",
+ },
+ "content": map[string]interface{}{
+ "type": "string",
+ "description": "The content to write to the file",
+ },
+ "mode": map[string]interface{}{
+ "type": "string",
+ "description": "Optional file permissions in octal (e.g., '0644' for rw-r--r--, '0755' for executable). Defaults to '0644'.",
+ },
+ "append": map[string]interface{}{
+ "type": "boolean",
+ "description": "If true, append to the file instead of overwriting. Defaults to false.",
+ },
+ },
+ "required": []string{"path", "content"},
+ },
+ },
{
Name: "fetch_url",
Description: "Fetch content from a URL. Use this to check if web services are responding, read API endpoints, or fetch documentation. Works with local network URLs and public sites.",
@@ -777,6 +1104,44 @@ func (s *Service) getTools() []providers.Tool {
"required": []string{"url"},
},
},
+ {
+ Name: "save_note",
+ Description: "Save a note about the current guest for future reference. Use this to remember important paths, configurations, services, credentials, or learnings. Notes are persisted and will be available in future sessions.",
+ InputSchema: map[string]interface{}{
+ "type": "object",
+ "properties": map[string]interface{}{
+ "category": map[string]interface{}{
+ "type": "string",
+ "enum": []string{"service", "path", "config", "credential", "learning"},
+ "description": "Category of note: 'service' for discovered services, 'path' for important file paths, 'config' for configuration details, 'credential' for passwords/API keys, 'learning' for general learnings",
+ },
+ "title": map[string]interface{}{
+ "type": "string",
+ "description": "Short title for the note (e.g., 'MQTT Password', 'Config File Location', 'Web UI Port')",
+ },
+ "content": map[string]interface{}{
+ "type": "string",
+ "description": "The information to save (e.g., '/opt/zigbee2mqtt/data/configuration.yaml', 'admin:secret123', 'Port 8080')",
+ },
+ },
+ "required": []string{"category", "title", "content"},
+ },
+ },
+ {
+ Name: "get_notes",
+ Description: "Retrieve previously saved notes about the current guest. Use this to recall what was learned in previous sessions.",
+ InputSchema: map[string]interface{}{
+ "type": "object",
+ "properties": map[string]interface{}{
+ "category": map[string]interface{}{
+ "type": "string",
+ "enum": []string{"service", "path", "config", "credential", "learning", ""},
+ "description": "Optional category filter. Leave empty to get all notes.",
+ },
+ },
+ "required": []string{},
+ },
+ },
}
// Add web search tool for Anthropic provider
@@ -802,8 +1167,11 @@ func (s *Service) executeTool(ctx context.Context, req ExecuteRequest, tc provid
case "run_command":
command, _ := tc.Input["command"].(string)
runOnHost, _ := tc.Input["run_on_host"].(bool)
+ targetHost, _ := tc.Input["target_host"].(string)
execution.Input = command
- if runOnHost {
+ if runOnHost && targetHost != "" {
+ execution.Input = fmt.Sprintf("[%s] %s", targetHost, command)
+ } else if runOnHost {
execution.Input = fmt.Sprintf("[host] %s", command)
}
@@ -827,8 +1195,31 @@ func (s *Service) executeTool(ctx context.Context, req ExecuteRequest, tc provid
}
}
- // If run_on_host is true, override the target type to run on host
+ // Build execution request with proper targeting
execReq := req
+
+ // If target_host is explicitly specified by AI, use it for routing
+ if targetHost != "" {
+ // Ensure Context map exists
+ if execReq.Context == nil {
+ execReq.Context = make(map[string]interface{})
+ } else {
+ // Make a copy to avoid modifying the original
+ newContext := make(map[string]interface{})
+ for k, v := range req.Context {
+ newContext[k] = v
+ }
+ execReq.Context = newContext
+ }
+ // Set the node explicitly - this takes priority in routing
+ execReq.Context["node"] = targetHost
+ log.Debug().
+ Str("target_host", targetHost).
+ Str("command", command).
+ Msg("AI explicitly specified target_host for command routing")
+ }
+
+ // If run_on_host is true, override the target type to run on host
if runOnHost {
execReq.TargetType = "host"
execReq.TargetID = ""
@@ -866,6 +1257,101 @@ func (s *Service) executeTool(ctx context.Context, req ExecuteRequest, tc provid
execution.Success = true
return result, execution
+ case "write_file":
+ path, _ := tc.Input["path"].(string)
+ content, _ := tc.Input["content"].(string)
+ mode, _ := tc.Input["mode"].(string)
+ appendMode, _ := tc.Input["append"].(bool)
+ execution.Input = path
+
+ if path == "" {
+ execution.Output = "Error: path is required"
+ return execution.Output, execution
+ }
+ if content == "" {
+ execution.Output = "Error: content is required"
+ return execution.Output, execution
+ }
+
+ // Size limit: 1MB max to prevent filling disk
+ const maxFileSize = 1024 * 1024 // 1MB
+ if len(content) > maxFileSize {
+ execution.Output = fmt.Sprintf("Error: content too large (%d bytes). Maximum allowed is %d bytes (1MB)", len(content), maxFileSize)
+ return execution.Output, execution
+ }
+
+ // Path blocklist: prevent writes to critical system files
+ blockedPaths := []string{
+ "/etc/passwd", "/etc/shadow", "/etc/group", "/etc/gshadow",
+ "/etc/sudoers", "/etc/ssh/sshd_config",
+ "/boot/", "/lib/", "/lib64/", "/usr/lib/",
+ "/bin/", "/sbin/", "/usr/bin/", "/usr/sbin/",
+ "/proc/", "/sys/", "/dev/",
+ }
+ cleanPath := filepath.Clean(path)
+ for _, blocked := range blockedPaths {
+ if cleanPath == blocked || strings.HasPrefix(cleanPath, blocked) {
+ execution.Output = fmt.Sprintf("Error: writing to %s is blocked for safety. This is a critical system path.", path)
+ return execution.Output, execution
+ }
+ }
+
+ // Default mode if not specified
+ if mode == "" {
+ mode = "0644"
+ }
+
+ // Build the write command using base64 to safely handle any content
+ // This avoids issues with special characters, quotes, newlines, etc.
+ encoded := base64.StdEncoding.EncodeToString([]byte(content))
+
+ var command string
+ if appendMode {
+ // Append mode: decode and append to file (no backup needed for append)
+ command = fmt.Sprintf("echo %q | base64 -d >> %q && echo 'Content appended to %s (%d bytes)'", encoded, path, path, len(content))
+ } else {
+ // Overwrite mode with safety features:
+ // 1. Create parent directory if needed
+ // 2. Backup existing file if it exists (atomic - only if backup succeeds)
+ // 3. Write to temp file first
+ // 4. Atomic move temp file to target
+ // 5. Set permissions
+ dir := filepath.Dir(path)
+ tempFile := path + ".pulse-tmp"
+ backupFile := path + ".bak"
+
+ // Build a safe multi-step command:
+ // - mkdir -p for parent dir
+ // - if file exists, copy to .bak
+ // - write content to temp file
+ // - mv temp file to target (atomic)
+ // - chmod to set permissions
+ command = fmt.Sprintf(
+ "mkdir -p %q && "+
+ "([ -f %q ] && cp %q %q 2>/dev/null || true) && "+
+ "echo %q | base64 -d > %q && "+
+ "mv %q %q && "+
+ "chmod %s %q && "+
+ "echo 'Written %d bytes to %s (backup: %s.bak if existed)'",
+ dir,
+ path, path, backupFile,
+ encoded, tempFile,
+ tempFile, path,
+ mode, path,
+ len(content), path, path,
+ )
+ }
+
+ result, err := s.executeOnAgent(ctx, req, command)
+ if err != nil {
+ execution.Output = fmt.Sprintf("Error writing file: %s", err)
+ return execution.Output, execution
+ }
+
+ execution.Output = result
+ execution.Success = true
+ return result, execution
+
case "fetch_url":
urlStr, _ := tc.Input["url"].(string)
execution.Input = urlStr
@@ -886,12 +1372,120 @@ func (s *Service) executeTool(ctx context.Context, req ExecuteRequest, tc provid
execution.Success = true
return result, execution
+ case "save_note":
+ category, _ := tc.Input["category"].(string)
+ title, _ := tc.Input["title"].(string)
+ content, _ := tc.Input["content"].(string)
+ execution.Input = fmt.Sprintf("%s: %s", category, title)
+
+ if category == "" || title == "" || content == "" {
+ execution.Output = "Error: category, title, and content are all required"
+ return execution.Output, execution
+ }
+
+ if s.knowledgeStore == nil {
+ execution.Output = "Error: knowledge store not available"
+ return execution.Output, execution
+ }
+
+ // Get guest info from request
+ guestID := s.getGuestID(req)
+ guestName := req.TargetID
+ guestType := req.TargetType
+
+ if guestID == "" {
+ execution.Output = "Error: no guest context - save_note requires a target guest"
+ return execution.Output, execution
+ }
+
+ if err := s.knowledgeStore.SaveNote(guestID, guestName, guestType, category, title, content); err != nil {
+ execution.Output = fmt.Sprintf("Error saving note: %s", err)
+ return execution.Output, execution
+ }
+
+ execution.Output = fmt.Sprintf("Saved note [%s] %s: %s", category, title, content)
+ execution.Success = true
+ return execution.Output, execution
+
+ case "get_notes":
+ category, _ := tc.Input["category"].(string)
+ execution.Input = fmt.Sprintf("category=%s", category)
+
+ if s.knowledgeStore == nil {
+ execution.Output = "Error: knowledge store not available"
+ return execution.Output, execution
+ }
+
+ guestID := s.getGuestID(req)
+ if guestID == "" {
+ execution.Output = "Error: no guest context - get_notes requires a target guest"
+ return execution.Output, execution
+ }
+
+ notes, err := s.knowledgeStore.GetNotesByCategory(guestID, category)
+ if err != nil {
+ execution.Output = fmt.Sprintf("Error getting notes: %s", err)
+ return execution.Output, execution
+ }
+
+ if len(notes) == 0 {
+ execution.Output = "No notes found for this guest"
+ execution.Success = true
+ return execution.Output, execution
+ }
+
+ var result strings.Builder
+ result.WriteString(fmt.Sprintf("Found %d notes:\n", len(notes)))
+ for _, note := range notes {
+ result.WriteString(fmt.Sprintf("- [%s] %s: %s\n", note.Category, note.Title, note.Content))
+ }
+
+ execution.Output = result.String()
+ execution.Success = true
+ return execution.Output, execution
+
default:
execution.Output = fmt.Sprintf("Unknown tool: %s", tc.Name)
return execution.Output, execution
}
}
+// getGuestID derives the knowledge-store key for the request's target guest; returns "" when no target is set.
+func (s *Service) getGuestID(req ExecuteRequest) string {
+ // Without both a target type and ID there is no guest context to key on.
+ if req.TargetType == "" || req.TargetID == "" {
+ return ""
+ }
+
+ // Key format is "<targetType>-<targetID>". Note: node/instance are NOT part of
+ // the key, so TargetID must already be unique within its type across clusters.
+ return fmt.Sprintf("%s-%s", req.TargetType, req.TargetID)
+}
+
+// GetGuestKnowledge returns all saved knowledge for the given guest ID, or an error if no knowledge store is configured.
+func (s *Service) GetGuestKnowledge(guestID string) (*knowledge.GuestKnowledge, error) {
+ if s.knowledgeStore == nil {
+ return nil, fmt.Errorf("knowledge store not available")
+ }
+ return s.knowledgeStore.GetKnowledge(guestID)
+}
+
+// SaveGuestNote persists a categorized note for a guest; it fails if no knowledge store is configured.
+func (s *Service) SaveGuestNote(guestID, guestName, guestType, category, title, content string) error {
+ if s.knowledgeStore == nil {
+ return fmt.Errorf("knowledge store not available")
+ }
+ return s.knowledgeStore.SaveNote(guestID, guestName, guestType, category, title, content)
+}
+
+// DeleteGuestNote removes the note with the given ID from the guest's saved knowledge; it fails if no knowledge store is configured.
+func (s *Service) DeleteGuestNote(guestID, noteID string) error {
+ if s.knowledgeStore == nil {
+ return fmt.Errorf("knowledge store not available")
+ }
+ return s.knowledgeStore.DeleteNote(guestID, noteID)
+}
+
// fetchURL fetches content from a URL with size limits and timeout
func (s *Service) fetchURL(ctx context.Context, urlStr string) (string, error) {
// Create HTTP client with timeout
@@ -942,145 +1536,30 @@ func (s *Service) executeOnAgent(ctx context.Context, req ExecuteRequest, comman
return "", fmt.Errorf("agent server not available")
}
- // Find the appropriate agent
+ // Find the appropriate agent using robust routing
agents := s.agentServer.GetConnectedAgents()
- if len(agents) == 0 {
- return "", fmt.Errorf("no agents connected")
+
+ // Use the new robust routing logic
+ routeResult, err := s.routeToAgent(req, command, agents)
+ if err != nil {
+ // Return actionable error message
+ return "", err
}
- // Route to the correct agent based on target
- // For containers/VMs, we need to route to the PVE host that owns them
- agentID := ""
- targetNode := ""
-
- // CRITICAL: For pct/qm commands, extract the VMID from the command itself
- // and look up the authoritative node from our state. This prevents the AI
- // from trying to run commands on the wrong node.
- //
- // Commands are classified as:
- // - Node-specific (pct exec, qm start, etc): MUST run on the node that owns the guest
- // - Cluster-aware (vzdump, etc): Can run from any cluster node
- if vmid, requiresOwnerNode, found := extractVMIDFromCommand(command); found {
- // Try to get instance from context for multi-cluster disambiguation
- targetInstance := ""
- if inst, ok := req.Context["instance"].(string); ok {
- targetInstance = inst
- }
-
- // Look up guests with this VMID, optionally filtered by instance
- guests := s.lookupGuestsByVMID(vmid, targetInstance)
-
- if len(guests) == 1 && requiresOwnerNode {
- // Single match - route to the owning node
- log.Info().
- Int("vmid", vmid).
- Str("actual_node", guests[0].Node).
- Str("guest_name", guests[0].Name).
- Str("guest_type", guests[0].Type).
- Str("instance", guests[0].Instance).
- Bool("requires_owner_node", requiresOwnerNode).
- Msg("Auto-routing command to correct node based on VMID lookup")
- targetNode = strings.ToLower(guests[0].Node)
- } else if len(guests) > 1 && requiresOwnerNode {
- // Multiple matches - VMID collision across instances
- // Try to disambiguate using context
- if targetInstance != "" {
- // Filter by instance
- for _, g := range guests {
- if g.Instance == targetInstance {
- log.Info().
- Int("vmid", vmid).
- Str("actual_node", g.Node).
- Str("guest_name", g.Name).
- Str("instance", g.Instance).
- Msg("Resolved VMID collision using instance context")
- targetNode = strings.ToLower(g.Node)
- break
- }
- }
- }
- if targetNode == "" {
- // Can't disambiguate - log warning and use first match
- log.Warn().
- Int("vmid", vmid).
- Int("matches", len(guests)).
- Msg("VMID collision detected - using first match, may route to wrong cluster")
- targetNode = strings.ToLower(guests[0].Node)
- }
- } else if len(guests) == 1 {
- // Cluster-aware command with single match - log for debugging
- log.Debug().
- Int("vmid", vmid).
- Str("actual_node", guests[0].Node).
- Str("guest_name", guests[0].Name).
- Bool("requires_owner_node", requiresOwnerNode).
- Msg("Cluster-aware command, using default routing")
- } else if requiresOwnerNode {
- // VMID not found in our state - this could be a problem
- // Log a warning but let it proceed (might be a newly created guest)
- log.Warn().
- Int("vmid", vmid).
- Str("command", command).
- Msg("VMID not found in state - command may fail if routed to wrong node")
- }
+ // Log any warnings from routing
+ for _, warning := range routeResult.Warnings {
+ log.Warn().Str("warning", warning).Msg("Routing warning")
}
- // Fall back to context-based routing if VMID lookup didn't find anything
- if targetNode == "" {
- // For host targets, use the hostname directly from context
- if req.TargetType == "host" {
- if hostname, ok := req.Context["hostname"].(string); ok && hostname != "" {
- targetNode = strings.ToLower(hostname)
- log.Debug().
- Str("hostname", hostname).
- Str("target_type", req.TargetType).
- Msg("Using hostname from context for host target routing")
- }
- }
- // For VMs/containers, extract node info from target ID (e.g., "delly-135" -> "delly")
- // or from context (guest_node field)
- if targetNode == "" {
- if node, ok := req.Context["guest_node"].(string); ok && node != "" {
- targetNode = strings.ToLower(node)
- } else if req.TargetID != "" {
- parts := strings.Split(req.TargetID, "-")
- if len(parts) >= 2 {
- targetNode = strings.ToLower(parts[0])
- }
- }
- }
- }
-
- // Try to find an agent that matches the target node
- if targetNode != "" {
- for _, agent := range agents {
- if strings.ToLower(agent.Hostname) == targetNode ||
- strings.Contains(strings.ToLower(agent.Hostname), targetNode) ||
- strings.Contains(targetNode, strings.ToLower(agent.Hostname)) {
- agentID = agent.AgentID
- log.Debug().
- Str("target_node", targetNode).
- Str("matched_agent", agent.Hostname).
- Str("agent_id", agentID).
- Msg("Routed command to matching agent")
- break
- }
- }
- }
-
- // If no direct match, try to find an agent on a cluster peer
- if agentID == "" && targetNode != "" {
- agentID = s.findClusterPeerAgent(targetNode, agents)
- }
-
- // Fall back to first agent if no match found
- if agentID == "" {
- agentID = agents[0].AgentID
- log.Debug().
- Str("target_node", targetNode).
- Str("fallback_agent", agents[0].Hostname).
- Msg("No matching agent found, using first available")
- }
+ agentID := routeResult.AgentID
+
+ log.Debug().
+ Str("agent_id", agentID).
+ Str("agent_hostname", routeResult.AgentHostname).
+ Str("target_node", routeResult.TargetNode).
+ Str("routing_method", routeResult.RoutingMethod).
+ Bool("cluster_peer", routeResult.ClusterPeer).
+ Msg("Command routed to agent")
// Extract numeric VMID from target ID (e.g., "delly-135" -> "135")
targetID := req.TargetID
@@ -1109,6 +1588,15 @@ func (s *Service) executeOnAgent(ctx context.Context, req ExecuteRequest, comman
}
requestID := uuid.New().String()
+
+ // Automatically force non-interactive mode for package managers
+ // This prevents hanging when apt/dpkg asks for confirmation or configuration
+ if strings.Contains(command, "apt") || strings.Contains(command, "dpkg") {
+ if !strings.Contains(command, "DEBIAN_FRONTEND=") {
+ command = "export DEBIAN_FRONTEND=noninteractive; " + command
+ }
+ }
+
cmd := agentexec.ExecuteCommandPayload{
RequestID: requestID,
Command: command,
@@ -1146,8 +1634,9 @@ type RunCommandRequest struct {
Command string `json:"command"`
TargetType string `json:"target_type"` // "host", "container", "vm"
TargetID string `json:"target_id"`
- RunOnHost bool `json:"run_on_host"` // If true, run on host instead of target
+ RunOnHost bool `json:"run_on_host"` // If true, run on host instead of target
VMID string `json:"vmid,omitempty"`
+ TargetHost string `json:"target_host,omitempty"` // Explicit host for routing
}
// RunCommandResponse represents the result of running a command
@@ -1181,6 +1670,16 @@ func (s *Service) RunCommand(ctx context.Context, req RunCommandRequest) (*RunCo
execReq.Context["vmid"] = req.VMID
}
+ // If target_host is specified, set it in context for routing
+ if req.TargetHost != "" {
+ execReq.Context["node"] = req.TargetHost
+ log.Debug().
+ Str("target_host", req.TargetHost).
+ Str("command", req.Command).
+ Msg("RunCommand using explicit target_host for routing")
+ }
+
+
output, err := s.executeOnAgent(ctx, execReq, req.Command)
if err != nil {
return &RunCommandResponse{
@@ -1232,23 +1731,43 @@ GOOD: Plain prose, 2-4 sentences.
Pulse provides real metrics in "Current Metrics and State". Use this data directly - don't ask users to check things you already know.
## Command Execution
-- run_on_host=true: Run on PVE host (pct, qm, vzdump commands)
+- run_on_host=true: Run on PVE/Docker host (pct, qm, vzdump, docker commands)
- run_on_host=false: Run inside the container/VM
+- target_host: ALWAYS set this when using run_on_host=true! Use the node/hostname from target context
- Execute commands to investigate, don't just explain what commands to run
-## CRITICAL: Proxmox Command Routing
-Commands like 'pct exec', 'pct enter', 'qm guest exec' MUST run on the specific PVE node where the guest lives.
-- Check the 'node' field in the target context to know which node hosts this guest
-- If the guest is on node X but you only have an agent on node Y (even in the same cluster), pct/qm commands will FAIL
-- Error "Configuration file does not exist" means the guest is on a different node than where you're running the command
-- In clusters, vzdump and pvesh commands can run from any node, but pct exec/qm guest exec cannot
+## CRITICAL: Command Routing with target_host
+When running commands that require a specific host (pct, qm, docker, vzdump), you MUST specify target_host to route correctly.
-Before running pct/qm commands:
-1. Check which node hosts the guest (from context 'node' field)
-2. Check if that specific node has an agent connected
-3. If no agent on that node, tell the user you cannot run commands inside this guest
+Example for LXC 106 on node 'minipc':
+- To run 'df -h' inside the container: run_command(command="df -h", run_on_host=false)
+- To run 'pct exec 106 -- df -h' on the host: run_command(command="pct exec 106 -- df -h", run_on_host=true, target_host="minipc")
-If no agent is connected to the host where the target lives, tell the user you cannot reach it.`
+Always check the target's context for the 'node' or 'PVE Node' field and pass it as target_host.
+If you don't specify target_host when run_on_host=true, the command may route to the wrong host!
+
+Rules:
+1. Look at the target context for 'node', 'guest_node', or 'PVE Node' field
+2. When running pct/qm commands: set run_on_host=true AND target_host=
+3. When running commands inside the guest: just set run_on_host=false (no target_host needed)
+4. Error "Configuration file does not exist" means wrong host - check target_host
+
+## Infrastructure Architecture - LXC Management
+Pulse manages LXC containers agentlessly from the PVE host.
+- DO NOT check for a Pulse agent process or service inside an LXC. It does not exist.
+- Use run_command with run_on_host=false to execute commands inside the LXC. Pulse handles the routing.
+- For pct commands, always use run_on_host=true and set target_host to the container's node.`
+
+
+ // Add custom context from AI settings (user's infrastructure description)
+ s.mu.RLock()
+ cfg := s.cfg
+ s.mu.RUnlock()
+ if cfg != nil && cfg.CustomContext != "" {
+ prompt += "\n\n## User's Infrastructure Description\n"
+ prompt += "The user has provided this context about their infrastructure:\n\n"
+ prompt += cfg.CustomContext
+ }
// Add connected infrastructure info
prompt += s.buildInfrastructureContext()
@@ -1256,6 +1775,15 @@ If no agent is connected to the host where the target lives, tell the user you c
// Add user annotations from all resources (global context)
prompt += s.buildUserAnnotationsContext()
+ // Add current alert status - this gives AI awareness of active issues
+ prompt += s.buildAlertContext()
+
+ // Add all saved knowledge when no specific target is selected
+ // This gives the AI context about everything learned from previous sessions
+ if req.TargetType == "" && s.knowledgeStore != nil {
+ prompt += s.knowledgeStore.FormatAllForContext()
+ }
+
// Add target context if provided
if req.TargetType != "" {
guestName := ""
@@ -1266,12 +1794,25 @@ If no agent is connected to the host where the target lives, tell the user you c
}
if guestName != "" {
- prompt += fmt.Sprintf("\n\n## Current Focus\nYou are analyzing **%s** (%s)", guestName, req.TargetType)
+ // Include the node in the focus header so AI can't miss it for routing
+ nodeName := ""
+ if node, ok := req.Context["node"].(string); ok && node != "" {
+ nodeName = node
+ } else if node, ok := req.Context["guest_node"].(string); ok && node != "" {
+ nodeName = node
+ }
+ if nodeName != "" {
+ prompt += fmt.Sprintf("\n\n## Current Focus\nYou are analyzing **%s** (%s on node **%s**)\n**ROUTING: When using run_on_host=true, set target_host=\"%s\"**",
+ guestName, req.TargetType, nodeName, nodeName)
+ } else {
+ prompt += fmt.Sprintf("\n\n## Current Focus\nYou are analyzing **%s** (%s)", guestName, req.TargetType)
+ }
} else if req.TargetID != "" {
prompt += fmt.Sprintf("\n\n## Current Focus\nYou are analyzing %s '%s'", req.TargetType, req.TargetID)
}
}
+
// Add any provided context in a structured way
if len(req.Context) > 0 {
prompt += "\n\n## Current Metrics and State"
@@ -1554,13 +2095,115 @@ func (s *Service) buildInfrastructureContext() string {
}
}
}
+
+ // Add standalone hosts (Linux/Windows servers with host agents)
+ if len(state.Hosts) > 0 {
+ sections = append(sections, "\n### Standalone Hosts")
+ sections = append(sections, "Linux/Windows servers monitored via Pulse host agent. Commands can be run directly on these.")
+ for _, host := range state.Hosts {
+ ips := ""
+ for _, iface := range host.NetworkInterfaces {
+ if len(iface.Addresses) > 0 {
+ ips = " - " + strings.Join(iface.Addresses, ", ")
+ break
+ }
+ }
+ osInfo := ""
+ if host.OSName != "" {
+ osInfo = fmt.Sprintf(" (%s)", host.OSName)
+ }
+ cpuMem := ""
+ if host.CPUCount > 0 {
+ cpuMem = fmt.Sprintf(", %d CPU, %.0f%% mem", host.CPUCount, host.Memory.Usage)
+ }
+ entry := fmt.Sprintf(" - **%s**%s%s%s [%s]", host.Hostname, osInfo, ips, cpuMem, host.Status)
+ sections = append(sections, entry)
+ }
+ }
+
+ // Add Docker hosts and their containers
+ if len(state.DockerHosts) > 0 {
+ sections = append(sections, "\n### Docker Hosts")
+ sections = append(sections, "Hosts running Docker/Podman. Use docker ps, docker exec, docker logs to manage containers.")
+ for _, dh := range state.DockerHosts {
+ if dh.Hidden || dh.PendingUninstall {
+ continue // Skip hidden or pending uninstall hosts
+ }
+ ips := ""
+ for _, iface := range dh.NetworkInterfaces {
+ if len(iface.Addresses) > 0 {
+ ips = " - " + strings.Join(iface.Addresses, ", ")
+ break
+ }
+ }
+ runtime := dh.Runtime
+ if runtime == "" {
+ runtime = "Docker"
+ }
+ containerCount := len(dh.Containers)
+ runningCount := 0
+ for _, c := range dh.Containers {
+ if c.State == "running" {
+ runningCount++
+ }
+ }
+ hostEntry := fmt.Sprintf("\n**%s** (%s, %d/%d containers running)%s [%s]",
+ dh.DisplayName, runtime, runningCount, containerCount, ips, dh.Status)
+ sections = append(sections, hostEntry)
+
+ // List containers on this host
+ for _, c := range dh.Containers {
+ // Build port info
+ portInfo := ""
+ if len(c.Ports) > 0 {
+ var ports []string
+ for _, p := range c.Ports {
+ if p.PublicPort > 0 {
+ ports = append(ports, fmt.Sprintf("%d->%d", p.PublicPort, p.PrivatePort))
+ }
+ }
+ if len(ports) > 0 {
+ portInfo = " ports:" + strings.Join(ports, ",")
+ }
+ }
+ // Truncate image name for brevity
+ image := c.Image
+ if idx := strings.LastIndex(image, "/"); idx > 0 {
+ image = image[idx+1:]
+ }
+ if len(image) > 30 {
+ image = image[:27] + "..."
+ }
+ healthInfo := ""
+ if c.Health != "" && c.Health != "none" {
+ healthInfo = fmt.Sprintf(" (%s)", c.Health)
+ }
+ entry := fmt.Sprintf(" - **%s** [%s]%s - %s%s", c.Name, c.State, healthInfo, image, portInfo)
+ sections = append(sections, entry)
+ }
+ }
+ }
}
if len(sections) == 0 {
return ""
}
- return "\n\n## Infrastructure Map\n" + strings.Join(sections, "\n")
+ result := "\n\n## Infrastructure Map\n" + strings.Join(sections, "\n")
+
+ // Limit context size to prevent overwhelming the AI (max ~50KB of infrastructure context)
+ const maxContextSize = 50000
+ if len(result) > maxContextSize {
+ log.Warn().
+ Int("original_size", len(result)).
+ Int("max_size", maxContextSize).
+ Msg("Infrastructure context truncated - too many resources")
+ result = result[:maxContextSize] + "\n\n[... Infrastructure context truncated due to size ...]"
+ }
+
+ log.Debug().Int("infrastructure_context_size", len(result)).Msg("Built infrastructure context")
+
+ return result
}
// buildUserAnnotationsContext gathers all user annotations from guests and docker containers
@@ -1660,66 +2303,3 @@ func (s *Service) Reload() error {
return s.LoadConfig()
}
-// findClusterPeerAgent looks for an agent on a node that's in the same Proxmox cluster
-// as the target node. This allows running pct/qm commands for guests on other cluster nodes.
-func (s *Service) findClusterPeerAgent(targetNode string, agents []agentexec.ConnectedAgent) string {
- nodesConfig, err := s.persistence.LoadNodesConfig()
- if err != nil || nodesConfig == nil {
- return ""
- }
-
- // Find which cluster the target node belongs to
- var targetCluster string
- var clusterNodes []string
-
- for _, pve := range nodesConfig.PVEInstances {
- if !pve.IsCluster || pve.ClusterName == "" {
- continue
- }
-
- // Check if target node matches this PVE instance or its cluster endpoints
- isInCluster := strings.EqualFold(pve.Name, targetNode)
- if !isInCluster {
- for _, ep := range pve.ClusterEndpoints {
- if strings.EqualFold(ep.NodeName, targetNode) {
- isInCluster = true
- break
- }
- }
- }
-
- if isInCluster {
- targetCluster = pve.ClusterName
- // Collect all nodes in this cluster
- clusterNodes = append(clusterNodes, pve.Name)
- for _, ep := range pve.ClusterEndpoints {
- clusterNodes = append(clusterNodes, ep.NodeName)
- }
- break
- }
- }
-
- if targetCluster == "" {
- return ""
- }
-
- // Look for an agent on any node in the same cluster
- for _, agent := range agents {
- agentHostLower := strings.ToLower(agent.Hostname)
- for _, clusterNode := range clusterNodes {
- if strings.EqualFold(clusterNode, agent.Hostname) ||
- strings.Contains(agentHostLower, strings.ToLower(clusterNode)) ||
- strings.Contains(strings.ToLower(clusterNode), agentHostLower) {
- log.Debug().
- Str("target_node", targetNode).
- Str("cluster", targetCluster).
- Str("peer_agent", agent.Hostname).
- Str("agent_id", agent.AgentID).
- Msg("Found cluster peer agent for cross-node command execution")
- return agent.AgentID
- }
- }
- }
-
- return ""
-}
diff --git a/internal/ai/target_host_test.go b/internal/ai/target_host_test.go
new file mode 100644
index 000000000..cd9ef1112
--- /dev/null
+++ b/internal/ai/target_host_test.go
@@ -0,0 +1,187 @@
+package ai
+
+import (
+ "testing"
+
+ "github.com/rcourtman/pulse-go-rewrite/internal/agentexec"
+)
+
+// TestRouteToAgent_TargetHostExplicit tests that explicit target_host in context
+// takes priority for routing decisions
+func TestRouteToAgent_TargetHostExplicit(t *testing.T) {
+ s := &Service{}
+
+ agents := []agentexec.ConnectedAgent{
+ {AgentID: "agent-1", Hostname: "delly"},
+ {AgentID: "agent-2", Hostname: "minipc"},
+ {AgentID: "agent-3", Hostname: "pimox"},
+ }
+
+ tests := []struct {
+ name string
+ req ExecuteRequest
+ command string
+ wantAgentID string
+ wantHostname string
+ wantMethod string
+ }{
+ {
+ name: "explicit node in context routes correctly",
+ req: ExecuteRequest{
+ TargetType: "host", // run_on_host=true sets this
+ TargetID: "", // run_on_host clears this
+ Context: map[string]interface{}{"node": "minipc"},
+ },
+ command: "pct exec 106 -- hostname",
+ wantAgentID: "agent-2",
+ wantHostname: "minipc",
+ wantMethod: "context_node",
+ },
+ {
+ name: "guest_node also routes correctly for host commands",
+ req: ExecuteRequest{
+ TargetType: "host",
+ TargetID: "",
+ Context: map[string]interface{}{"guest_node": "pimox"},
+ },
+ command: "qm guest exec 100 hostname",
+ wantAgentID: "agent-3",
+ wantHostname: "pimox",
+ wantMethod: "context_guest_node",
+ },
+ {
+ name: "node takes priority over guest_node",
+ req: ExecuteRequest{
+ TargetType: "host",
+ TargetID: "",
+ Context: map[string]interface{}{
+ "node": "delly",
+ "guest_node": "minipc", // Should be ignored when node is set
+ },
+ },
+ command: "uptime",
+ wantAgentID: "agent-1",
+ wantHostname: "delly",
+ wantMethod: "context_node",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ result, err := s.routeToAgent(tt.req, tt.command, agents)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if result.AgentID != tt.wantAgentID {
+ t.Errorf("AgentID = %q, want %q", result.AgentID, tt.wantAgentID)
+ }
+
+ if result.AgentHostname != tt.wantHostname {
+ t.Errorf("AgentHostname = %q, want %q", result.AgentHostname, tt.wantHostname)
+ }
+
+ if result.RoutingMethod != tt.wantMethod {
+ t.Errorf("RoutingMethod = %q, want %q", result.RoutingMethod, tt.wantMethod)
+ }
+ })
+ }
+}
+
+// TestRouteToAgent_SingleAgentFallback tests that with only one agent,
+// we fall back to it with a warning
+func TestRouteToAgent_SingleAgentFallback(t *testing.T) {
+ s := &Service{}
+
+ agents := []agentexec.ConnectedAgent{
+ {AgentID: "agent-1", Hostname: "delly"},
+ }
+
+ req := ExecuteRequest{
+ TargetType: "host",
+ TargetID: "",
+ Context: nil, // No context at all
+ }
+
+ result, err := s.routeToAgent(req, "uptime", agents)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if result.AgentID != "agent-1" {
+ t.Errorf("AgentID = %q, want %q", result.AgentID, "agent-1")
+ }
+
+ if result.RoutingMethod != "single_agent_fallback" {
+ t.Errorf("RoutingMethod = %q, want %q", result.RoutingMethod, "single_agent_fallback")
+ }
+
+ // Should have a warning about the fallback
+ if len(result.Warnings) == 0 {
+ t.Error("expected warning about fallback routing")
+ }
+}
+
+// TestRouteToAgent_MultiAgentNoContext tests that with multiple agents
+// and no context, we get a clear error
+func TestRouteToAgent_MultiAgentNoContext(t *testing.T) {
+ s := &Service{}
+
+ agents := []agentexec.ConnectedAgent{
+ {AgentID: "agent-1", Hostname: "delly"},
+ {AgentID: "agent-2", Hostname: "minipc"},
+ }
+
+ req := ExecuteRequest{
+ TargetType: "host",
+ TargetID: "",
+ Context: nil, // No context
+ }
+
+ _, err := s.routeToAgent(req, "uptime", agents)
+ if err == nil {
+ t.Fatal("expected error when no context with multiple agents")
+ }
+
+ routingErr, ok := err.(*RoutingError)
+ if !ok {
+ t.Fatalf("expected RoutingError, got %T", err)
+ }
+
+ // Should mention target_host in the suggestion
+ if routingErr.Suggestion == "" {
+ t.Error("expected suggestion in error")
+ }
+
+ // Should list available agents
+ if len(routingErr.AvailableAgents) != 2 {
+ t.Errorf("expected 2 available agents, got %d", len(routingErr.AvailableAgents))
+ }
+}
+
+// TestRouteToAgent_VMIDInCommandWithContext tests that explicit node context
+// takes routing priority even when the command itself contains a VMID (pct/qm)
+func TestRouteToAgent_VMIDInCommandWithContext(t *testing.T) {
+ s := &Service{}
+
+ agents := []agentexec.ConnectedAgent{
+ {AgentID: "agent-1", Hostname: "delly"},
+ {AgentID: "agent-2", Hostname: "minipc"},
+ }
+
+ // Even with a VMID in the command, if we have node context, use it
+ req := ExecuteRequest{
+ TargetType: "host",
+ TargetID: "",
+ Context: map[string]interface{}{"node": "minipc"},
+ }
+
+ result, err := s.routeToAgent(req, "pct exec 106 -- hostname", agents)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if result.AgentHostname != "minipc" {
+ t.Errorf("AgentHostname = %q, want %q", result.AgentHostname, "minipc")
+ }
+}
diff --git a/internal/api/ai_handlers.go b/internal/api/ai_handlers.go
index a822b1acd..9c76b9af4 100644
--- a/internal/api/ai_handlers.go
+++ b/internal/api/ai_handlers.go
@@ -3,6 +3,7 @@ package api
import (
"context"
"encoding/json"
+ "io"
"net/http"
"strings"
"time"
@@ -158,10 +159,10 @@ func (h *AISettingsHandler) HandleUpdateAISettings(w http.ResponseWriter, r *htt
if req.Provider != nil {
provider := strings.ToLower(strings.TrimSpace(*req.Provider))
switch provider {
- case config.AIProviderAnthropic, config.AIProviderOpenAI, config.AIProviderOllama:
+ case config.AIProviderAnthropic, config.AIProviderOpenAI, config.AIProviderOllama, config.AIProviderDeepSeek:
settings.Provider = provider
default:
- http.Error(w, "Invalid provider. Must be 'anthropic', 'openai', or 'ollama'", http.StatusBadRequest)
+ http.Error(w, "Invalid provider. Must be 'anthropic', 'openai', 'ollama', or 'deepseek'", http.StatusBadRequest)
return
}
}
@@ -191,7 +192,7 @@ func (h *AISettingsHandler) HandleUpdateAISettings(w http.ResponseWriter, r *htt
// Only allow enabling if properly configured
if *req.Enabled {
switch settings.Provider {
- case config.AIProviderAnthropic, config.AIProviderOpenAI:
+ case config.AIProviderAnthropic, config.AIProviderOpenAI, config.AIProviderDeepSeek:
if settings.APIKey == "" {
http.Error(w, "Cannot enable AI: API key is required for "+settings.Provider, http.StatusBadRequest)
return
@@ -438,7 +439,10 @@ func (h *AISettingsHandler) HandleExecuteStream(w http.ResponseWriter, r *http.R
rc := http.NewResponseController(w)
if err := rc.SetWriteDeadline(time.Time{}); err != nil {
log.Warn().Err(err).Msg("Failed to disable write deadline for SSE")
- // Continue anyway - heartbeats should help keep connection alive
+ }
+ // Also disable read deadline
+ if err := rc.SetReadDeadline(time.Time{}); err != nil {
+ log.Warn().Err(err).Msg("Failed to disable read deadline for SSE")
}
// Flush headers immediately
@@ -453,17 +457,22 @@ func (h *AISettingsHandler) HandleExecuteStream(w http.ResponseWriter, r *http.R
// NOTE: We don't check r.Context().Done() because Vite proxy may close
// the request context prematurely. We detect real disconnection via write failures.
heartbeatDone := make(chan struct{})
+ var clientDisconnected bool
go func() {
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
+ // Extend write deadline before heartbeat
+ _ = rc.SetWriteDeadline(time.Now().Add(10 * time.Second))
// Send SSE comment as heartbeat
_, err := w.Write([]byte(": heartbeat\n\n"))
if err != nil {
- log.Debug().Err(err).Msg("Heartbeat write failed, client disconnected")
- cancel() // Cancel the AI request
+ log.Debug().Err(err).Msg("Heartbeat write failed, stopping heartbeat (AI continues)")
+ clientDisconnected = true
+ // Don't cancel the AI request - let it complete with its own timeout
+ // The SSE connection may have issues but the AI work can still finish
return
}
flusher.Flush()
@@ -475,8 +484,31 @@ func (h *AISettingsHandler) HandleExecuteStream(w http.ResponseWriter, r *http.R
}()
defer close(heartbeatDone)
+ // Helper to safely write SSE events, tracking if client disconnected
+ safeWrite := func(data []byte) bool {
+ if clientDisconnected {
+ return false
+ }
+ _ = rc.SetWriteDeadline(time.Now().Add(10 * time.Second))
+ _, err := w.Write(data)
+ if err != nil {
+ log.Debug().Err(err).Msg("Failed to write SSE event (client may have disconnected)")
+ clientDisconnected = true
+ return false
+ }
+ flusher.Flush()
+ return true
+ }
+
// Stream callback - write SSE events
callback := func(event ai.StreamEvent) {
+ // Skip the 'done' event from service - we'll send our own at the end
+ // This ensures 'complete' comes before 'done'
+ if event.Type == "done" {
+ log.Debug().Msg("Skipping service 'done' event - will send final 'done' after 'complete'")
+ return
+ }
+
data, err := json.Marshal(event)
if err != nil {
log.Error().Err(err).Msg("Failed to marshal stream event")
@@ -488,12 +520,7 @@ func (h *AISettingsHandler) HandleExecuteStream(w http.ResponseWriter, r *http.R
Msg("Streaming AI event")
	// SSE format: data: <json>\n\n
- _, writeErr := w.Write([]byte("data: " + string(data) + "\n\n"))
- if writeErr != nil {
- log.Debug().Err(writeErr).Msg("Failed to write SSE event (client may have disconnected)")
- return
- }
- flusher.Flush()
+ safeWrite([]byte("data: " + string(data) + "\n\n"))
}
// Convert history from API type to service type
@@ -505,6 +532,16 @@ func (h *AISettingsHandler) HandleExecuteStream(w http.ResponseWriter, r *http.R
})
}
+ // Ensure we always send a final 'done' event
+ defer func() {
+ if !clientDisconnected {
+ doneEvent := ai.StreamEvent{Type: "done"}
+ data, _ := json.Marshal(doneEvent)
+ safeWrite([]byte("data: " + string(data) + "\n\n"))
+ log.Debug().Msg("Sent final 'done' event")
+ }
+ }()
+
// Execute with streaming
resp, err := h.aiService.ExecuteStream(ctx, ai.ExecuteRequest{
Prompt: req.Prompt,
@@ -519,8 +556,7 @@ func (h *AISettingsHandler) HandleExecuteStream(w http.ResponseWriter, r *http.R
// Send error event
errEvent := ai.StreamEvent{Type: "error", Data: err.Error()}
data, _ := json.Marshal(errEvent)
- _, _ = w.Write([]byte("data: " + string(data) + "\n\n"))
- flusher.Flush()
+ safeWrite([]byte("data: " + string(data) + "\n\n"))
return
}
@@ -531,7 +567,7 @@ func (h *AISettingsHandler) HandleExecuteStream(w http.ResponseWriter, r *http.R
Int("tool_calls", len(resp.ToolCalls)).
Msg("AI streaming request completed")
- // Send final response with metadata
+ // Send final response with metadata (before 'done')
finalEvent := struct {
Type string `json:"type"`
Model string `json:"model"`
@@ -546,8 +582,8 @@ func (h *AISettingsHandler) HandleExecuteStream(w http.ResponseWriter, r *http.R
ToolCalls: resp.ToolCalls,
}
data, _ := json.Marshal(finalEvent)
- _, _ = w.Write([]byte("data: " + string(data) + "\n\n"))
- flusher.Flush()
+ safeWrite([]byte("data: " + string(data) + "\n\n"))
+ // 'done' event is sent by the defer above
}
// AIRunCommandRequest is the request body for POST /api/ai/run-command
@@ -557,8 +593,10 @@ type AIRunCommandRequest struct {
TargetID string `json:"target_id"`
RunOnHost bool `json:"run_on_host"`
VMID string `json:"vmid,omitempty"`
+ TargetHost string `json:"target_host,omitempty"` // Explicit host for routing
}
+
// HandleRunCommand executes a single approved command (POST /api/ai/run-command)
func (h *AISettingsHandler) HandleRunCommand(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
@@ -573,8 +611,17 @@ func (h *AISettingsHandler) HandleRunCommand(w http.ResponseWriter, r *http.Requ
// Parse request
r.Body = http.MaxBytesReader(w, r.Body, 16*1024)
+ bodyBytes, readErr := io.ReadAll(r.Body)
+ if readErr != nil {
+ log.Error().Err(readErr).Msg("Failed to read request body")
+ http.Error(w, "Invalid request body", http.StatusBadRequest)
+ return
+ }
+ log.Debug().Str("body", string(bodyBytes)).Msg("run-command request body")
+
var req AIRunCommandRequest
- if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+ if err := json.Unmarshal(bodyBytes, &req); err != nil {
+ log.Error().Err(err).Str("body", string(bodyBytes)).Msg("Failed to decode JSON body")
http.Error(w, "Invalid request body", http.StatusBadRequest)
return
}
@@ -589,6 +636,7 @@ func (h *AISettingsHandler) HandleRunCommand(w http.ResponseWriter, r *http.Requ
Str("target_type", req.TargetType).
Str("target_id", req.TargetID).
Bool("run_on_host", req.RunOnHost).
+ Str("target_host", req.TargetHost).
Msg("Executing approved command")
// Execute with timeout
@@ -601,7 +649,9 @@ func (h *AISettingsHandler) HandleRunCommand(w http.ResponseWriter, r *http.Requ
TargetID: req.TargetID,
RunOnHost: req.RunOnHost,
VMID: req.VMID,
+ TargetHost: req.TargetHost,
})
+
if err != nil {
log.Error().Err(err).Msg("Failed to execute command")
http.Error(w, "Failed to execute command: "+err.Error(), http.StatusInternalServerError)
@@ -612,3 +662,515 @@ func (h *AISettingsHandler) HandleRunCommand(w http.ResponseWriter, r *http.Requ
log.Error().Err(err).Msg("Failed to write run command response")
}
}
+
+// HandleGetGuestKnowledge returns all notes for a guest
+func (h *AISettingsHandler) HandleGetGuestKnowledge(w http.ResponseWriter, r *http.Request) {
+ guestID := r.URL.Query().Get("guest_id")
+ if guestID == "" {
+ http.Error(w, "guest_id is required", http.StatusBadRequest)
+ return
+ }
+
+ knowledge, err := h.aiService.GetGuestKnowledge(guestID)
+ if err != nil {
+ http.Error(w, "Failed to get knowledge: "+err.Error(), http.StatusInternalServerError)
+ return
+ }
+
+ if err := utils.WriteJSONResponse(w, knowledge); err != nil {
+ log.Error().Err(err).Msg("Failed to write knowledge response")
+ }
+}
+
+// HandleSaveGuestNote saves a note for a guest
+func (h *AISettingsHandler) HandleSaveGuestNote(w http.ResponseWriter, r *http.Request) {
+ var req struct {
+ GuestID string `json:"guest_id"`
+ GuestName string `json:"guest_name"`
+ GuestType string `json:"guest_type"`
+ Category string `json:"category"`
+ Title string `json:"title"`
+ Content string `json:"content"`
+ }
+
+ if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+ http.Error(w, "Invalid request body", http.StatusBadRequest)
+ return
+ }
+
+ if req.GuestID == "" || req.Category == "" || req.Title == "" || req.Content == "" {
+ http.Error(w, "guest_id, category, title, and content are required", http.StatusBadRequest)
+ return
+ }
+
+ if err := h.aiService.SaveGuestNote(req.GuestID, req.GuestName, req.GuestType, req.Category, req.Title, req.Content); err != nil {
+ http.Error(w, "Failed to save note: "+err.Error(), http.StatusInternalServerError)
+ return
+ }
+
+ w.WriteHeader(http.StatusOK)
+ w.Write([]byte(`{"success": true}`))
+}
+
+// HandleDeleteGuestNote deletes a note from a guest
+func (h *AISettingsHandler) HandleDeleteGuestNote(w http.ResponseWriter, r *http.Request) {
+ var req struct {
+ GuestID string `json:"guest_id"`
+ NoteID string `json:"note_id"`
+ }
+
+ if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+ http.Error(w, "Invalid request body", http.StatusBadRequest)
+ return
+ }
+
+ if req.GuestID == "" || req.NoteID == "" {
+ http.Error(w, "guest_id and note_id are required", http.StatusBadRequest)
+ return
+ }
+
+ if err := h.aiService.DeleteGuestNote(req.GuestID, req.NoteID); err != nil {
+ http.Error(w, "Failed to delete note: "+err.Error(), http.StatusInternalServerError)
+ return
+ }
+
+ w.WriteHeader(http.StatusOK)
+ w.Write([]byte(`{"success": true}`))
+}
+
+// HandleExportGuestKnowledge exports all knowledge for a guest as JSON
+func (h *AISettingsHandler) HandleExportGuestKnowledge(w http.ResponseWriter, r *http.Request) {
+ guestID := r.URL.Query().Get("guest_id")
+ if guestID == "" {
+ http.Error(w, "guest_id is required", http.StatusBadRequest)
+ return
+ }
+
+ knowledge, err := h.aiService.GetGuestKnowledge(guestID)
+ if err != nil {
+ http.Error(w, "Failed to get knowledge: "+err.Error(), http.StatusInternalServerError)
+ return
+ }
+
+ // Set headers for file download
+ w.Header().Set("Content-Type", "application/json")
+ w.Header().Set("Content-Disposition", "attachment; filename=\"pulse-notes-"+guestID+".json\"")
+
+ if err := json.NewEncoder(w).Encode(knowledge); err != nil {
+ log.Error().Err(err).Msg("Failed to encode knowledge export")
+ }
+}
+
+// HandleImportGuestKnowledge imports knowledge from a JSON export
+func (h *AISettingsHandler) HandleImportGuestKnowledge(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodPost {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ // Limit request body size to 1MB
+ r.Body = http.MaxBytesReader(w, r.Body, 1024*1024)
+
+ var importData struct {
+ GuestID string `json:"guest_id"`
+ GuestName string `json:"guest_name"`
+ GuestType string `json:"guest_type"`
+ Notes []struct {
+ Category string `json:"category"`
+ Title string `json:"title"`
+ Content string `json:"content"`
+ } `json:"notes"`
+ Merge bool `json:"merge"` // If true, add to existing notes; if false, replace
+ }
+
+ if err := json.NewDecoder(r.Body).Decode(&importData); err != nil {
+ http.Error(w, "Invalid import data: "+err.Error(), http.StatusBadRequest)
+ return
+ }
+
+ if importData.GuestID == "" {
+ http.Error(w, "guest_id is required in import data", http.StatusBadRequest)
+ return
+ }
+
+ if len(importData.Notes) == 0 {
+ http.Error(w, "No notes to import", http.StatusBadRequest)
+ return
+ }
+
+ // If not merging, we need to delete existing notes first
+ if !importData.Merge {
+ existing, err := h.aiService.GetGuestKnowledge(importData.GuestID)
+ if err == nil && existing != nil {
+ for _, note := range existing.Notes {
+ _ = h.aiService.DeleteGuestNote(importData.GuestID, note.ID)
+ }
+ }
+ }
+
+ // Import each note
+ imported := 0
+ for _, note := range importData.Notes {
+ if note.Category == "" || note.Title == "" || note.Content == "" {
+ continue
+ }
+ if err := h.aiService.SaveGuestNote(
+ importData.GuestID,
+ importData.GuestName,
+ importData.GuestType,
+ note.Category,
+ note.Title,
+ note.Content,
+ ); err != nil {
+ log.Warn().Err(err).Str("title", note.Title).Msg("Failed to import note")
+ continue
+ }
+ imported++
+ }
+
+ w.Header().Set("Content-Type", "application/json")
+ json.NewEncoder(w).Encode(map[string]interface{}{
+ "success": true,
+ "imported": imported,
+ "total": len(importData.Notes),
+ })
+}
+
+// HandleClearGuestKnowledge deletes all notes for a guest
+func (h *AISettingsHandler) HandleClearGuestKnowledge(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodPost {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ var req struct {
+ GuestID string `json:"guest_id"`
+ Confirm bool `json:"confirm"`
+ }
+
+ if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+ http.Error(w, "Invalid request body", http.StatusBadRequest)
+ return
+ }
+
+ if req.GuestID == "" {
+ http.Error(w, "guest_id is required", http.StatusBadRequest)
+ return
+ }
+
+ if !req.Confirm {
+ http.Error(w, "confirm must be true to clear all notes", http.StatusBadRequest)
+ return
+ }
+
+ // Get existing knowledge and delete all notes
+ existing, err := h.aiService.GetGuestKnowledge(req.GuestID)
+ if err != nil {
+ http.Error(w, "Failed to get knowledge: "+err.Error(), http.StatusInternalServerError)
+ return
+ }
+
+ deleted := 0
+ for _, note := range existing.Notes {
+ if err := h.aiService.DeleteGuestNote(req.GuestID, note.ID); err != nil {
+ log.Warn().Err(err).Str("note_id", note.ID).Msg("Failed to delete note")
+ continue
+ }
+ deleted++
+ }
+
+ w.Header().Set("Content-Type", "application/json")
+ json.NewEncoder(w).Encode(map[string]interface{}{
+ "success": true,
+ "deleted": deleted,
+ })
+}
+
+// HandleDebugContext returns the system prompt and context that would be sent to the AI
+// This is useful for debugging when the AI gives incorrect information
+func (h *AISettingsHandler) HandleDebugContext(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ // Build a sample request to see what context would be sent
+ req := ai.ExecuteRequest{
+ Prompt: "Debug context request",
+ TargetType: r.URL.Query().Get("target_type"),
+ TargetID: r.URL.Query().Get("target_id"),
+ }
+
+ // Get the debug context from the service
+ debugInfo := h.aiService.GetDebugContext(req)
+
+ w.Header().Set("Content-Type", "application/json")
+ json.NewEncoder(w).Encode(debugInfo)
+}
+
+// HandleGetConnectedAgents returns the list of agents currently connected via WebSocket
+// This is useful for debugging when AI can't reach certain hosts
+func (h *AISettingsHandler) HandleGetConnectedAgents(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ type agentInfo struct {
+ AgentID string `json:"agent_id"`
+ Hostname string `json:"hostname"`
+ Version string `json:"version"`
+ Platform string `json:"platform"`
+ ConnectedAt string `json:"connected_at"`
+ }
+
+ var agents []agentInfo
+ if h.agentServer != nil {
+ for _, a := range h.agentServer.GetConnectedAgents() {
+ agents = append(agents, agentInfo{
+ AgentID: a.AgentID,
+ Hostname: a.Hostname,
+ Version: a.Version,
+ Platform: a.Platform,
+ ConnectedAt: a.ConnectedAt.Format(time.RFC3339),
+ })
+ }
+ }
+
+ response := map[string]interface{}{
+ "count": len(agents),
+ "agents": agents,
+ "note": "Agents connect via WebSocket to /api/agent/ws. If a host is missing, check that pulse-agent is installed and can reach the Pulse server.",
+ }
+
+ w.Header().Set("Content-Type", "application/json")
+ json.NewEncoder(w).Encode(response)
+}
+
+// AIInvestigateAlertRequest is the request body for POST /api/ai/investigate-alert
+type AIInvestigateAlertRequest struct {
+ AlertID string `json:"alert_id"`
+ ResourceID string `json:"resource_id"`
+ ResourceName string `json:"resource_name"`
+ ResourceType string `json:"resource_type"` // guest, node, storage, docker
+ AlertType string `json:"alert_type"` // cpu, memory, disk, offline, etc.
+ Level string `json:"level"` // warning, critical
+ Value float64 `json:"value"`
+ Threshold float64 `json:"threshold"`
+ Message string `json:"message"`
+ Duration string `json:"duration"` // How long the alert has been active
+ Node string `json:"node,omitempty"`
+ VMID int `json:"vmid,omitempty"`
+}
+
+// HandleInvestigateAlert investigates an alert using AI (POST /api/ai/investigate-alert)
+// This is a dedicated endpoint for one-click alert investigation from the UI
+func (h *AISettingsHandler) HandleInvestigateAlert(w http.ResponseWriter, r *http.Request) {
+ // Handle CORS
+ origin := r.Header.Get("Origin")
+ if origin != "" {
+ w.Header().Set("Access-Control-Allow-Origin", origin)
+ w.Header().Set("Access-Control-Allow-Credentials", "true")
+ w.Header().Set("Access-Control-Allow-Methods", "POST, OPTIONS")
+ w.Header().Set("Access-Control-Allow-Headers", "Content-Type, Accept, Cookie")
+ w.Header().Set("Vary", "Origin")
+ }
+
+ if r.Method == http.MethodOptions {
+ w.WriteHeader(http.StatusOK)
+ return
+ }
+
+ if r.Method != http.MethodPost {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ // Require authentication
+ if !CheckAuth(h.config, w, r) {
+ return
+ }
+
+ // Check if AI is enabled
+ if !h.aiService.IsEnabled() {
+ http.Error(w, "AI is not enabled or configured", http.StatusBadRequest)
+ return
+ }
+
+ // Parse request
+ r.Body = http.MaxBytesReader(w, r.Body, 16*1024)
+ var req AIInvestigateAlertRequest
+ if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+ http.Error(w, "Invalid request body", http.StatusBadRequest)
+ return
+ }
+
+ // Build investigation prompt
+ investigationPrompt := ai.GenerateAlertInvestigationPrompt(ai.AlertInvestigationRequest{
+ AlertID: req.AlertID,
+ ResourceID: req.ResourceID,
+ ResourceName: req.ResourceName,
+ ResourceType: req.ResourceType,
+ AlertType: req.AlertType,
+ Level: req.Level,
+ Value: req.Value,
+ Threshold: req.Threshold,
+ Message: req.Message,
+ Duration: req.Duration,
+ Node: req.Node,
+ VMID: req.VMID,
+ })
+
+ log.Info().
+ Str("alert_id", req.AlertID).
+ Str("resource", req.ResourceName).
+ Str("type", req.AlertType).
+ Msg("AI alert investigation started")
+
+ // Set up SSE streaming
+ w.Header().Set("Content-Type", "text/event-stream")
+ w.Header().Set("Cache-Control", "no-cache")
+ w.Header().Set("Connection", "keep-alive")
+ w.Header().Set("X-Accel-Buffering", "no")
+ w.Header().Set("Transfer-Encoding", "identity")
+
+ flusher, ok := w.(http.Flusher)
+ if !ok {
+ http.Error(w, "Streaming not supported", http.StatusInternalServerError)
+ return
+ }
+
+ // Disable write/read deadlines for SSE
+ rc := http.NewResponseController(w)
+ _ = rc.SetWriteDeadline(time.Time{})
+ _ = rc.SetReadDeadline(time.Time{})
+
+ flusher.Flush()
+
+ // Create context with timeout
+ ctx, cancel := context.WithTimeout(context.Background(), 300*time.Second)
+ defer cancel()
+
+ // Heartbeat routine
+ heartbeatDone := make(chan struct{})
+ var clientDisconnected bool
+ go func() {
+ ticker := time.NewTicker(5 * time.Second)
+ defer ticker.Stop()
+ for {
+ select {
+ case <-ticker.C:
+ _ = rc.SetWriteDeadline(time.Now().Add(10 * time.Second))
+ _, err := w.Write([]byte(": heartbeat\n\n"))
+ if err != nil {
+ clientDisconnected = true
+ return
+ }
+ flusher.Flush()
+ case <-heartbeatDone:
+ return
+ }
+ }
+ }()
+ defer close(heartbeatDone)
+
+ safeWrite := func(data []byte) bool {
+ if clientDisconnected {
+ return false
+ }
+ _ = rc.SetWriteDeadline(time.Now().Add(10 * time.Second))
+ _, err := w.Write(data)
+ if err != nil {
+ clientDisconnected = true
+ return false
+ }
+ flusher.Flush()
+ return true
+ }
+
+ // Determine target type and ID from alert info
+ targetType := req.ResourceType
+ targetID := req.ResourceID
+
+ // Map resource type to expected target type format
+ switch req.ResourceType {
+ case "guest":
+ // Could be VM or container - try to determine from VMID
+ if req.VMID > 0 {
+ targetType = "container" // Default to container, AI will figure it out
+ }
+ case "docker":
+ targetType = "docker_container"
+ }
+
+ // Stream callback
+ callback := func(event ai.StreamEvent) {
+ if event.Type == "done" {
+ return
+ }
+ data, err := json.Marshal(event)
+ if err != nil {
+ return
+ }
+ safeWrite([]byte("data: " + string(data) + "\n\n"))
+ }
+
+ // Execute with streaming
+ defer func() {
+ if !clientDisconnected {
+ doneEvent := ai.StreamEvent{Type: "done"}
+ data, _ := json.Marshal(doneEvent)
+ safeWrite([]byte("data: " + string(data) + "\n\n"))
+ }
+ }()
+
+ resp, err := h.aiService.ExecuteStream(ctx, ai.ExecuteRequest{
+ Prompt: investigationPrompt,
+ TargetType: targetType,
+ TargetID: targetID,
+ Context: map[string]interface{}{
+ "alertId": req.AlertID,
+ "alertType": req.AlertType,
+ "alertLevel": req.Level,
+ "alertMessage": req.Message,
+ "guestName": req.ResourceName,
+ "node": req.Node,
+ },
+ }, callback)
+
+ if err != nil {
+ log.Error().Err(err).Msg("AI alert investigation failed")
+ errEvent := ai.StreamEvent{Type: "error", Data: err.Error()}
+ data, _ := json.Marshal(errEvent)
+ safeWrite([]byte("data: " + string(data) + "\n\n"))
+ return
+ }
+
+ // Send completion event
+ finalEvent := struct {
+ Type string `json:"type"`
+ Model string `json:"model"`
+ InputTokens int `json:"input_tokens"`
+ OutputTokens int `json:"output_tokens"`
+ ToolCalls []ai.ToolExecution `json:"tool_calls,omitempty"`
+ }{
+ Type: "complete",
+ Model: resp.Model,
+ InputTokens: resp.InputTokens,
+ OutputTokens: resp.OutputTokens,
+ ToolCalls: resp.ToolCalls,
+ }
+ data, _ := json.Marshal(finalEvent)
+ safeWrite([]byte("data: " + string(data) + "\n\n"))
+
+ log.Info().
+ Str("alert_id", req.AlertID).
+ Str("model", resp.Model).
+ Int("tool_calls", len(resp.ToolCalls)).
+ Msg("AI alert investigation completed")
+}
+
+// SetAlertProvider sets the alert provider for AI context
+func (h *AISettingsHandler) SetAlertProvider(ap ai.AlertProvider) {
+ h.aiService.SetAlertProvider(ap)
+}
diff --git a/internal/api/router.go b/internal/api/router.go
index b3a582b69..4734d340a 100644
--- a/internal/api/router.go
+++ b/internal/api/router.go
@@ -27,6 +27,7 @@ import (
"github.com/rcourtman/pulse-go-rewrite/internal/agentbinaries"
"github.com/rcourtman/pulse-go-rewrite/internal/agentexec"
+ "github.com/rcourtman/pulse-go-rewrite/internal/ai"
"github.com/rcourtman/pulse-go-rewrite/internal/auth"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
@@ -201,6 +202,7 @@ func (r *Router) setupRoutes() {
r.mux.HandleFunc("/api/storage/", RequireAuth(r.config, RequireScope(config.ScopeMonitoringRead, r.handleStorage)))
r.mux.HandleFunc("/api/storage-charts", RequireAuth(r.config, RequireScope(config.ScopeMonitoringRead, r.handleStorageCharts)))
r.mux.HandleFunc("/api/charts", RequireAuth(r.config, RequireScope(config.ScopeMonitoringRead, r.handleCharts)))
+ r.mux.HandleFunc("/api/metrics-store/stats", RequireAuth(r.config, RequireScope(config.ScopeMonitoringRead, r.handleMetricsStoreStats)))
r.mux.HandleFunc("/api/diagnostics", RequireAuth(r.config, r.handleDiagnostics))
r.mux.HandleFunc("/api/diagnostics/temperature-proxy/register-nodes", RequireAdmin(r.config, RequireScope(config.ScopeSettingsWrite, r.handleDiagnosticsRegisterProxyNodes)))
r.mux.HandleFunc("/api/diagnostics/docker/prepare-token", RequireAdmin(r.config, RequireScope(config.ScopeSettingsWrite, r.handleDiagnosticsDockerPrepareToken)))
@@ -1022,13 +1024,26 @@ func (r *Router) setupRoutes() {
// Inject state provider so AI has access to full infrastructure context (VMs, containers, IPs)
if r.monitor != nil {
r.aiSettingsHandler.SetStateProvider(r.monitor)
+ // Inject alert provider so AI has awareness of current alerts
+ if alertManager := r.monitor.GetAlertManager(); alertManager != nil {
+ r.aiSettingsHandler.SetAlertProvider(ai.NewAlertManagerAdapter(alertManager))
+ }
}
r.mux.HandleFunc("/api/settings/ai", RequireAdmin(r.config, RequireScope(config.ScopeSettingsRead, r.aiSettingsHandler.HandleGetAISettings)))
r.mux.HandleFunc("/api/settings/ai/update", RequireAdmin(r.config, RequireScope(config.ScopeSettingsWrite, r.aiSettingsHandler.HandleUpdateAISettings)))
r.mux.HandleFunc("/api/ai/test", RequireAdmin(r.config, RequireScope(config.ScopeSettingsWrite, r.aiSettingsHandler.HandleTestAIConnection)))
r.mux.HandleFunc("/api/ai/execute", RequireAuth(r.config, r.aiSettingsHandler.HandleExecute))
r.mux.HandleFunc("/api/ai/execute/stream", RequireAuth(r.config, r.aiSettingsHandler.HandleExecuteStream))
+ r.mux.HandleFunc("/api/ai/investigate-alert", RequireAuth(r.config, r.aiSettingsHandler.HandleInvestigateAlert))
r.mux.HandleFunc("/api/ai/run-command", RequireAuth(r.config, r.aiSettingsHandler.HandleRunCommand))
+ r.mux.HandleFunc("/api/ai/knowledge", RequireAuth(r.config, r.aiSettingsHandler.HandleGetGuestKnowledge))
+ r.mux.HandleFunc("/api/ai/knowledge/save", RequireAuth(r.config, r.aiSettingsHandler.HandleSaveGuestNote))
+ r.mux.HandleFunc("/api/ai/knowledge/delete", RequireAuth(r.config, r.aiSettingsHandler.HandleDeleteGuestNote))
+ r.mux.HandleFunc("/api/ai/knowledge/export", RequireAuth(r.config, r.aiSettingsHandler.HandleExportGuestKnowledge))
+ r.mux.HandleFunc("/api/ai/knowledge/import", RequireAuth(r.config, r.aiSettingsHandler.HandleImportGuestKnowledge))
+ r.mux.HandleFunc("/api/ai/knowledge/clear", RequireAuth(r.config, r.aiSettingsHandler.HandleClearGuestKnowledge))
+ r.mux.HandleFunc("/api/ai/debug/context", RequireAdmin(r.config, r.aiSettingsHandler.HandleDebugContext))
+ r.mux.HandleFunc("/api/ai/agents", RequireAuth(r.config, r.aiSettingsHandler.HandleGetConnectedAgents))
// Agent WebSocket for AI command execution
r.mux.HandleFunc("/api/agent/ws", r.handleAgentWebSocket)
@@ -2909,6 +2924,44 @@ func (r *Router) handleStorageCharts(w http.ResponseWriter, req *http.Request) {
}
}
+// handleMetricsStoreStats returns statistics about the persistent metrics store
+func (r *Router) handleMetricsStoreStats(w http.ResponseWriter, req *http.Request) {
+ if req.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ store := r.monitor.GetMetricsStore()
+ if store == nil {
+ w.Header().Set("Content-Type", "application/json")
+ json.NewEncoder(w).Encode(map[string]interface{}{
+ "enabled": false,
+ "error": "Persistent metrics store not initialized",
+ })
+ return
+ }
+
+ stats := store.GetStats()
+ w.Header().Set("Content-Type", "application/json")
+ if err := json.NewEncoder(w).Encode(map[string]interface{}{
+ "enabled": true,
+ "dbPath": stats.DBPath,
+ "dbSize": stats.DBSize,
+ "rawCount": stats.RawCount,
+ "minuteCount": stats.MinuteCount,
+ "hourlyCount": stats.HourlyCount,
+ "dailyCount": stats.DailyCount,
+ "totalWrites": stats.TotalWrites,
+ "bufferSize": stats.BufferSize,
+ "lastFlush": stats.LastFlush,
+ "lastRollup": stats.LastRollup,
+ "lastRetention": stats.LastRetention,
+ }); err != nil {
+ log.Error().Err(err).Msg("Failed to encode metrics store stats")
+ http.Error(w, "Internal server error", http.StatusInternalServerError)
+ }
+}
+
// handleConfig handles configuration requests
func (r *Router) handleConfig(w http.ResponseWriter, req *http.Request) {
if req.Method != http.MethodGet {
diff --git a/internal/config/ai.go b/internal/config/ai.go
index 5450ac885..502ebe1fe 100644
--- a/internal/config/ai.go
+++ b/internal/config/ai.go
@@ -4,7 +4,7 @@ package config
// This is stored in ai.enc (encrypted) in the config directory
type AIConfig struct {
Enabled bool `json:"enabled"`
- Provider string `json:"provider"` // "anthropic", "openai", "ollama"
+ Provider string `json:"provider"` // "anthropic", "openai", "ollama", "deepseek"
APIKey string `json:"api_key"` // encrypted at rest (not needed for ollama)
Model string `json:"model"` // e.g., "claude-opus-4-5-20250514", "gpt-4o", "llama3"
BaseURL string `json:"base_url"` // custom endpoint (required for ollama, optional for openai)
@@ -17,6 +17,7 @@ const (
AIProviderAnthropic = "anthropic"
AIProviderOpenAI = "openai"
AIProviderOllama = "ollama"
+ AIProviderDeepSeek = "deepseek"
)
// Default models per provider
@@ -24,7 +25,9 @@ const (
DefaultAIModelAnthropic = "claude-opus-4-5-20251101"
DefaultAIModelOpenAI = "gpt-4o"
DefaultAIModelOllama = "llama3"
+ DefaultAIModelDeepSeek = "deepseek-reasoner"
DefaultOllamaBaseURL = "http://localhost:11434"
+ DefaultDeepSeekBaseURL = "https://api.deepseek.com/chat/completions"
)
// NewDefaultAIConfig returns an AIConfig with sensible defaults
@@ -43,7 +46,7 @@ func (c *AIConfig) IsConfigured() bool {
}
switch c.Provider {
- case AIProviderAnthropic, AIProviderOpenAI:
+ case AIProviderAnthropic, AIProviderOpenAI, AIProviderDeepSeek:
return c.APIKey != ""
case AIProviderOllama:
// Ollama doesn't need an API key
@@ -58,8 +61,11 @@ func (c *AIConfig) GetBaseURL() string {
if c.BaseURL != "" {
return c.BaseURL
}
- if c.Provider == AIProviderOllama {
+ switch c.Provider {
+ case AIProviderOllama:
return DefaultOllamaBaseURL
+ case AIProviderDeepSeek:
+ return DefaultDeepSeekBaseURL
}
return ""
}
@@ -76,6 +82,8 @@ func (c *AIConfig) GetModel() string {
return DefaultAIModelOpenAI
case AIProviderOllama:
return DefaultAIModelOllama
+ case AIProviderDeepSeek:
+ return DefaultAIModelDeepSeek
default:
return ""
}
diff --git a/internal/config/persistence.go b/internal/config/persistence.go
index df8428f74..1389439b6 100644
--- a/internal/config/persistence.go
+++ b/internal/config/persistence.go
@@ -88,6 +88,11 @@ func newConfigPersistence(configDir string) (*ConfigPersistence, error) {
return cp, nil
}
+// DataDir returns the configuration directory path
+func (c *ConfigPersistence) DataDir() string {
+ return c.configDir
+}
+
// EnsureConfigDir ensures the configuration directory exists
func (c *ConfigPersistence) EnsureConfigDir() error {
return os.MkdirAll(c.configDir, 0700)
diff --git a/internal/metrics/store.go b/internal/metrics/store.go
new file mode 100644
index 000000000..d07a2e47b
--- /dev/null
+++ b/internal/metrics/store.go
@@ -0,0 +1,575 @@
+// Package metrics provides persistent storage for time-series metrics data
+// using SQLite for durability across restarts.
+package metrics
+
+import (
+ "database/sql"
+ "fmt"
+ "os"
+ "path/filepath"
+ "sync"
+ "time"
+
+ "github.com/rs/zerolog/log"
+ _ "modernc.org/sqlite"
+)
+
+// Tier represents the granularity of stored metrics
+type Tier string
+
+const (
+ TierRaw Tier = "raw" // Raw data, ~5s intervals
+ TierMinute Tier = "minute" // 1-minute averages
+ TierHourly Tier = "hourly" // 1-hour averages
+ TierDaily Tier = "daily" // 1-day averages
+)
+
+// MetricPoint represents a single metric data point
+type MetricPoint struct {
+ Timestamp time.Time
+ Value float64
+ Min float64 // For aggregated data
+ Max float64 // For aggregated data
+}
+
+// StoreConfig holds configuration for the metrics store
+type StoreConfig struct {
+ DBPath string
+ WriteBufferSize int // Number of records to buffer before batch write
+ FlushInterval time.Duration // Max time between flushes
+ RetentionRaw time.Duration // How long to keep raw data
+ RetentionMinute time.Duration // How long to keep minute data
+ RetentionHourly time.Duration // How long to keep hourly data
+ RetentionDaily time.Duration // How long to keep daily data
+}
+
+// DefaultConfig returns sensible defaults for metrics storage
+func DefaultConfig(dataDir string) StoreConfig {
+ return StoreConfig{
+ DBPath: filepath.Join(dataDir, "metrics.db"),
+ WriteBufferSize: 100,
+ FlushInterval: 5 * time.Second,
+ RetentionRaw: 2 * time.Hour,
+ RetentionMinute: 24 * time.Hour,
+ RetentionHourly: 7 * 24 * time.Hour,
+ RetentionDaily: 90 * 24 * time.Hour,
+ }
+}
+
+// bufferedMetric holds a metric waiting to be written
+type bufferedMetric struct {
+ resourceType string
+ resourceID string
+ metricType string
+ value float64
+ timestamp time.Time
+}
+
+// Store provides persistent metrics storage
+type Store struct {
+ db *sql.DB
+ config StoreConfig
+
+ // Write buffer
+ bufferMu sync.Mutex
+ buffer []bufferedMetric
+
+ // Background workers
+ stopCh chan struct{}
+ doneCh chan struct{}
+ stopOnce sync.Once
+}
+
+// NewStore creates a new metrics store with the given configuration
+func NewStore(config StoreConfig) (*Store, error) {
+	// Ensure directory exists
+	dir := filepath.Dir(config.DBPath)
+	if err := os.MkdirAll(dir, 0755); err != nil {
+		return nil, fmt.Errorf("failed to create metrics directory: %w", err)
+	}
+
+	// Open with WAL + busy_timeout via modernc.org/sqlite's _pragma DSN form (mattn-style _journal_mode/_busy_timeout params are silently ignored by this driver)
+	db, err := sql.Open("sqlite", config.DBPath+"?_pragma=journal_mode(WAL)&_pragma=busy_timeout(5000)")
+	if err != nil {
+		return nil, fmt.Errorf("failed to open metrics database: %w", err)
+	}
+
+	// Configure connection pool (SQLite works best with single writer)
+	db.SetMaxOpenConns(1)
+	db.SetMaxIdleConns(1)
+	db.SetConnMaxLifetime(0)
+
+	store := &Store{
+		db:     db,
+		config: config,
+		buffer: make([]bufferedMetric, 0, config.WriteBufferSize),
+		stopCh: make(chan struct{}),
+		doneCh: make(chan struct{}),
+	}
+
+	// Initialize schema
+	if err := store.initSchema(); err != nil {
+		db.Close()
+		return nil, fmt.Errorf("failed to initialize schema: %w", err)
+	}
+
+	// Start background workers
+	go store.backgroundWorker()
+
+	log.Info().
+		Str("path", config.DBPath).
+		Int("bufferSize", config.WriteBufferSize).
+		Msg("Metrics store initialized")
+
+	return store, nil
+}
+
+// initSchema creates the database schema if it doesn't exist
+func (s *Store) initSchema() error {
+ schema := `
+ -- Main metrics table
+ CREATE TABLE IF NOT EXISTS metrics (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ resource_type TEXT NOT NULL,
+ resource_id TEXT NOT NULL,
+ metric_type TEXT NOT NULL,
+ value REAL NOT NULL,
+ min_value REAL,
+ max_value REAL,
+ timestamp INTEGER NOT NULL,
+ tier TEXT NOT NULL DEFAULT 'raw'
+ );
+
+ -- Index for efficient queries by resource and time
+ CREATE INDEX IF NOT EXISTS idx_metrics_lookup
+ ON metrics(resource_type, resource_id, metric_type, tier, timestamp);
+
+ -- Index for retention pruning
+ CREATE INDEX IF NOT EXISTS idx_metrics_tier_time
+ ON metrics(tier, timestamp);
+
+ -- Metadata table for tracking rollup state
+ CREATE TABLE IF NOT EXISTS metrics_meta (
+ key TEXT PRIMARY KEY,
+ value TEXT NOT NULL
+ );
+ `
+
+ _, err := s.db.Exec(schema)
+ if err != nil {
+ return fmt.Errorf("failed to create schema: %w", err)
+ }
+
+ log.Debug().Msg("Metrics schema initialized")
+ return nil
+}
+
+// Write adds a metric to the write buffer
+func (s *Store) Write(resourceType, resourceID, metricType string, value float64, timestamp time.Time) {
+ s.bufferMu.Lock()
+ defer s.bufferMu.Unlock()
+
+ s.buffer = append(s.buffer, bufferedMetric{
+ resourceType: resourceType,
+ resourceID: resourceID,
+ metricType: metricType,
+ value: value,
+ timestamp: timestamp,
+ })
+
+ // Flush if buffer is full
+ if len(s.buffer) >= s.config.WriteBufferSize {
+ s.flushLocked()
+ }
+}
+
+// flushLocked writes buffered metrics to the database (caller must hold bufferMu)
+func (s *Store) flushLocked() {
+	if len(s.buffer) == 0 {
+		return
+	}
+
+	// Copy buffer for writing
+	toWrite := make([]bufferedMetric, len(s.buffer))
+	copy(toWrite, s.buffer)
+	s.buffer = s.buffer[:0]
+
+	// Write synchronously: an async goroutine here races db.Close() during shutdown (Close waits only for backgroundWorker), losing the final batch. Batches are small and MaxOpenConns(1) serializes writes anyway.
+	s.writeBatch(toWrite)
+}
+
+// writeBatch writes a batch of metrics to the database
+func (s *Store) writeBatch(metrics []bufferedMetric) {
+ if len(metrics) == 0 {
+ return
+ }
+
+ tx, err := s.db.Begin()
+ if err != nil {
+ log.Error().Err(err).Msg("Failed to begin metrics transaction")
+ return
+ }
+
+ stmt, err := tx.Prepare(`
+ INSERT INTO metrics (resource_type, resource_id, metric_type, value, timestamp, tier)
+ VALUES (?, ?, ?, ?, ?, 'raw')
+ `)
+ if err != nil {
+ tx.Rollback()
+ log.Error().Err(err).Msg("Failed to prepare metrics insert")
+ return
+ }
+ defer stmt.Close()
+
+ for _, m := range metrics {
+ _, err := stmt.Exec(m.resourceType, m.resourceID, m.metricType, m.value, m.timestamp.Unix())
+ if err != nil {
+ log.Warn().Err(err).
+ Str("resource", m.resourceID).
+ Str("metric", m.metricType).
+ Msg("Failed to insert metric")
+ }
+ }
+
+ if err := tx.Commit(); err != nil {
+ log.Error().Err(err).Msg("Failed to commit metrics batch")
+ return
+ }
+
+ log.Debug().Int("count", len(metrics)).Msg("Wrote metrics batch")
+}
+
+// Query retrieves metrics for a resource within a time range
+func (s *Store) Query(resourceType, resourceID, metricType string, start, end time.Time) ([]MetricPoint, error) {
+ // Select appropriate tier based on time range
+ tier := s.selectTier(end.Sub(start))
+
+ rows, err := s.db.Query(`
+ SELECT timestamp, value, COALESCE(min_value, value), COALESCE(max_value, value)
+ FROM metrics
+ WHERE resource_type = ? AND resource_id = ? AND metric_type = ? AND tier = ?
+ AND timestamp >= ? AND timestamp <= ?
+ ORDER BY timestamp ASC
+ `, resourceType, resourceID, metricType, string(tier), start.Unix(), end.Unix())
+ if err != nil {
+ return nil, fmt.Errorf("failed to query metrics: %w", err)
+ }
+ defer rows.Close()
+
+ var points []MetricPoint
+ for rows.Next() {
+ var ts int64
+ var p MetricPoint
+ if err := rows.Scan(&ts, &p.Value, &p.Min, &p.Max); err != nil {
+ log.Warn().Err(err).Msg("Failed to scan metric row")
+ continue
+ }
+ p.Timestamp = time.Unix(ts, 0)
+ points = append(points, p)
+ }
+
+ return points, rows.Err()
+}
+
+// QueryAll retrieves all metric types for a resource within a time range
+func (s *Store) QueryAll(resourceType, resourceID string, start, end time.Time) (map[string][]MetricPoint, error) {
+ tier := s.selectTier(end.Sub(start))
+
+ rows, err := s.db.Query(`
+ SELECT metric_type, timestamp, value, COALESCE(min_value, value), COALESCE(max_value, value)
+ FROM metrics
+ WHERE resource_type = ? AND resource_id = ? AND tier = ?
+ AND timestamp >= ? AND timestamp <= ?
+ ORDER BY metric_type, timestamp ASC
+ `, resourceType, resourceID, string(tier), start.Unix(), end.Unix())
+ if err != nil {
+ return nil, fmt.Errorf("failed to query all metrics: %w", err)
+ }
+ defer rows.Close()
+
+ result := make(map[string][]MetricPoint)
+ for rows.Next() {
+ var metricType string
+ var ts int64
+ var p MetricPoint
+ if err := rows.Scan(&metricType, &ts, &p.Value, &p.Min, &p.Max); err != nil {
+ log.Warn().Err(err).Msg("Failed to scan metric row")
+ continue
+ }
+ p.Timestamp = time.Unix(ts, 0)
+ result[metricType] = append(result[metricType], p)
+ }
+
+ return result, rows.Err()
+}
+
+// selectTier chooses the appropriate data tier based on time range
+func (s *Store) selectTier(duration time.Duration) Tier {
+ switch {
+ case duration <= s.config.RetentionRaw:
+ return TierRaw
+ case duration <= s.config.RetentionMinute:
+ return TierMinute
+ case duration <= s.config.RetentionHourly:
+ return TierHourly
+ default:
+ return TierDaily
+ }
+}
+
+// backgroundWorker runs periodic tasks
+func (s *Store) backgroundWorker() {
+ defer close(s.doneCh)
+
+ flushTicker := time.NewTicker(s.config.FlushInterval)
+ rollupTicker := time.NewTicker(5 * time.Minute)
+ retentionTicker := time.NewTicker(1 * time.Hour)
+
+ defer flushTicker.Stop()
+ defer rollupTicker.Stop()
+ defer retentionTicker.Stop()
+
+ for {
+ select {
+ case <-s.stopCh:
+ // Final flush before stopping
+ s.Flush()
+ return
+
+ case <-flushTicker.C:
+ s.Flush()
+
+ case <-rollupTicker.C:
+ s.runRollup()
+
+ case <-retentionTicker.C:
+ s.runRetention()
+ }
+ }
+}
+
+// Flush writes any buffered metrics to the database
+func (s *Store) Flush() {
+ s.bufferMu.Lock()
+ defer s.bufferMu.Unlock()
+ s.flushLocked()
+}
+
+// runRollup aggregates raw data into higher tiers
+func (s *Store) runRollup() {
+ start := time.Now()
+
+ // Rollup raw -> minute (for data older than 5 minutes)
+ s.rollupTier(TierRaw, TierMinute, time.Minute, 5*time.Minute)
+
+ // Rollup minute -> hourly (for data older than 1 hour)
+ s.rollupTier(TierMinute, TierHourly, time.Hour, time.Hour)
+
+ // Rollup hourly -> daily (for data older than 24 hours)
+ s.rollupTier(TierHourly, TierDaily, 24*time.Hour, 24*time.Hour)
+
+ log.Debug().Dur("duration", time.Since(start)).Msg("Metrics rollup completed")
+}
+
+// rollupTier aggregates data from one tier to another
+func (s *Store) rollupTier(fromTier, toTier Tier, bucketSize, minAge time.Duration) {
+ cutoff := time.Now().Add(-minAge).Unix()
+ bucketSecs := int64(bucketSize.Seconds())
+
+ // Find distinct resource/metric combinations that need rollup
+ rows, err := s.db.Query(`
+ SELECT DISTINCT resource_type, resource_id, metric_type
+ FROM metrics
+ WHERE tier = ? AND timestamp < ?
+ `, string(fromTier), cutoff)
+ if err != nil {
+ log.Error().Err(err).Str("tier", string(fromTier)).Msg("Failed to find rollup candidates")
+ return
+ }
+
+ var candidates []struct {
+ resourceType string
+ resourceID string
+ metricType string
+ }
+
+ for rows.Next() {
+ var c struct {
+ resourceType string
+ resourceID string
+ metricType string
+ }
+ if err := rows.Scan(&c.resourceType, &c.resourceID, &c.metricType); err == nil {
+ candidates = append(candidates, c)
+ }
+ }
+ rows.Close()
+
+ if len(candidates) == 0 {
+ return
+ }
+
+ // Process each candidate
+ for _, c := range candidates {
+ s.rollupCandidate(c.resourceType, c.resourceID, c.metricType, fromTier, toTier, bucketSecs, cutoff)
+ }
+}
+
+// rollupCandidate aggregates a single resource/metric from one tier to another
+func (s *Store) rollupCandidate(resourceType, resourceID, metricType string, fromTier, toTier Tier, bucketSecs, cutoff int64) {
+	tx, err := s.db.Begin()
+	if err != nil {
+		return
+	}
+	defer tx.Rollback()
+
+	// Aggregate data into buckets
+	_, err = tx.Exec(`
+		INSERT INTO metrics (resource_type, resource_id, metric_type, value, min_value, max_value, timestamp, tier)
+		SELECT
+			resource_type,
+			resource_id,
+			metric_type,
+			AVG(value) as value,
+			MIN(value) as min_value,
+			MAX(value) as max_value,
+			(timestamp / ?) * ? as bucket_ts,
+			?
+		FROM metrics
+		WHERE resource_type = ? AND resource_id = ? AND metric_type = ?
+		  AND tier = ? AND timestamp < ?
+		GROUP BY resource_type, resource_id, metric_type, bucket_ts
+	`, bucketSecs, bucketSecs, string(toTier), resourceType, resourceID, metricType, string(fromTier), cutoff)
+
+	if err != nil {
+		log.Warn().Err(err).Str("resource", resourceID).
+			Str("from", string(fromTier)).Str("to", string(toTier)).Msg("Failed to rollup metrics")
+		return
+	}
+
+	// Delete rolled-up raw data
+	_, err = tx.Exec(`
+		DELETE FROM metrics
+		WHERE resource_type = ? AND resource_id = ? AND metric_type = ?
+		  AND tier = ? AND timestamp < ?
+	`, resourceType, resourceID, metricType, string(fromTier), cutoff)
+
+	if err != nil {
+		log.Warn().Err(err).Msg("Failed to delete rolled-up metrics")
+		return
+	}
+
+	// Commit atomically: either both the inserts and the deletes land, or neither
+	if err := tx.Commit(); err != nil {
+		log.Warn().Err(err).Msg("Failed to commit metrics rollup")
+	}
+}
+
+// runRetention deletes data older than retention period
+func (s *Store) runRetention() {
+ start := time.Now()
+ now := time.Now()
+
+ // Delete old data for each tier
+ tiers := []struct {
+ tier Tier
+ retention time.Duration
+ }{
+ {TierRaw, s.config.RetentionRaw},
+ {TierMinute, s.config.RetentionMinute},
+ {TierHourly, s.config.RetentionHourly},
+ {TierDaily, s.config.RetentionDaily},
+ }
+
+ var totalDeleted int64
+ for _, t := range tiers {
+ cutoff := now.Add(-t.retention).Unix()
+ result, err := s.db.Exec(`DELETE FROM metrics WHERE tier = ? AND timestamp < ?`, string(t.tier), cutoff)
+ if err != nil {
+ log.Warn().Err(err).Str("tier", string(t.tier)).Msg("Failed to prune metrics")
+ continue
+ }
+ if affected, _ := result.RowsAffected(); affected > 0 {
+ totalDeleted += affected
+ }
+ }
+
+ if totalDeleted > 0 {
+ log.Info().
+ Int64("deleted", totalDeleted).
+ Dur("duration", time.Since(start)).
+ Msg("Metrics retention cleanup completed")
+ }
+}
+
+// Close shuts down the store gracefully
+func (s *Store) Close() error {
+ s.stopOnce.Do(func() {
+ close(s.stopCh)
+ })
+
+ // Wait for background worker to finish
+ select {
+ case <-s.doneCh:
+ case <-time.After(5 * time.Second):
+ log.Warn().Msg("Metrics store shutdown timed out")
+ }
+
+ return s.db.Close()
+}
+
+// Stats holds metrics store statistics
+type Stats struct {
+ DBPath string `json:"dbPath"`
+ DBSize int64 `json:"dbSize"`
+ RawCount int64 `json:"rawCount"`
+ MinuteCount int64 `json:"minuteCount"`
+ HourlyCount int64 `json:"hourlyCount"`
+ DailyCount int64 `json:"dailyCount"`
+ TotalWrites int64 `json:"totalWrites"`
+ BufferSize int `json:"bufferSize"`
+ LastFlush time.Time `json:"lastFlush"`
+ LastRollup time.Time `json:"lastRollup"`
+ LastRetention time.Time `json:"lastRetention"`
+}
+
+// GetStats returns storage statistics
+func (s *Store) GetStats() Stats {
+ stats := Stats{
+ DBPath: s.config.DBPath,
+ }
+
+ // Count by tier
+ rows, err := s.db.Query(`SELECT tier, COUNT(*) FROM metrics GROUP BY tier`)
+ if err == nil {
+ defer rows.Close()
+ for rows.Next() {
+ var tier string
+ var count int64
+ if err := rows.Scan(&tier, &count); err == nil {
+ switch tier {
+ case "raw":
+ stats.RawCount = count
+ case "minute":
+ stats.MinuteCount = count
+ case "hourly":
+ stats.HourlyCount = count
+ case "daily":
+ stats.DailyCount = count
+ }
+ }
+ }
+ }
+
+ // Get database size
+ if fi, err := os.Stat(s.config.DBPath); err == nil {
+ stats.DBSize = fi.Size()
+ }
+
+ // Get buffer size
+ s.bufferMu.Lock()
+ stats.BufferSize = len(s.buffer)
+ s.bufferMu.Unlock()
+
+ return stats
+}
diff --git a/internal/monitoring/monitor.go b/internal/monitoring/monitor.go
index 1f7ef486d..dacada59a 100644
--- a/internal/monitoring/monitor.go
+++ b/internal/monitoring/monitor.go
@@ -24,6 +24,7 @@ import (
"github.com/rcourtman/pulse-go-rewrite/internal/discovery"
"github.com/rcourtman/pulse-go-rewrite/internal/errors"
"github.com/rcourtman/pulse-go-rewrite/internal/logging"
+ "github.com/rcourtman/pulse-go-rewrite/internal/metrics"
"github.com/rcourtman/pulse-go-rewrite/internal/mock"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/notifications"
@@ -553,6 +554,7 @@ type Monitor struct {
startTime time.Time
rateTracker *RateTracker
metricsHistory *MetricsHistory
+ metricsStore *metrics.Store // Persistent SQLite metrics storage
alertManager *alerts.Manager
notificationMgr *notifications.NotificationManager
configPersist *config.ConfigPersistence
@@ -2554,7 +2556,7 @@ func checkContainerizedTempMonitoring() {
// Log warning
log.Warn().
- Msg("🔐 SECURITY NOTICE: Pulse is running in a container with SSH-based temperature monitoring enabled. " +
+ Msg("SECURITY NOTICE: Pulse is running in a container with SSH-based temperature monitoring enabled. " +
"SSH private keys are stored inside the container, which could be a security risk if the container is compromised. " +
"Future versions will use agent-based architecture for better security. " +
"See documentation for hardening recommendations.")
@@ -2638,6 +2640,17 @@ func New(cfg *config.Config) (*Monitor, error) {
guestAgentVersionTimeout := parseDurationEnv("GUEST_AGENT_VERSION_TIMEOUT", defaultGuestAgentVersionTimeout)
guestAgentRetries := parseIntEnv("GUEST_AGENT_RETRIES", defaultGuestAgentRetries)
+ // Initialize persistent metrics store (SQLite)
+ var metricsStore *metrics.Store
+ metricsStoreConfig := metrics.DefaultConfig(cfg.DataPath)
+ ms, err := metrics.NewStore(metricsStoreConfig)
+ if err != nil {
+ log.Error().Err(err).Msg("Failed to initialize persistent metrics store - continuing with in-memory only")
+ } else {
+ metricsStore = ms
+ log.Info().Str("path", metricsStoreConfig.DBPath).Msg("Persistent metrics store initialized")
+ }
+
m := &Monitor{
config: cfg,
state: models.NewState(),
@@ -2662,6 +2675,7 @@ func New(cfg *config.Config) (*Monitor, error) {
startTime: time.Now(),
rateTracker: NewRateTracker(),
metricsHistory: NewMetricsHistory(1000, 24*time.Hour), // Keep up to 1000 points or 24 hours
+ metricsStore: metricsStore, // Persistent SQLite storage
alertManager: alerts.NewManager(),
notificationMgr: notifications.NewNotificationManager(cfg.PublicURL),
configPersist: config.NewConfigPersistence(cfg.DataPath),
@@ -4880,6 +4894,12 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "cpu", modelNodes[i].CPU*100, now)
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "memory", modelNodes[i].Memory.Usage, now)
m.metricsHistory.AddNodeMetric(modelNodes[i].ID, "disk", modelNodes[i].Disk.Usage, now)
+ // Also write to persistent store
+ if m.metricsStore != nil {
+ m.metricsStore.Write("node", modelNodes[i].ID, "cpu", modelNodes[i].CPU*100, now)
+ m.metricsStore.Write("node", modelNodes[i].ID, "memory", modelNodes[i].Memory.Usage, now)
+ m.metricsStore.Write("node", modelNodes[i].ID, "disk", modelNodes[i].Disk.Usage, now)
+ }
}
// Check thresholds for alerts
@@ -5933,6 +5953,43 @@ func (m *Monitor) pollVMsAndContainersEfficient(ctx context.Context, instanceNam
m.state.UpdateVMsForInstance(instanceName, allVMs)
m.state.UpdateContainersForInstance(instanceName, allContainers)
+ // Record guest metrics history for running guests (enables sparkline/trends view)
+ now := time.Now()
+ for _, vm := range allVMs {
+ if vm.Status == "running" {
+ m.metricsHistory.AddGuestMetric(vm.ID, "cpu", vm.CPU*100, now)
+ m.metricsHistory.AddGuestMetric(vm.ID, "memory", vm.Memory.Usage, now)
+ if vm.Disk.Usage >= 0 {
+ m.metricsHistory.AddGuestMetric(vm.ID, "disk", vm.Disk.Usage, now)
+ }
+ // Also write to persistent store
+ if m.metricsStore != nil {
+ m.metricsStore.Write("vm", vm.ID, "cpu", vm.CPU*100, now)
+ m.metricsStore.Write("vm", vm.ID, "memory", vm.Memory.Usage, now)
+ if vm.Disk.Usage >= 0 {
+ m.metricsStore.Write("vm", vm.ID, "disk", vm.Disk.Usage, now)
+ }
+ }
+ }
+ }
+ for _, ct := range allContainers {
+ if ct.Status == "running" {
+ m.metricsHistory.AddGuestMetric(ct.ID, "cpu", ct.CPU*100, now)
+ m.metricsHistory.AddGuestMetric(ct.ID, "memory", ct.Memory.Usage, now)
+ if ct.Disk.Usage >= 0 {
+ m.metricsHistory.AddGuestMetric(ct.ID, "disk", ct.Disk.Usage, now)
+ }
+ // Also write to persistent store
+ if m.metricsStore != nil {
+ m.metricsStore.Write("container", ct.ID, "cpu", ct.CPU*100, now)
+ m.metricsStore.Write("container", ct.ID, "memory", ct.Memory.Usage, now)
+ if ct.Disk.Usage >= 0 {
+ m.metricsStore.Write("container", ct.ID, "disk", ct.Disk.Usage, now)
+ }
+ }
+ }
+ }
+
m.pollReplicationStatus(ctx, instanceName, client, allVMs)
log.Info().
@@ -6943,6 +7000,11 @@ func (m *Monitor) GetConfigPersistence() *config.ConfigPersistence {
return m.configPersist
}
+// GetMetricsStore returns the persistent metrics store
+func (m *Monitor) GetMetricsStore() *metrics.Store {
+ return m.metricsStore
+}
+
// pollStorageBackupsWithNodes polls backups using a provided nodes list to avoid duplicate GetNodes calls
func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName string, client PVEClientInterface, nodes []proxmox.Node, nodeEffectiveStatus map[string]string) {
@@ -7781,6 +7843,15 @@ func (m *Monitor) Stop() {
m.notificationMgr.Stop()
}
+ // Close persistent metrics store (flushes buffered data)
+ if m.metricsStore != nil {
+ if err := m.metricsStore.Close(); err != nil {
+ log.Error().Err(err).Msg("Failed to close metrics store")
+ } else {
+ log.Info().Msg("Metrics store closed successfully")
+ }
+ }
+
log.Info().Msg("Monitor stopped")
}
diff --git a/internal/monitoring/monitor_polling.go b/internal/monitoring/monitor_polling.go
index 928611ec0..965582974 100644
--- a/internal/monitoring/monitor_polling.go
+++ b/internal/monitoring/monitor_polling.go
@@ -842,6 +842,26 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, cli
// Update state with all VMs
m.state.UpdateVMsForInstance(instanceName, allVMs)
+ // Record guest metrics history for running VMs (enables sparkline/trends view)
+ now := time.Now()
+ for _, vm := range allVMs {
+ if vm.Status == "running" {
+ m.metricsHistory.AddGuestMetric(vm.ID, "cpu", vm.CPU*100, now)
+ m.metricsHistory.AddGuestMetric(vm.ID, "memory", vm.Memory.Usage, now)
+ if vm.Disk.Usage >= 0 {
+ m.metricsHistory.AddGuestMetric(vm.ID, "disk", vm.Disk.Usage, now)
+ }
+ // Also write to persistent store
+ if m.metricsStore != nil {
+ m.metricsStore.Write("vm", vm.ID, "cpu", vm.CPU*100, now)
+ m.metricsStore.Write("vm", vm.ID, "memory", vm.Memory.Usage, now)
+ if vm.Disk.Usage >= 0 {
+ m.metricsStore.Write("vm", vm.ID, "disk", vm.Disk.Usage, now)
+ }
+ }
+ }
+ }
+
duration := time.Since(startTime)
log.Info().
Str("instance", instanceName).
@@ -1109,6 +1129,26 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri
// Update state with all containers
m.state.UpdateContainersForInstance(instanceName, allContainers)
+ // Record guest metrics history for running containers (enables sparkline/trends view)
+ now := time.Now()
+ for _, ct := range allContainers {
+ if ct.Status == "running" {
+ m.metricsHistory.AddGuestMetric(ct.ID, "cpu", ct.CPU*100, now)
+ m.metricsHistory.AddGuestMetric(ct.ID, "memory", ct.Memory.Usage, now)
+ if ct.Disk.Usage >= 0 {
+ m.metricsHistory.AddGuestMetric(ct.ID, "disk", ct.Disk.Usage, now)
+ }
+ // Also write to persistent store
+ if m.metricsStore != nil {
+ m.metricsStore.Write("container", ct.ID, "cpu", ct.CPU*100, now)
+ m.metricsStore.Write("container", ct.ID, "memory", ct.Memory.Usage, now)
+ if ct.Disk.Usage >= 0 {
+ m.metricsStore.Write("container", ct.ID, "disk", ct.Disk.Usage, now)
+ }
+ }
+ }
+ }
+
duration := time.Since(startTime)
log.Info().
Str("instance", instanceName).
diff --git a/internal/utils/helpers.go b/internal/utils/helpers.go
index 823a2796c..1eb602fb5 100644
--- a/internal/utils/helpers.go
+++ b/internal/utils/helpers.go
@@ -42,11 +42,22 @@ func GetenvTrim(key string) string {
return strings.TrimSpace(os.Getenv(key))
}
-// NormalizeVersion strips the "v" prefix from version strings for comparison.
-// This normalizes versions like "v4.33.1" to "4.33.1" so that version strings
-// from different sources (agent vs server) can be compared consistently.
+// NormalizeVersion normalizes version strings for comparison by:
+// 1. Stripping whitespace
+// 2. Removing the "v" prefix (e.g., "v4.33.1" -> "4.33.1")
+// 3. Stripping build metadata after "+" (e.g., "4.36.2+git.14.dirty" -> "4.36.2")
+//
+// Per semver spec, build metadata MUST be ignored when determining version precedence.
+// This fixes issues where dirty builds like "4.36.2+git.14.g469307d6.dirty" would
+// incorrectly be treated as newer than "4.36.2", causing infinite update loops.
func NormalizeVersion(version string) string {
- return strings.TrimPrefix(strings.TrimSpace(version), "v")
+ v := strings.TrimPrefix(strings.TrimSpace(version), "v")
+ // Strip build metadata (everything after +)
+ // Per semver: build metadata MUST be ignored when determining version precedence
+ if idx := strings.Index(v, "+"); idx != -1 {
+ v = v[:idx]
+ }
+ return v
}
// CompareVersions compares two semver-like version strings.
diff --git a/internal/utils/utils_test.go b/internal/utils/utils_test.go
index 123e32418..c2110ad74 100644
--- a/internal/utils/utils_test.go
+++ b/internal/utils/utils_test.go
@@ -307,6 +307,12 @@ func TestNormalizeVersion(t *testing.T) {
{"v", ""},
{" ", ""},
{"vv4.33.1", "v4.33.1"}, // Only removes one v
+
+ // Build metadata (semver +suffix should be stripped)
+ {"4.36.2+git.14.g469307d6.dirty", "4.36.2"},
+ {"v4.36.2+build123", "4.36.2"},
+ {"1.0.0+20231215", "1.0.0"},
+ {"v1.0.0-rc1+build.456", "1.0.0-rc1"},
}
for _, tc := range tests {
@@ -353,6 +359,14 @@ func TestCompareVersions(t *testing.T) {
{"0.0.1", "0.0.0", 1},
{"0.0.0", "0.0.1", -1},
{"1.0", "0.9.9", 1},
+
+ // Build metadata should be ignored (semver +suffix)
+ // This is the critical fix for the infinite agent update loop bug
+ {"4.36.2+git.14.g469307d6.dirty", "4.36.2", 0}, // Dirty == clean
+ {"4.36.2", "4.36.2+git.14.g469307d6.dirty", 0}, // Clean == dirty
+ {"v4.36.2+build123", "v4.36.2", 0}, // With v prefix
+ {"4.36.3", "4.36.2+git.14.g469307d6.dirty", 1}, // Newer beats dirty
+ {"4.36.2+git.14.g469307d6.dirty", "4.36.3", -1}, // Dirty older than newer
}
for _, tc := range tests {
|