From 85ffe10aed9388cf7a0573405751e99ebc660d1a Mon Sep 17 00:00:00 2001 From: rcourtman Date: Tue, 21 Oct 2025 10:40:33 +0000 Subject: [PATCH] docs: add Mermaid diagrams to improve visual documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhance documentation with six Mermaid diagrams to better explain complex system implementations: - Adaptive polling lifecycle flowchart showing enqueue→execute→feedback cycle with scheduler, priority queue, and worker interactions - Circuit breaker state machine diagram illustrating Closed↔Open↔Half-open transitions with triggers and recovery paths - Temperature proxy architecture diagram highlighting trust boundaries, security controls, and data flow between host/container/cluster - Sensor proxy request flow sequence diagram showing auth, rate limiting, validation, and SSH execution pipeline - Alert webhook pipeline flowchart detailing template resolution, URL rendering, HTTP dispatch, and retry logic - Script library workflow diagram illustrating dev→test→bundle→distribute lifecycle emphasizing modular design These visualizations make it easier for operators and contributors to understand Pulse's sophisticated architectural patterns. --- docs/TEMPERATURE_MONITORING_SECURITY.md | 54 +-- docs/WEBHOOKS.md | 28 ++ docs/monitoring/ADAPTIVE_POLLING.md | 58 ++- docs/operations/pulse-sensor-proxy-runbook.md | 35 ++ docs/script-library-guide.md | 32 ++ frontend-modern/src/App.tsx | 17 + frontend-modern/src/api/alerts.ts | 6 + .../components/Alerts/ActivationBanner.tsx | 117 ++++++ .../src/components/Alerts/ActivationModal.tsx | 354 ++++++++++++++++++ .../src/stores/alertsActivation.ts | 96 +++++ frontend-modern/src/types/alerts.ts | 5 + internal/alerts/alerts.go | 66 +++- internal/api/alerts.go | 46 +++ 13 files changed, 875 insertions(+), 39 deletions(-) create mode 100644 frontend-modern/src/components/Alerts/ActivationBanner.tsx create mode 100644 frontend-modern/src/components/Alerts/ActivationModal.tsx create mode 100644 frontend-modern/src/stores/alertsActivation.ts diff --git a/docs/TEMPERATURE_MONITORING_SECURITY.md b/docs/TEMPERATURE_MONITORING_SECURITY.md index 962eab512..dffbabf00 100644 --- a/docs/TEMPERATURE_MONITORING_SECURITY.md +++ b/docs/TEMPERATURE_MONITORING_SECURITY.md @@ -17,27 +17,39 @@ This document describes the security architecture of Pulse's temperature monitor ## Architecture Overview -``` -┌─────────────────────────────────────────┐ -│ Proxmox Host (delly) │ -│ │ -│ ┌──────────────────────────────────┐ │ -│ │ pulse-sensor-proxy (UID 999) │ │ -│ │ - SSH keys (host-only) │ │ -│ │ - Unix socket exposed │ │ -│ │ - Method-level authorization │ │ -│ │ - Rate limiting enforced │ │ -│ └──────────────────────────────────┘ │ -│ │ │ -│ │ Unix Socket (read-only) │ -│ ↓ │ -│ ┌──────────────────────────────────┐ │ -│ │ LXC Container (ID-mapped) │ │ -│ │ - No SSH keys │ │ -│ │ - Socket at /mnt/pulse-proxy │ │ -│ │ - Can't call privileged RPCs │ │ -│ └──────────────────────────────────┘ │ -└─────────────────────────────────────────┘ +```mermaid +graph TD + subgraph Host["Proxmox Host (delly)\nTrust Boundary"] + Proxy["pulse-sensor-proxy service\nUID 999\nSO_PEERCRED auth\nMethod ACL + per-UID rate limit\nPer-node concurrency = 1"] + Socket["Unix socket\n/run/pulse-sensor-proxy.sock\n(0600 bind mount)"] + Audit["Audit & Metrics\n/var/log/pulse/... 
& :9127/metrics"] + PrivOps["Privileged RPCs\nensure_cluster_keys | register_nodes | request_cleanup\nHost UID only"] + end + + subgraph Container["Pulse Container (ID-mapped root)"] + Backend["Pulse Backend"] + Poller["Temperature Poller worker"] + end + + subgraph Cluster["Cluster Nodes"] + SensorCmd["Forced SSH command\n`sensors -j` only\nRestricted authorized_keys entry"] + end + + Poller -->|poll request| Backend + Backend -->|RPC via bind-mounted socket| Socket + Socket --> Proxy + Proxy -->|temperature JSON response| Backend + Proxy -->|rate-limit reject + 2 s penalty| Reject["429 response"] + Reject --> Backend + + Proxy -->|SSH (ed25519 key)\nforced command| SensorCmd + SensorCmd -->|temperature JSON| Proxy + + Proxy -->|audit entry + metrics| Audit + Audit -->|Prometheus scrape| Metrics["Telemetry Consumers\n(Grafana, watchdog)"] + + PrivOps --> Proxy + Backend -. blocked (ID-mapped root) .-> PrivOps ``` **Key Principle**: SSH keys never enter containers. All SSH operations are performed by the host-side proxy. diff --git a/docs/WEBHOOKS.md b/docs/WEBHOOKS.md index 3a6420bbf..54144013e 100644 --- a/docs/WEBHOOKS.md +++ b/docs/WEBHOOKS.md @@ -113,6 +113,34 @@ For webhooks that require authentication or custom headers: ## Custom Payload Templates +```mermaid +flowchart TD + AlertEvent["Alert Event Triggered"] + GatherData["Gather Alert Data\n(Level, Type, Resource, Node, etc.)"] + ResolveURL["Resolve URL Template\n({{urlpath}}, {{urlquery}}"] + ResolvePayload["Resolve Payload Template\n(variable substitution)"] + ApplyFunctions["Apply Template Functions\n(title, upper, lower, printf)"] + Dispatch["HTTP POST Request"] + CheckResponse{"Response\nStatus?"} + Success["200-299: Success\nLog delivery"] + Retry["429/5xx: Retry\n(exponential backoff)"] + Failure["4xx: Failure\nLog error"] + TrackDelivery["Update Delivery Metrics\npulse_webhook_deliveries_total"] + + AlertEvent --> GatherData + GatherData --> ResolveURL + ResolveURL --> ResolvePayload + ResolvePayload --> ApplyFunctions + ApplyFunctions --> Dispatch + Dispatch --> CheckResponse + CheckResponse -->|Success| Success + CheckResponse -->|Transient Error| Retry + CheckResponse -->|Permanent Error| Failure + Success --> TrackDelivery + Retry --> TrackDelivery + Failure --> TrackDelivery +``` + For generic webhooks, you can define custom JSON payloads using Go template syntax. ### Available Variables diff --git a/docs/monitoring/ADAPTIVE_POLLING.md b/docs/monitoring/ADAPTIVE_POLLING.md index bced23e30..940d1e54e 100644 --- a/docs/monitoring/ADAPTIVE_POLLING.md +++ b/docs/monitoring/ADAPTIVE_POLLING.md @@ -3,12 +3,46 @@ ## Overview Phase 2 introduces a scheduler that adapts poll cadence based on freshness, errors, and workload. The goal is to prioritize stale or changing instances while backing off on healthy, idle targets. 
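+
+To make the cadence adaptation concrete, the sketch below shows one way such an interval could be computed. It is illustrative only: the identifiers and constants are not Pulse's actual implementation, apart from the 5-minute backoff cap described later in this document.
+
+```go
+package main
+
+import (
+	"fmt"
+	"time"
+)
+
+// nextInterval scales a base polling interval: stale or changing targets are
+// polled sooner, healthy idle targets are backed off, and consecutive
+// failures add exponential delay capped at five minutes.
+func nextInterval(base time.Duration, staleness float64, failures int) time.Duration {
+	interval := base
+	switch {
+	case staleness > 0.8: // data is overdue; poll sooner
+		interval = base / 2
+	case staleness < 0.2: // fresh and idle; back off
+		interval = base * 2
+	}
+	for i := 0; i < failures; i++ {
+		interval *= 2
+	}
+	if interval > 5*time.Minute {
+		interval = 5 * time.Minute
+	}
+	return interval
+}
+
+func main() {
+	fmt.Println(nextInterval(30*time.Second, 0.9, 0)) // 15s: stale target polled sooner
+	fmt.Println(nextInterval(30*time.Second, 0.1, 3)) // 5m0s: idle target with failures, capped
+}
+```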
-``` -┌──────────┐ ┌──────────────┐ ┌──────────────┐ ┌─────────────┐ -│ PollLoop │─────▶│ Scheduler │─────▶│ Priority Q │─────▶│ TaskWorkers │ -└──────────┘ └──────────────┘ └──────────────┘ └─────────────┘ - ▲ │ │ │ - │ └─────► Staleness, metrics, circuit breaker feedback ────┘ +```mermaid +flowchart TD + PollLoop["PollLoop\n(ticker & config updates)"] + Scheduler["Scheduler\ncomputes ScheduledTask"] + Staleness["Staleness Tracker\n(last success, freshness score)"] + CircuitBreaker["Circuit Breaker\ntracks failure streaks"] + Backoff["Backoff Policy\nexponential w/ jitter"] + PriorityQ["Priority Queue\nmin-heap by NextRun"] + WorkerPool["TaskWorkers\nN concurrent workers"] + Metrics["Metrics & History\nPrometheus + retention"] + Success["Poll Success"] + Failure{"Poll Failure?"} + Reschedule["Reschedule\n(next interval)"] + BackoffPath["Backoff / Breaker Open"] + DeadLetter["Dead-Letter Queue\noperator review"] + + PollLoop --> Scheduler + Staleness --> Scheduler + CircuitBreaker --> Scheduler + Scheduler --> PriorityQ + + PriorityQ -->|due task| WorkerPool + WorkerPool --> Failure + WorkerPool -->|result| Metrics + WorkerPool -->|freshness| Staleness + + Failure -->|No| Success + Success --> CircuitBreaker + Success --> Reschedule + Success --> Metrics + Reschedule --> Scheduler + + Failure -->|Yes| BackoffPath + BackoffPath --> CircuitBreaker + BackoffPath --> Backoff + Backoff --> Scheduler + Backoff --> DeadLetter + DeadLetter -. periodic retry .-> Scheduler + CircuitBreaker -. state change .-> Scheduler + Metrics --> Scheduler ``` - **Scheduler** computes `ScheduledTask` entries using adaptive intervals. @@ -74,6 +108,18 @@ Exposed via Prometheus (`:9091/metrics`): | **Open** | ≥3 consecutive failures. Poll suppressed. | Exponential delay (max 5 min). | | **Half-open**| Retry window elapsed. Limited re-attempt. | Success ⇒ closed. Failure ⇒ open. 
| +```mermaid +stateDiagram-v2 + [*] --> Closed: Startup / reset + Closed: Default state\nPolling active\nFailure counter increments + Closed --> Open: ≥3 consecutive failures + Open: Polls suppressed\nScheduler schedules backoff (max 5m) + Open --> HalfOpen: Retry window elapsed + HalfOpen: Single probe allowed\nBreaker watches probe result + HalfOpen --> Closed: Probe success\nReset failure streak & delay + HalfOpen --> Open: Probe failure\nIncrease streak & backoff +``` + Backoff configuration: - Initial delay: 5 s diff --git a/docs/operations/pulse-sensor-proxy-runbook.md b/docs/operations/pulse-sensor-proxy-runbook.md index 5d2882bbf..43878ea86 100644 --- a/docs/operations/pulse-sensor-proxy-runbook.md +++ b/docs/operations/pulse-sensor-proxy-runbook.md @@ -9,6 +9,41 @@ - Limiters: ~12 requests/minute per UID (burst 2), per-UID concurrency 2, global concurrency 8, 2 s penalty on validation failures ## Monitoring Alerts & Response + +```mermaid +sequenceDiagram + participant Backend as Pulse Backend + participant Proxy as Sensor Proxy RPC Server + participant Limiter as Limiter (per UID & global) + participant Validator as Payload Validator + participant SSH as Cluster Node (forced `sensors -j`) + participant Metrics as Metrics & Audit Log + + Backend->>Proxy: RPC request (get_temperature) + Proxy->>Proxy: Extract SO_PEERCRED (UID/GID/PID) + Proxy->>Limiter: Check per-UID rate & concurrency + alt Rate limit exceeded + Limiter-->>Proxy: reject + Proxy-->>Backend: 429 Too Many Requests (2 s penalty) + Proxy->>Metrics: increment limiter_rejections_total + else Allowed + Limiter-->>Proxy: permit + Proxy->>Validator: Validate method & payload + alt Validation failure + Validator-->>Proxy: error + Proxy-->>Backend: 400 validation error + Proxy->>Metrics: penalty + audit log entry + else Valid request + Validator-->>Proxy: ok + Proxy->>SSH: run `sensors -j` via forced command + SSH-->>Proxy: temperature JSON + Proxy-->>Backend: telemetry payload + Proxy->>Metrics: record success, latency histogram + Proxy->>Metrics: append audit/audit trail + end + end +``` + ### Rate Limit Hits (`pulse_proxy_limiter_rejections_total`) 1. Check audit log entries tagged `limiter.rejection` for offending UID. 2. Confirm workload legitimacy; if expected, consider increasing limits via config override. 
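+
+For reference, the per-UID budget quoted above ("~12 requests/minute per UID (burst 2)") is the classic token-bucket pattern. The sketch below is illustrative Go using `golang.org/x/time/rate`, not the proxy's actual code:
+
+```go
+package main
+
+import (
+	"fmt"
+	"sync"
+
+	"golang.org/x/time/rate"
+)
+
+// uidLimiters hands out one token bucket per calling UID.
+type uidLimiters struct {
+	mu       sync.Mutex
+	limiters map[uint32]*rate.Limiter
+}
+
+// allow reports whether a request from uid fits its budget
+// (12 requests per minute with a burst of 2).
+func (u *uidLimiters) allow(uid uint32) bool {
+	u.mu.Lock()
+	defer u.mu.Unlock()
+	lim, ok := u.limiters[uid]
+	if !ok {
+		lim = rate.NewLimiter(rate.Limit(12.0/60.0), 2)
+		u.limiters[uid] = lim
+	}
+	return lim.Allow()
+}
+
+func main() {
+	u := &uidLimiters{limiters: make(map[uint32]*rate.Limiter)}
+	// First two calls consume the burst; the third is rejected.
+	fmt.Println(u.allow(999), u.allow(999), u.allow(999)) // true true false
+}
+```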
diff --git a/docs/script-library-guide.md b/docs/script-library-guide.md index d9697da63..7a8000bd9 100644 --- a/docs/script-library-guide.md +++ b/docs/script-library-guide.md @@ -31,6 +31,38 @@ dist/ # Generated bundled scripts └── install-*.sh # Ready for distribution ``` +### Development & Bundling Workflow + +```mermaid +flowchart TD + Author["Author Code\nscripts/lib/*.sh\nscripts/install-*.sh"] + WriteTests["Write Tests\nscripts/tests/test-*.sh\nscripts/tests/integration/"] + UpdateManifest["Update Bundle Manifest\nscripts/bundle.manifest"] + RunTests["Run Tests\nmake test-scripts\nscripts/tests/run.sh"] + TestPass{"Tests Pass?"} + FixCode["Fix Issues"] + Bundle["Bundle Scripts\nmake bundle-scripts\nbash scripts/bundle.sh"] + ValidateBundled["Validate Bundled Output\nbash -n dist/*.sh\ndist/*.sh --dry-run"] + ValidatePass{"Validation\nPass?"} + Distribute["Distribute\ndist/*.sh ready"] + UpdateDocs["Update Documentation\nscripts/lib/README.md"] + + Author --> WriteTests + WriteTests --> UpdateManifest + UpdateManifest --> RunTests + RunTests --> TestPass + TestPass -->|No| FixCode + FixCode --> Author + TestPass -->|Yes| Bundle + Bundle --> ValidateBundled + ValidateBundled --> ValidatePass + ValidatePass -->|No| FixCode + ValidatePass -->|Yes| UpdateDocs + UpdateDocs --> Distribute +``` + +This workflow emphasizes the library's modular design: develop reusable modules in `scripts/lib`, test thoroughly, bundle for distribution, and validate bundled artifacts before release. + ## 3. Using the Library in Your Script ```bash diff --git a/frontend-modern/src/App.tsx b/frontend-modern/src/App.tsx index f2291b20d..773b05398 100644 --- a/frontend-modern/src/App.tsx +++ b/frontend-modern/src/App.tsx @@ -44,6 +44,8 @@ import { DockerIcon } from '@/components/icons/DockerIcon'; import { AlertsIcon } from '@/components/icons/AlertsIcon'; import { SettingsGearIcon } from '@/components/icons/SettingsGearIcon'; import { TokenRevealDialog } from './components/TokenRevealDialog'; +import { ActivationBanner } from './components/Alerts/ActivationBanner'; +import { useAlertsActivation } from './stores/alertsActivation'; // Enhanced store type with proper typing type EnhancedStore = ReturnType; @@ -88,6 +90,7 @@ function App() { : getGlobalWebSocketStore(); return store || getGlobalWebSocketStore(); }; + const alertsActivation = useAlertsActivation(); const fallbackState: State = { nodes: [], @@ -183,6 +186,11 @@ function App() { } }); + onMount(() => { + void alertsActivation.refreshConfig(); + void alertsActivation.refreshActiveAlerts(); + }); + // No longer need tab state management - using router now // Version info @@ -559,6 +567,15 @@ function App() { +
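+          {/* Activation review banner, fed by the useAlertsActivation() store registered above */}
+          <ActivationBanner
+            activationState={alertsActivation.activationState}
+            activeAlerts={alertsActivation.activeAlerts}
+            config={alertsActivation.config}
+            isPastObservationWindow={alertsActivation.isPastObservationWindow}
+            isLoading={alertsActivation.isLoading}
+            refreshActiveAlerts={alertsActivation.refreshActiveAlerts}
+            activate={alertsActivation.activate}
+          />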
diff --git a/frontend-modern/src/api/alerts.ts b/frontend-modern/src/api/alerts.ts index 96e2aa726..6df3c9abb 100644 --- a/frontend-modern/src/api/alerts.ts +++ b/frontend-modern/src/api/alerts.ts @@ -58,6 +58,12 @@ export class AlertsAPI { }); } + static async activate(): Promise<{ success: boolean; state: string; activationTime?: string }> { + return apiFetchJSON(`${this.baseUrl}/activate`, { + method: 'POST', + }); + } + static async clearAlert(alertId: string): Promise<{ success: boolean }> { return apiFetchJSON(`${this.baseUrl}/${encodeURIComponent(alertId)}/clear`, { method: 'POST', diff --git a/frontend-modern/src/components/Alerts/ActivationBanner.tsx b/frontend-modern/src/components/Alerts/ActivationBanner.tsx new file mode 100644 index 000000000..ecfec49ef --- /dev/null +++ b/frontend-modern/src/components/Alerts/ActivationBanner.tsx @@ -0,0 +1,117 @@ +import { Show, createEffect, createMemo, createSignal } from 'solid-js'; +import type { JSX } from 'solid-js'; +import type { Alert } from '@/types/api'; +import type { ActivationState, AlertConfig } from '@/types/alerts'; +import { ActivationModal } from './ActivationModal'; + +interface ActivationBannerProps { + activationState: () => ActivationState | null; + activeAlerts: () => Alert[] | undefined; + config: () => AlertConfig | null; + isPastObservationWindow: () => boolean; + isLoading: () => boolean; + refreshActiveAlerts: () => Promise; + activate: () => Promise; +} + +export function ActivationBanner(props: ActivationBannerProps): JSX.Element { + const [isModalOpen, setIsModalOpen] = createSignal(false); + + const shouldShow = createMemo(() => { + const state = props.activationState(); + return state === 'pending_review' || state === 'snoozed'; + }); + + createEffect(() => { + // Close the modal automatically if activation becomes active while it is open + if (!shouldShow() && isModalOpen()) { + setIsModalOpen(false); + } + }); + + const violationCount = createMemo(() => props.activeAlerts()?.length ?? 0); + + const observationSummary = createMemo(() => { + const count = violationCount(); + if (count <= 0) { + return 'No alert violations detected during observation yet.'; + } + const label = count === 1 ? 'issue' : 'issues'; + return `${count} ${label} detected during observation.`; + }); + + const handleReview = async () => { + await props.refreshActiveAlerts(); + setIsModalOpen(true); + }; + + const handleActivated = async () => { + await props.refreshActiveAlerts(); + }; + + return ( + <> + +
+      {/* Shown while alert notifications are pending review or snoozed */}
+      <Show when={shouldShow()}>
+        <div role="status">
+          <div>
+            <p>
+              Monitoring is live; notifications will start after you review settings.
+            </p>
+            <p>{observationSummary()}</p>
+            <Show when={props.isPastObservationWindow()}>
+              <p>
+                24h observation ending—activate to start notifications.
+              </p>
+            </Show>
+          </div>
+          <button
+            type="button"
+            onClick={() => void handleReview()}
+            disabled={props.isLoading()}
+          >
+            Review alerts
+          </button>
+        </div>
+      </Show>
+
+      <ActivationModal
+        isOpen={isModalOpen()}
+        onClose={() =>
+ + setIsModalOpen(false)} + onActivated={handleActivated} + config={props.config} + activeAlerts={props.activeAlerts} + isLoading={props.isLoading} + activate={props.activate} + refreshActiveAlerts={props.refreshActiveAlerts} + /> + + ); +} diff --git a/frontend-modern/src/components/Alerts/ActivationModal.tsx b/frontend-modern/src/components/Alerts/ActivationModal.tsx new file mode 100644 index 000000000..f68c4ef67 --- /dev/null +++ b/frontend-modern/src/components/Alerts/ActivationModal.tsx @@ -0,0 +1,354 @@ +import { For, Show, createMemo, createSignal } from 'solid-js'; +import { Portal } from 'solid-js/web'; +import { useNavigate } from '@solidjs/router'; +import type { JSX } from 'solid-js'; +import type { Alert } from '@/types/api'; +import type { AlertConfig, AlertThresholds, HysteresisThreshold } from '@/types/alerts'; +import { showError, showSuccess } from '@/utils/toast'; + +interface ActivationModalProps { + isOpen: boolean; + onClose: () => void; + onActivated?: () => Promise | void; + config: () => AlertConfig | null; + activeAlerts: () => Alert[] | undefined; + isLoading: () => boolean; + activate: () => Promise; + refreshActiveAlerts: () => Promise; +} + +interface ThresholdSummary { + heading: string; + items: Array<{ label: string; value: string }>; +} + +const extractTrigger = ( + threshold?: HysteresisThreshold | number, + legacy?: number, +): number | undefined => { + if (typeof threshold === 'number') { + return threshold; + } + if (threshold && typeof threshold === 'object' && typeof threshold.trigger === 'number') { + return threshold.trigger; + } + if (typeof legacy === 'number') { + return legacy; + } + return undefined; +}; + +const formatThreshold = (value: number | undefined): string => { + if (value === undefined || Number.isNaN(value)) { + return 'Not configured'; + } + if (value <= 0) { + return 'Disabled'; + } + return `${value}%`; +}; + +const summarizeThresholds = (config: AlertConfig | null): ThresholdSummary[] => { + if (!config) { + return []; + } + + const summarize = (thresholds?: AlertThresholds): Array<{ label: string; value: string }> => { + if (!thresholds) return []; + return [ + { + label: 'CPU', + value: formatThreshold(extractTrigger(thresholds.cpu, thresholds.cpuLegacy)), + }, + { + label: 'Memory', + value: formatThreshold(extractTrigger(thresholds.memory, thresholds.memoryLegacy)), + }, + { + label: 'Disk', + value: formatThreshold(extractTrigger(thresholds.disk, thresholds.diskLegacy)), + }, + ]; + }; + + const guestItems = summarize(config.guestDefaults); + const nodeItems = summarize(config.nodeDefaults); + const storageValue = formatThreshold(extractTrigger(config.storageDefault)); + + const summaries: ThresholdSummary[] = []; + + if (guestItems.length > 0) { + summaries.push({ heading: 'Guest thresholds', items: guestItems }); + } + if (nodeItems.length > 0) { + const nodeWithTemperature = [ + ...nodeItems, + { + label: 'Temperature', + value: formatThreshold(extractTrigger(config.nodeDefaults?.temperature)), + }, + ]; + summaries.push({ heading: 'Node thresholds', items: nodeWithTemperature }); + } + summaries.push({ + heading: 'Storage', + items: [ + { + label: 'Usage', + value: storageValue, + }, + ], + }); + + return summaries; +}; + +const getChannelSummary = (config: AlertConfig | null): { status: 'configured' | 'missing'; message: string } => { + if (!config || !config.notifications) { + return { + status: 'missing', + message: 'Notification channels are not configured yet. 
Configure email or webhook destinations before activation.', + }; + } + + const emailConfigured = Boolean(config.notifications.email?.server); + const webhookConfigured = Boolean(config.notifications.webhooks?.some((hook) => hook.enabled)); + + if (!emailConfigured && !webhookConfigured) { + return { + status: 'missing', + message: 'Notification channels are not configured yet. Configure email or webhook destinations before activation.', + }; + } + + if (emailConfigured && webhookConfigured) { + return { + status: 'configured', + message: 'Email and webhook destinations are ready. You can fine-tune them under Notification Destinations.', + }; + } + + if (emailConfigured) { + return { + status: 'configured', + message: 'Email notifications are configured. Add additional webhook destinations if needed.', + }; + } + + return { + status: 'configured', + message: 'Webhook notifications are configured. Add email fallbacks if needed.', + }; +}; + +export function ActivationModal(props: ActivationModalProps): JSX.Element { + const navigate = useNavigate(); + const [isSubmitting, setIsSubmitting] = createSignal(false); + + const thresholdSummaries = createMemo(() => summarizeThresholds(props.config())); + + const violations = createMemo(() => props.activeAlerts() ?? []); + const violationCount = createMemo(() => violations().length); + + const channelSummary = createMemo(() => getChannelSummary(props.config())); + + const observationHours = createMemo(() => props.config()?.observationWindowHours ?? 24); + + const handleActivate = async () => { + if (isSubmitting()) { + return; + } + setIsSubmitting(true); + const success = await props.activate(); + + if (success) { + await props.refreshActiveAlerts(); + showSuccess('Alert notifications activated. Notifications will now dispatch to configured destinations.'); + if (props.onActivated) { + await props.onActivated(); + } + props.onClose(); + } else { + showError('Failed to activate alert notifications. Please try again.'); + } + + setIsSubmitting(false); + }; + + const handleNavigateDestinations = () => { + props.onClose(); + navigate('/alerts/destinations'); + }; + + return ( + + +
+    <Show when={props.isOpen}>
+      <Portal>
+        {/* Pre-activation review: thresholds, open issues, and notification channels */}
+        <div role="dialog" aria-modal="true" aria-labelledby="activation-modal-title">
+          <div>
+            <header>
+              <h2 id="activation-modal-title">
+                Review alerts before activating
+              </h2>
+              <p>
+                Monitoring is already running. Confirm thresholds and destinations before enabling notifications.
+              </p>
+            </header>
+
+            <section>
+              <h3>
+                Current thresholds
+              </h3>
+              <p>
+                Thresholds determine when alerts fire. Adjust them under Alert Thresholds if needed before activating.
+              </p>
+              <For each={thresholdSummaries()}>
+                {(section) => (
+                  <div>
+                    <h4>
+                      {section.heading}
+                    </h4>
+                    <ul>
+                      <For each={section.items}>
+                        {(item) => (
+                          <li>
+                            <span>{item.label}</span>
+                            <span>{item.value}</span>
+                          </li>
+                        )}
+                      </For>
+                    </ul>
+                  </div>
+                )}
+              </For>
+            </section>
+
+            <section>
+              <header>
+                <h3>
+                  Issues detected
+                </h3>
+                <span>
+                  Observation window: {observationHours()}h
+                </span>
+              </header>
+              <p>
+                {violationCount() > 0
+                  ? 'These alerts are currently open. Activating notifications will send them to configured channels.'
+                  : 'No alerts have breached thresholds yet. Activation will notify you immediately when new issues appear.'}
+              </p>
+              <Show
+                when={violationCount() > 0}
+                fallback={
+                  <div>
+                    No active violations detected during the observation window.
+                  </div>
+                }
+              >
+                <div>
+                  <For each={violations()}>
+                    {(alert) => (
+                      <div>
+                        <div>
+                          <span>{alert.level}</span>
+                          <span>{alert.resourceName || alert.resourceId}</span>
+                          <span>{alert.type}</span>
+                        </div>
+                        <p>{alert.message}</p>
+                        <p>
+                          Threshold {alert.threshold}% • Current {alert.value}% • Since{' '}
+                          {new Date(alert.startTime).toLocaleString()}
+                        </p>
+                      </div>
+                    )}
+                  </For>
+                </div>
+              </Show>
+            </section>
+
+            <section>
+              <h3>
+                Notification channels
+              </h3>
+              <p>{channelSummary().message}</p>
+              <button type="button" onClick={handleNavigateDestinations}>
+                Notification Destinations
+              </button>
+            </section>
+
+            <footer>
+              <p>
+                You can snooze alerts later if you need a quiet period.
+              </p>
+              <div>
+                <button type="button" onClick={props.onClose} disabled={isSubmitting()}>
+                  Cancel
+                </button>
+                <button
+                  type="button"
+                  onClick={() => void handleActivate()}
+                  disabled={isSubmitting() || props.isLoading()}
+                >
+                  Activate notifications
+                </button>
+              </div>
+            </footer>
+          </div>
+        </div>
+      </Portal>
+    </Show>
+ + + ); +} diff --git a/frontend-modern/src/stores/alertsActivation.ts b/frontend-modern/src/stores/alertsActivation.ts new file mode 100644 index 000000000..8ed29b305 --- /dev/null +++ b/frontend-modern/src/stores/alertsActivation.ts @@ -0,0 +1,96 @@ +import { createSignal } from 'solid-js'; +import { AlertsAPI } from '@/api/alerts'; +import type { AlertConfig, ActivationState as ActivationStateType } from '@/types/alerts'; +import type { Alert } from '@/types/api'; + +// Create signals for activation state +const [config, setConfig] = createSignal(null); +const [activationState, setActivationState] = createSignal(null); +const [isLoading, setIsLoading] = createSignal(false); +const [activeAlerts, setActiveAlerts] = createSignal([]); +const [lastError, setLastError] = createSignal(null); + +// Refresh config from API +const refreshConfig = async (): Promise => { + try { + setIsLoading(true); + setLastError(null); + const alertConfig = await AlertsAPI.getConfig(); + setConfig(alertConfig); + setActivationState(alertConfig.activationState || 'active'); + } catch (error) { + console.error('Failed to fetch alert config:', error); + setLastError(error instanceof Error ? error.message : 'Unknown error'); + } finally { + setIsLoading(false); + } +}; + +// Fetch active alerts (for violation count) +const refreshActiveAlerts = async (): Promise => { + try { + const alerts = await AlertsAPI.getActive(); + setActiveAlerts(alerts); + } catch (error) { + console.error('Failed to fetch active alerts:', error); + // Don't set error state for this - it's not critical + } +}; + +// Activate alert notifications +const activate = async (): Promise => { + try { + setIsLoading(true); + setLastError(null); + const result = await AlertsAPI.activate(); + + if (result.success) { + // Refresh config to get updated state + await refreshConfig(); + return true; + } + return false; + } catch (error) { + console.error('Failed to activate alerts:', error); + setLastError(error instanceof Error ? 
error.message : 'Unknown error'); + return false; + } finally { + setIsLoading(false); + } +}; + +// Check if past observation window +const isPastObservationWindow = (): boolean => { + const cfg = config(); + if (!cfg || !cfg.activationTime || !cfg.observationWindowHours) { + return false; + } + + const activationTime = new Date(cfg.activationTime); + const windowMs = cfg.observationWindowHours * 60 * 60 * 1000; + const expiryTime = activationTime.getTime() + windowMs; + + return Date.now() > expiryTime; +}; + +// Export the store +export const useAlertsActivation = () => ({ + // Signals + config, + activationState, + isLoading, + activeAlerts, + lastError, + + // Computed + isPastObservationWindow, + + // Actions + refreshConfig, + refreshActiveAlerts, + activate, +}); + +// Initialize on module load +refreshConfig(); +refreshActiveAlerts(); diff --git a/frontend-modern/src/types/alerts.ts b/frontend-modern/src/types/alerts.ts index a2ac141a2..4edad8d63 100644 --- a/frontend-modern/src/types/alerts.ts +++ b/frontend-modern/src/types/alerts.ts @@ -97,8 +97,13 @@ export interface BackupAlertConfig { criticalDays: number; } +export type ActivationState = 'pending_review' | 'active' | 'snoozed'; + export interface AlertConfig { enabled: boolean; + activationState?: ActivationState; + observationWindowHours?: number; + activationTime?: string; guestDefaults: AlertThresholds; nodeDefaults: AlertThresholds; storageDefault: HysteresisThreshold; diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go index d0a2b85ab..313ed27e1 100644 --- a/internal/alerts/alerts.go +++ b/internal/alerts/alerts.go @@ -24,6 +24,15 @@ const ( AlertLevelCritical AlertLevel = "critical" ) +// ActivationState represents the alert notification activation state +type ActivationState string + +const ( + ActivationPending ActivationState = "pending_review" + ActivationActive ActivationState = "active" + ActivationSnoozed ActivationState = "snoozed" +) + func normalizePoweredOffSeverity(level AlertLevel) AlertLevel { switch strings.ToLower(string(level)) { case string(AlertLevelCritical): @@ -309,6 +318,9 @@ type GuestLookup struct { // AlertConfig represents the complete alert configuration type AlertConfig struct { Enabled bool `json:"enabled"` + ActivationState ActivationState `json:"activationState,omitempty"` + ObservationWindowHours int `json:"observationWindowHours,omitempty"` + ActivationTime *time.Time `json:"activationTime,omitempty"` GuestDefaults ThresholdConfig `json:"guestDefaults"` NodeDefaults ThresholdConfig `json:"nodeDefaults"` StorageDefault HysteresisThreshold `json:"storageDefault"` @@ -455,7 +467,9 @@ func NewManager() *Manager { pmgAnomalyTrackers: make(map[string]*pmgAnomalyTracker), ackState: make(map[string]ackRecord), config: AlertConfig{ - Enabled: true, + Enabled: true, + ActivationState: ActivationPending, + ObservationWindowHours: 24, GuestDefaults: ThresholdConfig{ PoweredOffSeverity: AlertLevelWarning, CPU: &HysteresisThreshold{Trigger: 80, Clear: 75}, @@ -615,6 +629,15 @@ func (m *Manager) dispatchAlert(alert *Alert, async bool) bool { return false } + // Check activation state - only dispatch notifications if active + if m.config.ActivationState != ActivationActive { + log.Debug(). + Str("alertID", alert.ID). + Str("activationState", string(m.config.ActivationState)). + Msg("Alert notification suppressed - not activated") + return false + } + if suppressed, reason := m.shouldSuppressNotification(alert); suppressed { log.Debug(). Str("alertID", alert.ID). 
@@ -783,6 +806,27 @@ func (m *Manager) UpdateConfig(config AlertConfig) { config.GuestDefaults.PoweredOffSeverity = normalizePoweredOffSeverity(config.GuestDefaults.PoweredOffSeverity) config.NodeDefaults.PoweredOffSeverity = normalizePoweredOffSeverity(config.NodeDefaults.PoweredOffSeverity) + // Migration logic for activation state (backward compatibility) + if config.ObservationWindowHours <= 0 { + config.ObservationWindowHours = 24 + } + if config.ActivationState == "" { + // Determine if this is an existing installation or new + // Existing installations have active alerts already + isExistingInstall := len(m.activeAlerts) > 0 || len(config.Overrides) > 0 + if isExistingInstall { + // Existing install: auto-activate to preserve behavior + config.ActivationState = ActivationActive + now := time.Now() + config.ActivationTime = &now + log.Info().Msg("Migrating existing installation to active alert state") + } else { + // New install: start in pending review + config.ActivationState = ActivationPending + log.Info().Msg("New installation: alerts pending activation") + } + } + m.config = config for id, override := range m.config.Overrides { override.PoweredOffSeverity = normalizePoweredOffSeverity(override.PoweredOffSeverity) @@ -6548,17 +6592,15 @@ func (m *Manager) LoadActiveAlerts() error { // Only notify for alerts that started recently (within last 2 hours) to avoid spam if alert.Level == AlertLevelCritical && now.Sub(alert.StartTime) < 2*time.Hour { // Use a goroutine and add a small delay to avoid notification spam on startup - if m.onAlert != nil { - alertCopy := alert.Clone() - go func(a *Alert) { - time.Sleep(10 * time.Second) // Wait for system to stabilize after restart - log.Info(). - Str("alertID", a.ID). - Str("resource", a.ResourceName). - Msg("Sending notification for restored critical alert") - m.onAlert(a) - }(alertCopy) - } + alertCopy := alert.Clone() + go func(a *Alert) { + time.Sleep(10 * time.Second) // Wait for system to stabilize after restart + log.Info(). + Str("alertID", a.ID). + Str("resource", a.ResourceName). 
+				Msg("Attempting to send notification for restored critical alert")
+			m.dispatchAlert(a, false) // Use dispatchAlert to respect activation state and quiet hours
+		}(alertCopy)
 		}
 	}
 
diff --git a/internal/api/alerts.go b/internal/api/alerts.go
index e59ba7eb4..ff32b9a37 100644
--- a/internal/api/alerts.go
+++ b/internal/api/alerts.go
@@ -103,6 +103,52 @@ func (h *AlertHandlers) UpdateAlertConfig(w http.ResponseWriter, r *http.Request
 	}
 }
 
+// ActivateAlerts activates alert notifications
+func (h *AlertHandlers) ActivateAlerts(w http.ResponseWriter, r *http.Request) {
+	// Get current config
+	config := h.monitor.GetAlertManager().GetConfig()
+
+	// Check if already active
+	if config.ActivationState == alerts.ActivationActive {
+		if err := utils.WriteJSONResponse(w, map[string]interface{}{
+			"success": true, // the frontend AlertsAPI.activate() checks this field
+			"status":  "success",
+			"message": "Alerts already activated",
+			"state":   string(config.ActivationState),
+		}); err != nil {
+			log.Error().Err(err).Msg("Failed to write activate response")
+		}
+		return
+	}
+
+	// Activate notifications
+	now := time.Now()
+	config.ActivationState = alerts.ActivationActive
+	config.ActivationTime = &now
+
+	// Update config
+	h.monitor.GetAlertManager().UpdateConfig(config)
+
+	// Save to persistent storage
+	if err := h.monitor.GetConfigPersistence().SaveAlertConfig(config); err != nil {
+		log.Error().Err(err).Msg("Failed to save alert configuration after activation")
+		http.Error(w, "Failed to save configuration", http.StatusInternalServerError)
+		return
+	}
+
+	log.Info().Msg("Alert notifications activated")
+
+	if err := utils.WriteJSONResponse(w, map[string]interface{}{
+		"success":        true, // the frontend AlertsAPI.activate() checks this field
+		"status":         "success",
+		"message":        "Alert notifications activated",
+		"state":          string(config.ActivationState),
+		"activationTime": config.ActivationTime,
+	}); err != nil {
+		log.Error().Err(err).Msg("Failed to write activate response")
+	}
+}
+
 // GetActiveAlerts returns all active alerts
 func (h *AlertHandlers) GetActiveAlerts(w http.ResponseWriter, r *http.Request) {
 	alerts := h.monitor.GetAlertManager().GetActiveAlerts()
@@ -619,6 +663,8 @@ func (h *AlertHandlers) HandleAlerts(w http.ResponseWriter, r *http.Request) {
 		h.GetAlertConfig(w, r)
 	case path == "config" && r.Method == http.MethodPut:
 		h.UpdateAlertConfig(w, r)
+	case path == "activate" && r.Method == http.MethodPost:
+		h.ActivateAlerts(w, r)
 	case path == "active" && r.Method == http.MethodGet:
 		h.GetActiveAlerts(w, r)
 	case path == "history" && r.Method == http.MethodGet: