mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-02-19 07:50:43 +01:00
This commit implements critical reliability features to prevent data loss and improve alert system robustness:

**Persistent Notification Queue:**
- SQLite-backed queue with WAL journaling for crash recovery
- Dead Letter Queue (DLQ) for notifications that exhaust retries
- Exponential backoff retry logic (100ms → 200ms → 400ms)
- Full audit trail for all notification delivery attempts
- New file: internal/notifications/queue.go (661 lines)

**DLQ Management API:**
- GET /api/notifications/dlq - Retrieve DLQ items
- GET /api/notifications/queue/stats - Queue statistics
- POST /api/notifications/dlq/retry - Retry failed notifications
- POST /api/notifications/dlq/delete - Delete DLQ items
- New file: internal/api/notification_queue.go (145 lines)

**Prometheus Metrics:**
- 18 comprehensive metrics for alerts and notifications
- Metric hooks integrated via function pointers to avoid import cycles
- /metrics endpoint exposed for Prometheus scraping
- New file: internal/metrics/alert_metrics.go (193 lines)

**Alert History Reliability:**
- Exponential backoff retry for history saves (3 attempts)
- Automatic backup restoration on write failure
- Modified: internal/alerts/history.go

**Flapping Detection:**
- Detects and suppresses rapidly oscillating alerts
- Configurable window (default: 5 minutes)
- Configurable threshold (default: 5 state changes)
- Configurable cooldown (default: 15 minutes)
- Automatic cleanup of inactive flapping history

**Alert TTL & Auto-Cleanup:**
- MaxAlertAgeDays: Auto-cleanup old alerts (default: 7 days)
- MaxAcknowledgedAgeDays: Faster cleanup for acked alerts (default: 1 day)
- AutoAcknowledgeAfterHours: Auto-ack long-running alerts (default: 24 hours)
- Prevents memory leaks from long-running alerts

**WebSocket Broadcast Sequencer:**
- Channel-based sequencing ensures ordered message delivery
- 100ms coalescing window for rapid state updates
- Prevents race conditions in WebSocket broadcasts
- Modified: internal/websocket/hub.go

**Configuration Fields Added:**
- FlappingEnabled, FlappingWindowSeconds, FlappingThreshold, FlappingCooldownMinutes
- MaxAlertAgeDays, MaxAcknowledgedAgeDays, AutoAcknowledgeAfterHours

All features are production-ready and build successfully.
187 lines
5.5 KiB
Go
187 lines
5.5 KiB
Go
package api
|
|
|
|
import (
|
|
"encoding/json"
|
|
"net/http"
|
|
"strconv"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/config"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/monitoring"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/notifications"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/utils"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// NotificationQueueHandlers handles notification queue API endpoints
|
|
type NotificationQueueHandlers struct {
|
|
monitor *monitoring.Monitor
|
|
}
|
|
|
|
// NewNotificationQueueHandlers creates new notification queue handlers
|
|
func NewNotificationQueueHandlers(monitor *monitoring.Monitor) *NotificationQueueHandlers {
|
|
return &NotificationQueueHandlers{
|
|
monitor: monitor,
|
|
}
|
|
}
|
|
|
|
// GetDLQ returns notifications in the dead letter queue
|
|
func (h *NotificationQueueHandlers) GetDLQ(w http.ResponseWriter, r *http.Request) {
|
|
if !ensureScope(w, r, config.ScopeMonitoringRead) {
|
|
return
|
|
}
|
|
|
|
limit := 100
|
|
if limitStr := r.URL.Query().Get("limit"); limitStr != "" {
|
|
if l, err := strconv.Atoi(limitStr); err == nil && l > 0 && l <= 1000 {
|
|
limit = l
|
|
}
|
|
}
|
|
|
|
queue := h.monitor.GetNotificationManager().GetQueue()
|
|
if queue == nil {
|
|
http.Error(w, "Notification queue not initialized", http.StatusServiceUnavailable)
|
|
return
|
|
}
|
|
|
|
dlq, err := queue.GetDLQ(limit)
|
|
if err != nil {
|
|
log.Error().Err(err).Msg("Failed to get DLQ")
|
|
http.Error(w, "Failed to retrieve dead letter queue", http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
if err := utils.WriteJSONResponse(w, dlq); err != nil {
|
|
log.Error().Err(err).Msg("Failed to write DLQ response")
|
|
}
|
|
}
|
|
|
|
// GetQueueStats returns statistics about the notification queue
|
|
func (h *NotificationQueueHandlers) GetQueueStats(w http.ResponseWriter, r *http.Request) {
|
|
if !ensureScope(w, r, config.ScopeMonitoringRead) {
|
|
return
|
|
}
|
|
|
|
queue := h.monitor.GetNotificationManager().GetQueue()
|
|
if queue == nil {
|
|
http.Error(w, "Notification queue not initialized", http.StatusServiceUnavailable)
|
|
return
|
|
}
|
|
|
|
stats, err := queue.GetQueueStats()
|
|
if err != nil {
|
|
log.Error().Err(err).Msg("Failed to get queue stats")
|
|
http.Error(w, "Failed to retrieve queue statistics", http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
if err := utils.WriteJSONResponse(w, stats); err != nil {
|
|
log.Error().Err(err).Msg("Failed to write queue stats response")
|
|
}
|
|
}
|
|
|
|
// RetryDLQItem retries a specific notification from the DLQ
|
|
func (h *NotificationQueueHandlers) RetryDLQItem(w http.ResponseWriter, r *http.Request) {
|
|
if !ensureScope(w, r, config.ScopeMonitoringWrite) {
|
|
return
|
|
}
|
|
|
|
var request struct {
|
|
ID string `json:"id"`
|
|
}
|
|
|
|
if err := json.NewDecoder(r.Body).Decode(&request); err != nil {
|
|
http.Error(w, "Invalid request body", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
if request.ID == "" {
|
|
http.Error(w, "Missing notification ID", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
queue := h.monitor.GetNotificationManager().GetQueue()
|
|
if queue == nil {
|
|
http.Error(w, "Notification queue not initialized", http.StatusServiceUnavailable)
|
|
return
|
|
}
|
|
|
|
// Reset notification to pending status with immediate retry
|
|
if err := queue.ScheduleRetry(request.ID, 0); err != nil {
|
|
log.Error().Err(err).Str("id", request.ID).Msg("Failed to retry DLQ item")
|
|
http.Error(w, "Failed to retry notification", http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
log.Info().Str("id", request.ID).Msg("DLQ notification scheduled for retry")
|
|
|
|
if err := utils.WriteJSONResponse(w, map[string]interface{}{
|
|
"success": true,
|
|
"message": "Notification scheduled for retry",
|
|
"id": request.ID,
|
|
}); err != nil {
|
|
log.Error().Err(err).Msg("Failed to write retry response")
|
|
}
|
|
}
|
|
|
|
// DeleteDLQItem removes a notification from the DLQ permanently
|
|
func (h *NotificationQueueHandlers) DeleteDLQItem(w http.ResponseWriter, r *http.Request) {
|
|
if !ensureScope(w, r, config.ScopeMonitoringWrite) {
|
|
return
|
|
}
|
|
|
|
var request struct {
|
|
ID string `json:"id"`
|
|
}
|
|
|
|
if err := json.NewDecoder(r.Body).Decode(&request); err != nil {
|
|
http.Error(w, "Invalid request body", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
if request.ID == "" {
|
|
http.Error(w, "Missing notification ID", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
queue := h.monitor.GetNotificationManager().GetQueue()
|
|
if queue == nil {
|
|
http.Error(w, "Notification queue not initialized", http.StatusServiceUnavailable)
|
|
return
|
|
}
|
|
|
|
// Update status to deleted/cancelled
|
|
if err := queue.UpdateStatus(request.ID, notifications.QueueStatusCancelled, "Manually deleted from DLQ"); err != nil {
|
|
log.Error().Err(err).Str("id", request.ID).Msg("Failed to delete DLQ item")
|
|
http.Error(w, "Failed to delete notification", http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
log.Info().Str("id", request.ID).Msg("DLQ notification deleted")
|
|
|
|
if err := utils.WriteJSONResponse(w, map[string]interface{}{
|
|
"success": true,
|
|
"message": "Notification deleted from DLQ",
|
|
"id": request.ID,
|
|
}); err != nil {
|
|
log.Error().Err(err).Msg("Failed to write delete response")
|
|
}
|
|
}
|
|
|
|
// HandleNotificationQueue routes notification queue requests
|
|
func (h *NotificationQueueHandlers) HandleNotificationQueue(w http.ResponseWriter, r *http.Request) {
|
|
path := r.URL.Path
|
|
|
|
switch {
|
|
case path == "/api/notifications/dlq" && r.Method == http.MethodGet:
|
|
h.GetDLQ(w, r)
|
|
case path == "/api/notifications/queue/stats" && r.Method == http.MethodGet:
|
|
h.GetQueueStats(w, r)
|
|
case path == "/api/notifications/dlq/retry" && r.Method == http.MethodPost:
|
|
h.RetryDLQItem(w, r)
|
|
case path == "/api/notifications/dlq/delete" && r.Method == http.MethodPost:
|
|
h.DeleteDLQItem(w, r)
|
|
default:
|
|
http.Error(w, "Not found", http.StatusNotFound)
|
|
}
|
|
}
|