Files
Pulse/internal/api/middleware.go
rcourtman 495e6c7945 feat: comprehensive diagnostics and observability improvements
Upgrade diagnostics infrastructure from 5/10 to 8/10 production readiness
with enhanced metrics, logging, and request correlation capabilities.

**Request Correlation**
- Wire request IDs through context in middleware
- Return X-Request-ID header in all API responses
- Enable downstream log correlation across request lifecycle

**HTTP/API Metrics** (3 of the 18 new Prometheus metrics in this change)
- pulse_http_request_duration_seconds - API latency histogram
- pulse_http_requests_total - request counter by method/route/status
- pulse_http_request_errors_total - error counter by type
- Path normalization to control label cardinality

**Per-Node Poll Metrics**
- pulse_monitor_node_poll_duration_seconds - per-node timing
- pulse_monitor_node_poll_total - success/error counts per node
- pulse_monitor_node_poll_errors_total - error breakdown per node
- pulse_monitor_node_poll_last_success_timestamp - freshness tracking
- pulse_monitor_node_poll_staleness_seconds - age since last success
- Enables multi-node hotspot identification

**Scheduler Health Metrics**
- pulse_scheduler_queue_due_soon - ready queue depth
- pulse_scheduler_queue_depth - by instance type
- pulse_scheduler_queue_wait_seconds - time in queue histogram
- pulse_scheduler_dead_letter_depth - failed task tracking
- pulse_scheduler_breaker_state - circuit breaker state
- pulse_scheduler_breaker_failure_count - consecutive failures
- pulse_scheduler_breaker_retry_seconds - time until retry
- Enable alerting on DLQ spikes, breaker opens, queue backlogs

**Diagnostics Endpoint Caching**
- pulse_diagnostics_cache_hits_total - cache performance
- pulse_diagnostics_cache_misses_total - cache misses
- pulse_diagnostics_refresh_duration_seconds - probe timing
- 45-second TTL prevents thundering herd on /api/diagnostics
- Thread-safe with RWMutex
- X-Diagnostics-Cached-At header shows cache freshness

**Debug Log Performance**
- Gate high-frequency debug logs behind IsLevelEnabled() checks
- Reduces CPU waste in production when debug disabled
- Covers scheduler loops, poll cycles, API handlers

**Persistent Logging**
- File logging with automatic rotation
- LOG_FILE, LOG_MAX_SIZE, LOG_MAX_AGE, LOG_COMPRESS env vars
- MultiWriter sends logs to both stderr and file
- Gzip compression support for rotated logs

Files modified:
- internal/api/diagnostics.go (caching layer)
- internal/api/middleware.go (request IDs, HTTP metrics)
- internal/api/http_metrics.go (NEW - HTTP metric definitions)
- internal/logging/logging.go (file logging with rotation)
- internal/monitoring/metrics.go (node + scheduler metrics)
- internal/monitoring/monitor.go (instrumentation, debug gating)

Impact: Dramatically improved production troubleshooting with per-node
visibility, scheduler health metrics, persistent logs, and cached
diagnostics. Fast incident response is now possible for multi-node deployments.
2025-10-21 12:37:39 +00:00

211 lines
5.7 KiB
Go

package api
import (
	"bufio"
	"encoding/json"
	"errors"
	"fmt"
	"net"
	"net/http"
	"runtime/debug"
	"strings"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/logging"
	"github.com/rs/zerolog/log"
)
// APIError is the structured JSON body sent to clients when a request fails.
// It also satisfies the error interface, so handlers can return it directly.
type APIError struct {
	// ErrorMessage is the human-readable description (JSON key "error").
	ErrorMessage string `json:"error"`
	// Code is an optional machine-readable identifier such as "internal_error".
	Code string `json:"code,omitempty"`
	// StatusCode is the HTTP status associated with the error.
	StatusCode int `json:"status_code"`
	// Timestamp is filled by the constructors in this file with Unix seconds.
	Timestamp int64 `json:"timestamp"`
	// RequestID is an optional request correlation identifier.
	RequestID string `json:"request_id,omitempty"`
	// Details holds optional key/value context, e.g. per-field validation messages.
	Details map[string]string `json:"details,omitempty"`
}

// Error returns the human-readable message, which makes *APIError usable
// wherever an error value is expected.
func (e *APIError) Error() string {
	msg := e.ErrorMessage
	return msg
}
// ErrorHandler is a middleware that tags each request with a request ID,
// records request metrics, recovers from handler panics, and logs responses
// with a 4xx/5xx status.
//
// For every non-WebSocket request it:
//   - passes the trimmed incoming X-Request-ID header to logging.WithRequestID
//     and echoes the returned ID in the X-Request-ID response header;
//   - wraps the ResponseWriter so the status code can be observed;
//   - reports method, normalized route, status and elapsed time through
//     recordAPIRequest once the handler has finished (or panicked).
//
// WebSocket upgrade requests are handed to next untouched.
func ErrorHandler(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Fix for issue #334: Normalize empty path to "/" before ServeMux processes it
		// This prevents the automatic redirect from "" to "./"
		if r.URL.Path == "" {
			r.URL.Path = "/"
		}

		// Skip error handling for WebSocket endpoints. The Upgrade token is
		// ASCII case-insensitive (RFC 6455 §4.2.1), so "WebSocket" must match
		// as well as "websocket".
		if strings.EqualFold(strings.TrimSpace(r.Header.Get("Upgrade")), "websocket") {
			next.ServeHTTP(w, r)
			return
		}

		// Add request ID to context, honoring any incoming header value.
		incomingID := strings.TrimSpace(r.Header.Get("X-Request-ID"))
		ctxWithID, requestID := logging.WithRequestID(r.Context(), incomingID)
		r = r.WithContext(ctxWithID)

		// Create a custom response writer to capture status codes
		rw := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}
		rw.Header().Set("X-Request-ID", requestID)

		start := time.Now()
		routeLabel := normalizeRoute(r.URL.Path)
		method := r.Method

		// Registered first so it runs last: by then the recovery handler
		// below has already written the 500 for a panicking request.
		defer func() {
			recordAPIRequest(method, routeLabel, rw.StatusCode(), time.Since(start))
		}()

		// Recover from panics
		defer func() {
			rec := recover()
			if rec == nil {
				return
			}

			// net/http reserves ErrAbortHandler for deliberately aborting a
			// response without logging; hand it back to the server untouched.
			if rec == http.ErrAbortHandler {
				panic(rec)
			}

			log.Error().
				Interface("error", rec).
				Str("path", r.URL.Path).
				Str("method", r.Method).
				Str("request_id", requestID).
				Bytes("stack", debug.Stack()).
				Msg("Panic recovered in API handler")

			// Once the response has started, the status line is gone and a
			// JSON error would only be appended to a partial body.
			if rw.written {
				return
			}

			writeErrorResponse(rw, http.StatusInternalServerError, "internal_error",
				"An unexpected error occurred", nil)
		}()

		// Call the next handler
		next.ServeHTTP(rw, r)

		// Log errors (4xx and 5xx)
		if status := rw.StatusCode(); status >= 400 {
			log.Warn().
				Str("path", r.URL.Path).
				Str("method", r.Method).
				Int("status", status).
				Str("request_id", requestID).
				Msg("Request failed")
		}
	})
}
// TimeoutHandler returns middleware that limits how long the wrapped handler
// may run. It delegates to http.TimeoutHandler with the body "Request timeout".
//
// Long-lived connections are exempt and are served by next directly:
//   - WebSocket upgrades (Upgrade header equal to "websocket", compared
//     case-insensitively as RFC 6455 §4.2.1 requires);
//   - Server-Sent Events requests (Accept header mentioning
//     "text/event-stream", so list and parameter forms match too).
func TimeoutHandler(timeout time.Duration) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		// Build the timeout wrapper once per chain instead of once per request.
		limited := http.TimeoutHandler(next, timeout, "Request timeout")

		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			upgrade := strings.TrimSpace(r.Header.Get("Upgrade"))
			accept := strings.ToLower(r.Header.Get("Accept"))

			// Skip timeout for WebSocket and SSE endpoints
			if strings.EqualFold(upgrade, "websocket") || strings.Contains(accept, "text/event-stream") {
				next.ServeHTTP(w, r)
				return
			}

			limited.ServeHTTP(w, r)
		})
	}
}
// JSONHandler adapts an error-returning handler to http.HandlerFunc.
//
// It sets the Content-Type header to application/json before invoking
// handler. If handler returns an error:
//   - when an *APIError is found anywhere in the error's wrap chain, its
//     status, code, message and details are sent via writeErrorResponse;
//   - otherwise the error is logged and a generic 500 "internal_error"
//     response is sent.
//
// A nil return means the handler produced the response itself.
func JSONHandler(handler func(w http.ResponseWriter, r *http.Request) error) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")

		err := handler(w, r)
		if err == nil {
			return
		}

		// errors.As also finds an *APIError wrapped with %w, which a plain
		// type assertion would miss. The nil check guards against a
		// typed-nil *APIError being dereferenced below.
		var apiErr *APIError
		if errors.As(err, &apiErr) && apiErr != nil {
			writeErrorResponse(w, apiErr.StatusCode, apiErr.Code, apiErr.ErrorMessage, apiErr.Details)
			return
		}

		// Generic error
		log.Error().Err(err).
			Str("path", r.URL.Path).
			Str("method", r.Method).
			Msg("Handler error")
		writeErrorResponse(w, http.StatusInternalServerError, "internal_error",
			"An error occurred processing the request", nil)
	}
}
// writeErrorResponse writes a consistent JSON error response.
//
// It sets Content-Type to application/json, sends statusCode, and encodes an
// APIError built from the arguments with the current Unix time as timestamp.
// If the response headers already carry an X-Request-ID value, it is copied
// into the body's request_id field so clients can correlate the error.
//
// A statusCode outside 100-999 is replaced with 500, because net/http's
// WriteHeader panics on such codes. Encoding failures are logged, not returned.
func writeErrorResponse(w http.ResponseWriter, statusCode int, code, message string, details map[string]string) {
	if statusCode < 100 || statusCode > 999 {
		statusCode = http.StatusInternalServerError
	}

	header := w.Header()
	header.Set("Content-Type", "application/json")
	// Empty when no X-Request-ID was set; omitempty then drops the field.
	requestID := header.Get("X-Request-ID")

	w.WriteHeader(statusCode)

	resp := APIError{
		ErrorMessage: message,
		Code:         code,
		StatusCode:   statusCode,
		Timestamp:    time.Now().Unix(),
		RequestID:    requestID,
		Details:      details,
	}

	if err := json.NewEncoder(w).Encode(resp); err != nil {
		log.Error().Err(err).Msg("Failed to encode error response")
	}
}
// responseWriter wraps http.ResponseWriter to capture status codes.
//
// Besides recording the status, it forwards the optional Hijacker and Flusher
// capabilities of the wrapped writer and exposes it through Unwrap, so
// wrapping does not hide streaming or connection-takeover support.
type responseWriter struct {
	http.ResponseWriter
	statusCode int  // last recorded final status; ErrorHandler initialises it to 200
	written    bool // true once a final (non-informational) header has been sent
}

// WriteHeader forwards code to the wrapped writer and records it as the
// response status. Only the first final status is forwarded; later calls are
// ignored.
//
// Informational 1xx codes (other than 101) are forwarded without being
// recorded, because net/http allows any number of them before the final
// status. Treating one as final would swallow the real status that follows.
func (rw *responseWriter) WriteHeader(code int) {
	if rw.written {
		return
	}
	if code >= 100 && code < 200 && code != http.StatusSwitchingProtocols {
		rw.ResponseWriter.WriteHeader(code)
		return
	}
	rw.statusCode = code
	rw.ResponseWriter.WriteHeader(code)
	rw.written = true
}

// Write sends b to the wrapped writer, first recording an implicit 200 if no
// final header has been written yet.
func (rw *responseWriter) Write(b []byte) (int, error) {
	if !rw.written {
		rw.WriteHeader(http.StatusOK)
	}
	return rw.ResponseWriter.Write(b)
}

// StatusCode returns the recorded status. A nil receiver reports 500.
func (rw *responseWriter) StatusCode() int {
	if rw == nil {
		return http.StatusInternalServerError
	}
	return rw.statusCode
}

// Hijack implements http.Hijacker interface by delegating to the wrapped
// writer. It returns an error when the wrapped writer cannot be hijacked.
func (rw *responseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
	hijacker, ok := rw.ResponseWriter.(http.Hijacker)
	if !ok {
		return nil, nil, fmt.Errorf("ResponseWriter does not implement http.Hijacker")
	}
	return hijacker.Hijack()
}

// Flush implements http.Flusher by delegating to the wrapped writer, so
// streaming handlers (e.g. Server-Sent Events) keep working behind the
// wrapper. It is a no-op when the wrapped writer cannot flush.
func (rw *responseWriter) Flush() {
	flusher, ok := rw.ResponseWriter.(http.Flusher)
	if !ok {
		return
	}
	// Flushing sends the headers; record the implicit 200 like Write does.
	if !rw.written {
		rw.WriteHeader(http.StatusOK)
	}
	flusher.Flush()
}

// Unwrap returns the wrapped writer so http.ResponseController can reach
// optional interfaces that this wrapper does not forward itself.
func (rw *responseWriter) Unwrap() http.ResponseWriter {
	return rw.ResponseWriter
}
// NewAPIError builds an *APIError from the given HTTP status, machine-readable
// code and human-readable message, stamps it with the current Unix time, and
// returns it as an error.
func NewAPIError(statusCode int, code, message string) error {
	apiErr := new(APIError)
	apiErr.StatusCode = statusCode
	apiErr.Code = code
	apiErr.ErrorMessage = message
	apiErr.Timestamp = time.Now().Unix()
	return apiErr
}
// ValidationError returns a 400 "validation_error" *APIError whose Details
// carry the supplied per-field messages. The fields map is stored as given,
// not copied.
func ValidationError(fields map[string]string) error {
	var verr APIError
	verr.StatusCode = http.StatusBadRequest
	verr.Code = "validation_error"
	verr.ErrorMessage = "Validation failed"
	verr.Details = fields
	verr.Timestamp = time.Now().Unix()
	return &verr
}