mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-02-18 00:17:39 +01:00
Upgrade diagnostics infrastructure from 5/10 to 8/10 production readiness with enhanced metrics, logging, and request correlation capabilities.

**Request Correlation**
- Wire request IDs through context in middleware
- Return X-Request-ID header in all API responses
- Enable downstream log correlation across request lifecycle

**HTTP/API Metrics** (18 new Prometheus metrics)
- pulse_http_request_duration_seconds - API latency histogram
- pulse_http_requests_total - request counter by method/route/status
- pulse_http_request_errors_total - error counter by type
- Path normalization to control label cardinality

**Per-Node Poll Metrics**
- pulse_monitor_node_poll_duration_seconds - per-node timing
- pulse_monitor_node_poll_total - success/error counts per node
- pulse_monitor_node_poll_errors_total - error breakdown per node
- pulse_monitor_node_poll_last_success_timestamp - freshness tracking
- pulse_monitor_node_poll_staleness_seconds - age since last success
- Enables multi-node hotspot identification

**Scheduler Health Metrics**
- pulse_scheduler_queue_due_soon - ready queue depth
- pulse_scheduler_queue_depth - by instance type
- pulse_scheduler_queue_wait_seconds - time in queue histogram
- pulse_scheduler_dead_letter_depth - failed task tracking
- pulse_scheduler_breaker_state - circuit breaker state
- pulse_scheduler_breaker_failure_count - consecutive failures
- pulse_scheduler_breaker_retry_seconds - time until retry
- Enable alerting on DLQ spikes, breaker opens, queue backlogs

**Diagnostics Endpoint Caching**
- pulse_diagnostics_cache_hits_total - cache performance
- pulse_diagnostics_cache_misses_total - cache misses
- pulse_diagnostics_refresh_duration_seconds - probe timing
- 45-second TTL prevents thundering herd on /api/diagnostics
- Thread-safe with RWMutex
- X-Diagnostics-Cached-At header shows cache freshness

**Debug Log Performance**
- Gate high-frequency debug logs behind IsLevelEnabled() checks
- Reduces CPU waste in production when debug is disabled
- Covers scheduler loops, poll cycles, API handlers

**Persistent Logging**
- File logging with automatic rotation
- LOG_FILE, LOG_MAX_SIZE, LOG_MAX_AGE, LOG_COMPRESS env vars
- MultiWriter sends logs to both stderr and file
- Gzip compression support for rotated logs

Files modified:
- internal/api/diagnostics.go (caching layer)
- internal/api/middleware.go (request IDs, HTTP metrics)
- internal/api/http_metrics.go (NEW - HTTP metric definitions)
- internal/logging/logging.go (file logging with rotation)
- internal/monitoring/metrics.go (node + scheduler metrics)
- internal/monitoring/monitor.go (instrumentation, debug gating)

Impact: Dramatically improved production troubleshooting with per-node visibility, scheduler health metrics, persistent logs, and cached diagnostics. Fast incident response now possible for multi-node deployments.
211 lines
5.7 KiB
Go
211 lines
5.7 KiB
Go
package api
|
|
|
|
import (
|
|
"bufio"
|
|
"encoding/json"
|
|
"fmt"
|
|
"net"
|
|
"net/http"
|
|
"runtime/debug"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/logging"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// APIError is the JSON body sent to clients when a request fails. It also
// satisfies the error interface, so handlers can return it as an ordinary
// error value.
type APIError struct {
	// ErrorMessage is the human-readable description, serialized as "error".
	ErrorMessage string `json:"error"`
	// Code is a machine-readable identifier; omitted from JSON when empty.
	Code string `json:"code,omitempty"`
	// StatusCode is the HTTP status code associated with the error.
	StatusCode int `json:"status_code"`
	// Timestamp is serialized as "timestamp"; the constructors in this file
	// fill it with Unix seconds.
	Timestamp int64 `json:"timestamp"`
	// RequestID identifies the failing request; omitted from JSON when empty.
	RequestID string `json:"request_id,omitempty"`
	// Details carries optional key/value context, such as per-field
	// validation messages; omitted from JSON when empty.
	Details map[string]string `json:"details,omitempty"`
}

// Error returns the error message, which makes *APIError usable as an error.
func (e *APIError) Error() string {
	return e.ErrorMessage
}
|
|
|
|
// ErrorHandler is a middleware that handles panics and errors
|
|
func ErrorHandler(next http.Handler) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
// Fix for issue #334: Normalize empty path to "/" before ServeMux processes it
|
|
// This prevents the automatic redirect from "" to "./"
|
|
if r.URL.Path == "" {
|
|
r.URL.Path = "/"
|
|
}
|
|
|
|
// Skip error handling for WebSocket endpoints
|
|
if r.Header.Get("Upgrade") == "websocket" {
|
|
next.ServeHTTP(w, r)
|
|
return
|
|
}
|
|
|
|
// Add request ID to context, honoring any incoming header value.
|
|
incomingID := strings.TrimSpace(r.Header.Get("X-Request-ID"))
|
|
ctxWithID, requestID := logging.WithRequestID(r.Context(), incomingID)
|
|
r = r.WithContext(ctxWithID)
|
|
|
|
// Create a custom response writer to capture status codes
|
|
rw := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}
|
|
rw.Header().Set("X-Request-ID", requestID)
|
|
|
|
start := time.Now()
|
|
routeLabel := normalizeRoute(r.URL.Path)
|
|
method := r.Method
|
|
|
|
defer func() {
|
|
elapsed := time.Since(start)
|
|
recordAPIRequest(method, routeLabel, rw.StatusCode(), elapsed)
|
|
}()
|
|
|
|
// Recover from panics
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
log.Error().
|
|
Interface("error", err).
|
|
Str("path", r.URL.Path).
|
|
Str("method", r.Method).
|
|
Str("request_id", requestID).
|
|
Bytes("stack", debug.Stack()).
|
|
Msg("Panic recovered in API handler")
|
|
|
|
writeErrorResponse(rw, http.StatusInternalServerError, "internal_error",
|
|
"An unexpected error occurred", nil)
|
|
}
|
|
}()
|
|
|
|
// Call the next handler
|
|
next.ServeHTTP(rw, r)
|
|
|
|
// Log errors (4xx and 5xx)
|
|
if rw.statusCode >= 400 {
|
|
log.Warn().
|
|
Str("path", r.URL.Path).
|
|
Str("method", r.Method).
|
|
Int("status", rw.statusCode).
|
|
Str("request_id", requestID).
|
|
Msg("Request failed")
|
|
}
|
|
})
|
|
}
|
|
|
|
// TimeoutHandler returns middleware that limits how long a handler may run.
// Requests exceeding timeout are answered by http.TimeoutHandler with the body
// "Request timeout".
//
// WebSocket upgrades and Server-Sent Events requests bypass the timeout and
// receive the original ResponseWriter. They are detected as follows:
//   - WebSocket: the Upgrade header equals "websocket", compared
//     case-insensitively.
//   - SSE: any Accept header value mentions "text/event-stream", so lists such
//     as "text/event-stream, */*" are recognised too.
func TimeoutHandler(timeout time.Duration) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		// The timeout wrapper keeps no per-request state, so build it once
		// instead of on every request.
		timed := http.TimeoutHandler(next, timeout, "Request timeout")

		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			isWebSocket := strings.EqualFold(r.Header.Get("Upgrade"), "websocket")
			accept := strings.ToLower(strings.Join(r.Header.Values("Accept"), ","))
			isSSE := strings.Contains(accept, "text/event-stream")

			// Skip timeout for WebSocket and SSE endpoints.
			if isWebSocket || isSSE {
				next.ServeHTTP(w, r)
				return
			}

			timed.ServeHTTP(w, r)
		})
	}
}
|
|
|
|
// JSONHandler ensures proper JSON responses and error handling
|
|
func JSONHandler(handler func(w http.ResponseWriter, r *http.Request) error) http.HandlerFunc {
|
|
return func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
|
|
if err := handler(w, r); err != nil {
|
|
// Check if it's already an APIError
|
|
if apiErr, ok := err.(*APIError); ok {
|
|
writeErrorResponse(w, apiErr.StatusCode, apiErr.Code, apiErr.ErrorMessage, apiErr.Details)
|
|
return
|
|
}
|
|
|
|
// Generic error
|
|
log.Error().Err(err).
|
|
Str("path", r.URL.Path).
|
|
Str("method", r.Method).
|
|
Msg("Handler error")
|
|
|
|
writeErrorResponse(w, http.StatusInternalServerError, "internal_error",
|
|
"An error occurred processing the request", nil)
|
|
}
|
|
}
|
|
}
|
|
|
|
// writeErrorResponse writes a consistent error response
|
|
func writeErrorResponse(w http.ResponseWriter, statusCode int, code, message string, details map[string]string) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(statusCode)
|
|
|
|
resp := APIError{
|
|
ErrorMessage: message,
|
|
Code: code,
|
|
StatusCode: statusCode,
|
|
Timestamp: time.Now().Unix(),
|
|
Details: details,
|
|
}
|
|
|
|
if err := json.NewEncoder(w).Encode(resp); err != nil {
|
|
log.Error().Err(err).Msg("Failed to encode error response")
|
|
}
|
|
}
|
|
|
|
// responseWriter decorates an http.ResponseWriter so that middleware can see
// which status code a handler produced.
type responseWriter struct {
	// ResponseWriter is the wrapped writer; methods not redefined on
	// responseWriter are promoted from it.
	http.ResponseWriter
	// statusCode holds the status most recently recorded for the response.
	statusCode int
	// written reports whether the header has already been forwarded to the
	// wrapped writer.
	written bool
}
|
|
|
|
func (rw *responseWriter) WriteHeader(code int) {
|
|
if !rw.written {
|
|
rw.statusCode = code
|
|
rw.ResponseWriter.WriteHeader(code)
|
|
rw.written = true
|
|
}
|
|
}
|
|
|
|
func (rw *responseWriter) Write(b []byte) (int, error) {
|
|
if !rw.written {
|
|
rw.WriteHeader(http.StatusOK)
|
|
}
|
|
return rw.ResponseWriter.Write(b)
|
|
}
|
|
|
|
func (rw *responseWriter) StatusCode() int {
|
|
if rw == nil {
|
|
return http.StatusInternalServerError
|
|
}
|
|
return rw.statusCode
|
|
}
|
|
|
|
// Hijack implements http.Hijacker interface
|
|
func (rw *responseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
|
|
hijacker, ok := rw.ResponseWriter.(http.Hijacker)
|
|
if !ok {
|
|
return nil, nil, fmt.Errorf("ResponseWriter does not implement http.Hijacker")
|
|
}
|
|
return hijacker.Hijack()
|
|
}
|
|
|
|
// NewAPIError creates a new API error
|
|
func NewAPIError(statusCode int, code, message string) error {
|
|
return &APIError{
|
|
ErrorMessage: message,
|
|
Code: code,
|
|
StatusCode: statusCode,
|
|
Timestamp: time.Now().Unix(),
|
|
}
|
|
}
|
|
|
|
// ValidationError creates a validation error with field details
|
|
func ValidationError(fields map[string]string) error {
|
|
return &APIError{
|
|
ErrorMessage: "Validation failed",
|
|
Code: "validation_error",
|
|
StatusCode: http.StatusBadRequest,
|
|
Timestamp: time.Now().Unix(),
|
|
Details: fields,
|
|
}
|
|
}
|