Files
Pulse/internal/api/http_metrics.go
rcourtman 495e6c7945 feat: comprehensive diagnostics and observability improvements
Upgrade diagnostics infrastructure from 5/10 to 8/10 production readiness
with enhanced metrics, logging, and request correlation capabilities.

**Request Correlation**
- Wire request IDs through context in middleware
- Return X-Request-ID header in all API responses
- Enable downstream log correlation across request lifecycle

**HTTP/API Metrics** (18 new Prometheus metrics)
- pulse_http_request_duration_seconds - API latency histogram
- pulse_http_requests_total - request counter by method/route/status
- pulse_http_request_errors_total - error counter by type
- Path normalization to control label cardinality

**Per-Node Poll Metrics**
- pulse_monitor_node_poll_duration_seconds - per-node timing
- pulse_monitor_node_poll_total - success/error counts per node
- pulse_monitor_node_poll_errors_total - error breakdown per node
- pulse_monitor_node_poll_last_success_timestamp - freshness tracking
- pulse_monitor_node_poll_staleness_seconds - age since last success
- Enables multi-node hotspot identification

**Scheduler Health Metrics**
- pulse_scheduler_queue_due_soon - ready queue depth
- pulse_scheduler_queue_depth - by instance type
- pulse_scheduler_queue_wait_seconds - time in queue histogram
- pulse_scheduler_dead_letter_depth - failed task tracking
- pulse_scheduler_breaker_state - circuit breaker state
- pulse_scheduler_breaker_failure_count - consecutive failures
- pulse_scheduler_breaker_retry_seconds - time until retry
- Enables alerting on DLQ spikes, breaker opens, and queue backlogs

**Diagnostics Endpoint Caching**
- pulse_diagnostics_cache_hits_total - cache performance
- pulse_diagnostics_cache_misses_total - cache misses
- pulse_diagnostics_refresh_duration_seconds - probe timing
- 45-second TTL prevents thundering herd on /api/diagnostics
- Thread-safe with RWMutex
- X-Diagnostics-Cached-At header shows cache freshness

**Debug Log Performance**
- Gate high-frequency debug logs behind IsLevelEnabled() checks
- Reduces CPU waste in production when debug disabled
- Covers scheduler loops, poll cycles, API handlers

**Persistent Logging**
- File logging with automatic rotation
- LOG_FILE, LOG_MAX_SIZE, LOG_MAX_AGE, LOG_COMPRESS env vars
- MultiWriter sends logs to both stderr and file
- Gzip compression support for rotated logs

Files modified:
- internal/api/diagnostics.go (caching layer)
- internal/api/middleware.go (request IDs, HTTP metrics)
- internal/api/http_metrics.go (NEW - HTTP metric definitions)
- internal/logging/logging.go (file logging with rotation)
- internal/monitoring/metrics.go (node + scheduler metrics)
- internal/monitoring/monitor.go (instrumentation, debug gating)

Impact: Dramatically improved production troubleshooting with per-node
visibility, scheduler health metrics, persistent logs, and cached
diagnostics. Fast incident response now possible for multi-node deployments.
2025-10-21 12:37:39 +00:00

153 lines
3.0 KiB
Go

package api
import (
"strconv"
"strings"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
)
var (
	// httpMetricsOnce guards lazy, one-time construction and registration of
	// the collectors below (performed on the first recordAPIRequest call).
	httpMetricsOnce sync.Once

	// apiRequestDuration is a histogram of API request latency in seconds,
	// labeled by method, normalized route, and numeric status code.
	apiRequestDuration *prometheus.HistogramVec
	// apiRequestTotal counts every API request by method, route, and status.
	apiRequestTotal *prometheus.CounterVec
	// apiRequestErrors counts 4xx/5xx responses by method, route, and coarse
	// status class ("client_error" / "server_error"); see classifyStatus.
	apiRequestErrors *prometheus.CounterVec
)
// initHTTPMetrics constructs the API-layer Prometheus collectors and
// registers them with the default registry. It must run at most once;
// recordAPIRequest enforces that via httpMetricsOnce.
func initHTTPMetrics() {
	// Duration and total share the same label set; errors use a coarse
	// status class instead of the exact code to keep cardinality low.
	requestLabels := []string{"method", "route", "status"}

	apiRequestDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "pulse",
			Subsystem: "http",
			Name:      "request_duration_seconds",
			Help:      "HTTP request duration observed at the API layer.",
			Buckets:   []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
		},
		requestLabels,
	)

	apiRequestTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "pulse",
			Subsystem: "http",
			Name:      "requests_total",
			Help:      "Total number of HTTP requests handled by the API.",
		},
		requestLabels,
	)

	apiRequestErrors = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "pulse",
			Subsystem: "http",
			Name:      "request_errors_total",
			Help:      "Total number of HTTP errors surfaced to clients.",
		},
		[]string{"method", "route", "status_class"},
	)

	prometheus.MustRegister(apiRequestDuration, apiRequestTotal, apiRequestErrors)
}
// recordAPIRequest folds one completed HTTP request into the API metrics.
// Collector registration happens lazily on the first call, so importing this
// package has no registry side effects until the API actually serves traffic.
func recordAPIRequest(method, route string, status int, elapsed time.Duration) {
	httpMetricsOnce.Do(initHTTPMetrics)

	code := strconv.Itoa(status)
	apiRequestDuration.WithLabelValues(method, route, code).Observe(elapsed.Seconds())
	apiRequestTotal.WithLabelValues(method, route, code).Inc()

	// Only 4xx/5xx responses are surfaced as client-visible errors.
	if status < 400 {
		return
	}
	apiRequestErrors.WithLabelValues(method, route, classifyStatus(status)).Inc()
}
// classifyStatus maps an HTTP status code to the coarse error class used as
// the status_class label: "server_error" for 5xx, "client_error" for 4xx,
// and "none" for anything below 400.
func classifyStatus(status int) string {
	if status >= 500 {
		return "server_error"
	}
	if status >= 400 {
		return "client_error"
	}
	return "none"
}
// normalizeRoute reduces a request path to a low-cardinality metric label:
// the query string is dropped, each segment is normalized (IDs/UUIDs/tokens
// become placeholders), and at most five segments are kept.
func normalizeRoute(path string) string {
	if path == "" || path == "/" {
		return "/"
	}

	// Only the path portion matters for the route label.
	if before, _, found := strings.Cut(path, "?"); found {
		path = before
	}

	const maxSegments = 5
	var parts []string
	for _, raw := range strings.Split(path, "/") {
		if raw == "" {
			continue // skip empty segments from leading/duplicate slashes
		}
		parts = append(parts, normalizeSegment(raw))
		if len(parts) == maxSegments {
			break
		}
	}

	if len(parts) == 0 {
		return "/"
	}
	return "/" + strings.Join(parts, "/")
}
// normalizeSegment replaces high-cardinality path segments with stable
// placeholders: all-digit segments become ":id", UUID-shaped segments become
// ":uuid", anything longer than 32 bytes becomes ":token"; everything else
// passes through unchanged.
func normalizeSegment(seg string) string {
	switch {
	case isNumeric(seg):
		return ":id"
	case looksLikeUUID(seg):
		return ":uuid"
	case len(seg) > 32:
		return ":token"
	default:
		return seg
	}
}
// isNumeric reports whether s is non-empty and consists solely of ASCII
// digits '0'-'9'. (Byte-wise scanning is equivalent to rune-wise here: every
// byte of a multi-byte UTF-8 sequence is >= 0x80 and fails the digit test.)
func isNumeric(s string) bool {
	if len(s) == 0 {
		return false
	}
	for i := 0; i < len(s); i++ {
		if s[i] < '0' || s[i] > '9' {
			return false
		}
	}
	return true
}
// looksLikeUUID reports whether s matches the canonical textual UUID form:
// 36 characters with hyphens at positions 8, 13, 18, and 23 and hexadecimal
// digits (either case) everywhere else, e.g.
// "123e4567-e89b-12d3-a456-426614174000".
func looksLikeUUID(s string) bool {
	if len(s) != 36 {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		if i == 8 || i == 13 || i == 18 || i == 23 {
			// Hyphens are mandatory at the canonical separator positions.
			// The previous check only rejected hyphens elsewhere, so any 36
			// plain hex characters were misclassified as a UUID.
			if c != '-' {
				return false
			}
			continue
		}
		if !isHexDigit(c) {
			return false
		}
	}
	return true
}

// isHexDigit reports whether c is an ASCII hexadecimal digit.
func isHexDigit(c byte) bool {
	return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
}