mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-02-18 00:17:39 +01:00
Upgrade diagnostics infrastructure from 5/10 to 8/10 production readiness with enhanced metrics, logging, and request correlation capabilities. **Request Correlation** - Wire request IDs through context in middleware - Return X-Request-ID header in all API responses - Enable downstream log correlation across request lifecycle **HTTP/API Metrics** (18 new Prometheus metrics) - pulse_http_request_duration_seconds - API latency histogram - pulse_http_requests_total - request counter by method/route/status - pulse_http_request_errors_total - error counter by type - Path normalization to control label cardinality **Per-Node Poll Metrics** - pulse_monitor_node_poll_duration_seconds - per-node timing - pulse_monitor_node_poll_total - success/error counts per node - pulse_monitor_node_poll_errors_total - error breakdown per node - pulse_monitor_node_poll_last_success_timestamp - freshness tracking - pulse_monitor_node_poll_staleness_seconds - age since last success - Enables multi-node hotspot identification **Scheduler Health Metrics** - pulse_scheduler_queue_due_soon - ready queue depth - pulse_scheduler_queue_depth - by instance type - pulse_scheduler_queue_wait_seconds - time in queue histogram - pulse_scheduler_dead_letter_depth - failed task tracking - pulse_scheduler_breaker_state - circuit breaker state - pulse_scheduler_breaker_failure_count - consecutive failures - pulse_scheduler_breaker_retry_seconds - time until retry - Enable alerting on DLQ spikes, breaker opens, queue backlogs **Diagnostics Endpoint Caching** - pulse_diagnostics_cache_hits_total - cache performance - pulse_diagnostics_cache_misses_total - cache misses - pulse_diagnostics_refresh_duration_seconds - probe timing - 45-second TTL prevents thundering herd on /api/diagnostics - Thread-safe with RWMutex - X-Diagnostics-Cached-At header shows cache freshness **Debug Log Performance** - Gate high-frequency debug logs behind IsLevelEnabled() checks - Reduces CPU waste in production when debug 
disabled - Covers scheduler loops, poll cycles, API handlers **Persistent Logging** - File logging with automatic rotation - LOG_FILE, LOG_MAX_SIZE, LOG_MAX_AGE, LOG_COMPRESS env vars - MultiWriter sends logs to both stderr and file - Gzip compression support for rotated logs Files modified: - internal/api/diagnostics.go (caching layer) - internal/api/middleware.go (request IDs, HTTP metrics) - internal/api/http_metrics.go (NEW - HTTP metric definitions) - internal/logging/logging.go (file logging with rotation) - internal/monitoring/metrics.go (node + scheduler metrics) - internal/monitoring/monitor.go (instrumentation, debug gating) Impact: Dramatically improved production troubleshooting with per-node visibility, scheduler health metrics, persistent logs, and cached diagnostics. Fast incident response now possible for multi-node deployments.
153 lines
3.0 KiB
Go
153 lines
3.0 KiB
Go
package api
|
|
|
|
import (
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
)
|
|
|
|
var (
|
|
httpMetricsOnce sync.Once
|
|
|
|
apiRequestDuration *prometheus.HistogramVec
|
|
apiRequestTotal *prometheus.CounterVec
|
|
apiRequestErrors *prometheus.CounterVec
|
|
)
|
|
|
|
func initHTTPMetrics() {
|
|
apiRequestDuration = prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "http",
|
|
Name: "request_duration_seconds",
|
|
Help: "HTTP request duration observed at the API layer.",
|
|
Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
|
|
},
|
|
[]string{"method", "route", "status"},
|
|
)
|
|
|
|
apiRequestTotal = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "http",
|
|
Name: "requests_total",
|
|
Help: "Total number of HTTP requests handled by the API.",
|
|
},
|
|
[]string{"method", "route", "status"},
|
|
)
|
|
|
|
apiRequestErrors = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "http",
|
|
Name: "request_errors_total",
|
|
Help: "Total number of HTTP errors surfaced to clients.",
|
|
},
|
|
[]string{"method", "route", "status_class"},
|
|
)
|
|
|
|
prometheus.MustRegister(apiRequestDuration, apiRequestTotal, apiRequestErrors)
|
|
}
|
|
|
|
func recordAPIRequest(method, route string, status int, elapsed time.Duration) {
|
|
httpMetricsOnce.Do(initHTTPMetrics)
|
|
|
|
statusCode := strconv.Itoa(status)
|
|
|
|
apiRequestDuration.WithLabelValues(method, route, statusCode).Observe(elapsed.Seconds())
|
|
apiRequestTotal.WithLabelValues(method, route, statusCode).Inc()
|
|
|
|
if status >= 400 {
|
|
apiRequestErrors.WithLabelValues(method, route, classifyStatus(status)).Inc()
|
|
}
|
|
}
|
|
|
|
// classifyStatus maps an HTTP status code to a coarse error class:
// "server_error" for 500 and above, "client_error" for 400-499, and "none"
// for everything below 400.
func classifyStatus(status int) string {
	if status < 400 {
		return "none"
	}
	if status < 500 {
		return "client_error"
	}
	return "server_error"
}
|
|
|
|
func normalizeRoute(path string) string {
|
|
if path == "" || path == "/" {
|
|
return "/"
|
|
}
|
|
|
|
// Strip query parameters.
|
|
if idx := strings.Index(path, "?"); idx >= 0 {
|
|
path = path[:idx]
|
|
}
|
|
|
|
segments := strings.Split(path, "/")
|
|
normSegments := make([]string, 0, len(segments))
|
|
count := 0
|
|
for _, seg := range segments {
|
|
if seg == "" {
|
|
continue
|
|
}
|
|
count++
|
|
if count > 5 {
|
|
break
|
|
}
|
|
normSegments = append(normSegments, normalizeSegment(seg))
|
|
}
|
|
|
|
if len(normSegments) == 0 {
|
|
return "/"
|
|
}
|
|
|
|
return "/" + strings.Join(normSegments, "/")
|
|
}
|
|
|
|
func normalizeSegment(seg string) string {
|
|
if isNumeric(seg) {
|
|
return ":id"
|
|
}
|
|
if looksLikeUUID(seg) {
|
|
return ":uuid"
|
|
}
|
|
if len(seg) > 32 {
|
|
return ":token"
|
|
}
|
|
return seg
|
|
}
|
|
|
|
// isNumeric reports whether s is non-empty and consists solely of the ASCII
// digits '0' through '9'. Signs, decimal points and non-ASCII digits are
// rejected.
func isNumeric(s string) bool {
	for i := 0; i < len(s); i++ {
		// Every byte of a multi-byte UTF-8 sequence is >= 0x80, so a bytewise
		// scan rejects non-ASCII input just like a rune scan would.
		if c := s[i]; c < '0' || c > '9' {
			return false
		}
	}
	return len(s) > 0
}
|
|
|
|
// looksLikeUUID reports whether s has the canonical textual UUID shape
// 8-4-4-4-12: exactly 36 bytes, with '-' at byte offsets 8, 13, 18 and 23 and
// hexadecimal digits (either case) everywhere else.
//
// The hyphens are required at those offsets. Checking only that hyphens never
// appear elsewhere would let any 36-character hex string with no separators
// pass as a UUID.
func looksLikeUUID(s string) bool {
	if len(s) != 36 {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch i {
		case 8, 13, 18, 23:
			if c != '-' {
				return false
			}
		default:
			isHex := (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
			if !isHex {
				return false
			}
		}
	}
	return true
}
|