Files
Pulse/internal/tempproxy/client.go
rcourtman 1e25fa572a security: add resilience and error handling to tempproxy client
Implements comprehensive client-side improvements for production reliability:

1. Context Support with Deadlines:
   - Added callWithContext() for context-aware RPC calls
   - Respects context deadlines and cancellation
   - Prevents goroutine pileup under network issues

2. Exponential Backoff with Jitter:
   - Automatic retry with exponential backoff (100ms → 10s)
   - ±10% jitter to prevent thundering herd
   - Max 3 retries for transient failures
   - Smart retry decision based on error classification

3. Error Classification:
   - ProxyError type with classification (Transport, Auth, SSH, Sensor, Timeout)
   - Retryable vs non-retryable error identification
   - Better error messages for debugging
   - Structured error handling throughout

4. Improved Connection Handling:
   - DialContext for cancellable connections
   - Proper deadline propagation
   - Clean separation of single-attempt vs retry logic
   - Legacy call() method preserved for backwards compatibility

Security Notes:
- SSH fallback already blocked in containers (temperature.go:69-77)
- Per-client token auth not needed after method-level authz (commit d55112ac4)
- ID-mapped root blocked from privileged methods

Performance:
- No retry on non-retryable errors (auth, sensor failures)
- Context cancellation short-circuits retry loops
- Jitter prevents synchronized retry storms

Addresses Codex findings #4 and #5 from security audit.
2025-10-19 16:37:11 +00:00

437 lines
10 KiB
Go

package tempproxy
import (
"context"
"encoding/json"
"fmt"
"math"
"math/rand"
"net"
"os"
"time"
)
const (
defaultSocketPath = "/run/pulse-sensor-proxy/pulse-sensor-proxy.sock"
containerSocketPath = "/mnt/pulse-proxy/pulse-sensor-proxy.sock"
defaultTimeout = 30 * time.Second // Increased to accommodate SSH operations
// Exponential backoff constants
initialBackoff = 100 * time.Millisecond
maxBackoff = 10 * time.Second
backoffFactor = 2.0
jitterFraction = 0.1
maxRetries = 3
)
// ErrorType classifies proxy errors for better error handling
type ErrorType int
const (
ErrorTypeUnknown ErrorType = iota
ErrorTypeTransport // Socket connection/communication failures
ErrorTypeAuth // Authorization failures
ErrorTypeSSH // SSH connectivity issues
ErrorTypeSensor // Sensor command failures
ErrorTypeTimeout // Operation timeout
)
// ProxyError wraps errors with classification
type ProxyError struct {
Type ErrorType
Message string
Retryable bool
Wrapped error
}
func (e *ProxyError) Error() string {
if e.Wrapped != nil {
return fmt.Sprintf("%s: %v", e.Message, e.Wrapped)
}
return e.Message
}
func (e *ProxyError) Unwrap() error {
return e.Wrapped
}
// Client communicates with pulse-sensor-proxy via unix socket
type Client struct {
socketPath string
timeout time.Duration
}
// NewClient creates a new proxy client
func NewClient() *Client {
socketPath := os.Getenv("PULSE_SENSOR_PROXY_SOCKET")
if socketPath == "" {
if _, err := os.Stat(defaultSocketPath); err == nil {
socketPath = defaultSocketPath
} else if _, err := os.Stat(containerSocketPath); err == nil {
socketPath = containerSocketPath
} else {
socketPath = defaultSocketPath
}
}
return &Client{
socketPath: socketPath,
timeout: defaultTimeout,
}
}
// IsAvailable checks if the proxy is running and accessible
func (c *Client) IsAvailable() bool {
_, err := os.Stat(c.socketPath)
return err == nil
}
// RPCRequest represents a request to the proxy
type RPCRequest struct {
Method string `json:"method"`
Params map[string]interface{} `json:"params"`
}
// RPCResponse represents a response from the proxy
type RPCResponse struct {
Success bool `json:"success"`
Data map[string]interface{} `json:"data,omitempty"`
Error string `json:"error,omitempty"`
}
// calculateBackoff calculates exponential backoff with jitter
func calculateBackoff(attempt int) time.Duration {
if attempt <= 0 {
return initialBackoff
}
// Calculate base backoff: initialBackoff * (backoffFactor ^ attempt)
backoff := float64(initialBackoff) * math.Pow(backoffFactor, float64(attempt))
// Cap at maxBackoff
if backoff > float64(maxBackoff) {
backoff = float64(maxBackoff)
}
// Add jitter: ±10% randomization
jitter := backoff * jitterFraction * (rand.Float64()*2 - 1)
backoff += jitter
return time.Duration(backoff)
}
// classifyError categorizes errors for retry logic
func classifyError(err error, respError string) *ProxyError {
if err == nil {
return nil
}
// Check for timeout
if netErr, ok := err.(net.Error); ok && netErr.Timeout() {
return &ProxyError{
Type: ErrorTypeTimeout,
Message: "operation timed out",
Retryable: true,
Wrapped: err,
}
}
// Check for connection errors (socket unavailable)
if _, ok := err.(*net.OpError); ok {
return &ProxyError{
Type: ErrorTypeTransport,
Message: "failed to connect to proxy socket",
Retryable: true,
Wrapped: err,
}
}
// Check response error messages
if respError != "" {
if respError == "unauthorized" || respError == "method requires host-level privileges" {
return &ProxyError{
Type: ErrorTypeAuth,
Message: respError,
Retryable: false,
Wrapped: fmt.Errorf("%s", respError),
}
}
// SSH-related errors
if contains(respError, "ssh", "connection", "timeout") {
return &ProxyError{
Type: ErrorTypeSSH,
Message: "SSH connectivity issue",
Retryable: true,
Wrapped: fmt.Errorf("%s", respError),
}
}
// Sensor errors
if contains(respError, "sensor", "temperature") {
return &ProxyError{
Type: ErrorTypeSensor,
Message: "sensor command failed",
Retryable: false,
Wrapped: fmt.Errorf("%s", respError),
}
}
}
// Unknown error
return &ProxyError{
Type: ErrorTypeUnknown,
Message: "unknown proxy error",
Retryable: false,
Wrapped: err,
}
}
// contains checks if any of the substrings are in the main string (case-insensitive)
func contains(s string, substrs ...string) bool {
s = fmt.Sprintf("%v", s)
for _, substr := range substrs {
if len(s) >= len(substr) {
for i := 0; i <= len(s)-len(substr); i++ {
match := true
for j := 0; j < len(substr); j++ {
c1 := s[i+j]
c2 := substr[j]
if c1 >= 'A' && c1 <= 'Z' {
c1 += 32
}
if c2 >= 'A' && c2 <= 'Z' {
c2 += 32
}
if c1 != c2 {
match = false
break
}
}
if match {
return true
}
}
}
}
return false
}
// callWithContext sends an RPC request with context and retry support
func (c *Client) callWithContext(ctx context.Context, method string, params map[string]interface{}) (*RPCResponse, error) {
var lastErr error
for attempt := 0; attempt <= maxRetries; attempt++ {
// Check if context is already cancelled
select {
case <-ctx.Done():
return nil, &ProxyError{
Type: ErrorTypeTimeout,
Message: "context cancelled before retry",
Retryable: false,
Wrapped: ctx.Err(),
}
default:
}
// Try the call
resp, err := c.callOnce(ctx, method, params)
// Success
if err == nil && resp != nil && resp.Success {
return resp, nil
}
// Classify error
respError := ""
if resp != nil {
respError = resp.Error
}
proxyErr := classifyError(err, respError)
// Don't retry non-retryable errors
if proxyErr != nil && !proxyErr.Retryable {
return resp, proxyErr
}
lastErr = proxyErr
if lastErr == nil {
lastErr = err
}
// Don't sleep after last attempt
if attempt < maxRetries {
backoff := calculateBackoff(attempt)
select {
case <-time.After(backoff):
// Continue to next attempt
case <-ctx.Done():
return nil, &ProxyError{
Type: ErrorTypeTimeout,
Message: "context cancelled during backoff",
Retryable: false,
Wrapped: ctx.Err(),
}
}
}
}
// All retries exhausted
return nil, &ProxyError{
Type: ErrorTypeTransport,
Message: fmt.Sprintf("max retries (%d) exhausted", maxRetries),
Retryable: false,
Wrapped: lastErr,
}
}
// callOnce sends a single RPC request without retries
func (c *Client) callOnce(ctx context.Context, method string, params map[string]interface{}) (*RPCResponse, error) {
// Create a dialer with context
dialer := net.Dialer{
Timeout: c.timeout,
}
// Connect to unix socket with context
conn, err := dialer.DialContext(ctx, "unix", c.socketPath)
if err != nil {
return nil, fmt.Errorf("failed to connect to proxy: %w", err)
}
defer conn.Close()
// Set deadline from context or use default timeout
deadline, ok := ctx.Deadline()
if !ok {
deadline = time.Now().Add(c.timeout)
}
conn.SetDeadline(deadline)
// Send request
req := RPCRequest{
Method: method,
Params: params,
}
encoder := json.NewEncoder(conn)
if err := encoder.Encode(req); err != nil {
return nil, fmt.Errorf("failed to encode request: %w", err)
}
// Read response (server uses newline-delimited framing)
var resp RPCResponse
decoder := json.NewDecoder(conn)
if err := decoder.Decode(&resp); err != nil {
return nil, fmt.Errorf("failed to decode response: %w", err)
}
return &resp, nil
}
// call sends an RPC request and returns the response (legacy method, uses default context)
func (c *Client) call(method string, params map[string]interface{}) (*RPCResponse, error) {
ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
defer cancel()
return c.callWithContext(ctx, method, params)
}
// GetStatus returns proxy status
func (c *Client) GetStatus() (map[string]interface{}, error) {
resp, err := c.call("get_status", nil)
if err != nil {
return nil, err
}
if !resp.Success {
return nil, fmt.Errorf("proxy error: %s", resp.Error)
}
return resp.Data, nil
}
// RegisterNodes returns list of discovered nodes with SSH status
func (c *Client) RegisterNodes() ([]map[string]interface{}, error) {
resp, err := c.call("register_nodes", nil)
if err != nil {
return nil, err
}
if !resp.Success {
return nil, fmt.Errorf("proxy error: %s", resp.Error)
}
// Extract nodes array from data
nodesRaw, ok := resp.Data["nodes"]
if !ok {
return nil, fmt.Errorf("no nodes in response")
}
// Type assertion to []interface{} first, then convert
nodesArray, ok := nodesRaw.([]interface{})
if !ok {
return nil, fmt.Errorf("nodes is not an array")
}
nodes := make([]map[string]interface{}, len(nodesArray))
for i, nodeRaw := range nodesArray {
node, ok := nodeRaw.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("node %d is not a map", i)
}
nodes[i] = node
}
return nodes, nil
}
// GetTemperature fetches temperature data from a specific node
func (c *Client) GetTemperature(nodeHost string) (string, error) {
params := map[string]interface{}{
"node": nodeHost,
}
resp, err := c.call("get_temperature", params)
if err != nil {
return "", err
}
if !resp.Success {
return "", fmt.Errorf("proxy error: %s", resp.Error)
}
// Extract temperature JSON string
tempRaw, ok := resp.Data["temperature"]
if !ok {
return "", fmt.Errorf("no temperature data in response")
}
tempStr, ok := tempRaw.(string)
if !ok {
return "", fmt.Errorf("temperature is not a string")
}
return tempStr, nil
}
// RequestCleanup signals the proxy to trigger host-side cleanup workflow.
func (c *Client) RequestCleanup(host string) error {
params := make(map[string]interface{}, 1)
if host != "" {
params["host"] = host
}
resp, err := c.call("request_cleanup", params)
if err != nil {
return err
}
if !resp.Success {
if resp.Error != "" {
return fmt.Errorf("proxy error: %s", resp.Error)
}
return fmt.Errorf("proxy rejected cleanup request")
}
return nil
}