Pulse/cmd/pulse-sensor-proxy/throttle.go
rcourtman 885a62e96b feat(security): Implement range-based rate limiting
Prevents multi-UID rate-limit bypass attacks from containers. Previously,
attackers could create multiple users in a container (each mapped to a
unique host UID in the 100000-165535 range) to bypass per-UID rate limits.

Implementation:
- Automatic detection of ID-mapped UID ranges from /etc/subuid and /etc/subgid (see the parsing sketch below)
- Rate limits applied per-range for container UIDs
- Rate limits applied per-UID for host UIDs (backwards compatible)
- identifyPeer() checks if BOTH UID AND GID are in mapped ranges
- Metrics show peer='range:100000-165535' or peer='uid:0'

Security benefit: the entire container is limited as a single entity, preventing
100+ UIDs from bypassing rate controls.
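
A minimal, hypothetical sketch of the range detection, for context: it parses /etc/subuid-style "user:start:count" entries into ranges like the ones throttle.go consumes. The parseSubIDFile helper and the local idRange definition are illustrative only; the proxy's real idRange type and loader live outside this file and may differ.

// Sketch only: the proxy's real subuid/subgid loader lives outside throttle.go.
package main

import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
)

// idRange mirrors the fields throttle.go relies on (start, length); the real
// type is defined elsewhere in pulse-sensor-proxy.
type idRange struct {
	start  uint32
	length uint32
}

// parseSubIDFile reads "user:start:count" lines (the /etc/subuid and
// /etc/subgid format) and returns the ID-mapped ranges.
func parseSubIDFile(path string) ([]idRange, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var ranges []idRange
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		fields := strings.Split(strings.TrimSpace(scanner.Text()), ":")
		if len(fields) != 3 {
			continue // skip malformed or empty lines
		}
		start, errStart := strconv.ParseUint(fields[1], 10, 32)
		count, errCount := strconv.ParseUint(fields[2], 10, 32)
		if errStart != nil || errCount != nil || count == 0 {
			continue
		}
		ranges = append(ranges, idRange{start: uint32(start), length: uint32(count)})
	}
	return ranges, scanner.Err()
}

func main() {
	// With a typical "root:100000:65536" entry this prints "range:100000-165535".
	ranges, err := parseSubIDFile("/etc/subuid")
	if err != nil {
		fmt.Println("cannot read /etc/subuid:", err)
		return
	}
	for _, r := range ranges {
		fmt.Printf("range:%d-%d\n", r.start, r.start+r.length-1)
	}
}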

New metrics (see the wiring sketch below):
- pulse_proxy_limiter_rejections_total{peer,reason}
- pulse_proxy_limiter_penalties_total{peer,reason}
- pulse_proxy_global_concurrency_inflight
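
The metric recording itself goes through the ProxyMetrics type, which is defined outside throttle.go. Below is a hedged sketch of how these counters and gauges could be wired with prometheus/client_golang; the method names match the calls in throttle.go, while the registration code and the peers gauge name are assumptions.

// Sketch only: a possible shape for the ProxyMetrics methods called in
// throttle.go, using the standard prometheus/client_golang library.
package main

import "github.com/prometheus/client_golang/prometheus"

type ProxyMetrics struct {
	limiterRejections *prometheus.CounterVec
	limiterPenalties  *prometheus.CounterVec
	globalInflight    prometheus.Gauge
	limiterPeers      prometheus.Gauge // gauge name below is assumed, not from the commit
}

func newProxyMetrics(reg prometheus.Registerer) *ProxyMetrics {
	m := &ProxyMetrics{
		limiterRejections: prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "pulse_proxy_limiter_rejections_total",
			Help: "Requests rejected by the limiter, by peer and reason.",
		}, []string{"peer", "reason"}),
		limiterPenalties: prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "pulse_proxy_limiter_penalties_total",
			Help: "Penalty delays applied, by peer and reason.",
		}, []string{"peer", "reason"}),
		globalInflight: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "pulse_proxy_global_concurrency_inflight",
			Help: "Requests currently holding a global concurrency slot.",
		}),
		limiterPeers: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "pulse_proxy_limiter_peers", // assumed name
			Help: "Peers currently tracked by the limiter.",
		}),
	}
	reg.MustRegister(m.limiterRejections, m.limiterPenalties, m.globalInflight, m.limiterPeers)
	return m
}

func (m *ProxyMetrics) recordLimiterReject(reason, peer string) {
	m.limiterRejections.WithLabelValues(peer, reason).Inc()
}

func (m *ProxyMetrics) recordPenalty(reason, peer string) {
	m.limiterPenalties.WithLabelValues(peer, reason).Inc()
}

func (m *ProxyMetrics) incGlobalConcurrency() { m.globalInflight.Inc() }
func (m *ProxyMetrics) decGlobalConcurrency() { m.globalInflight.Dec() }
func (m *ProxyMetrics) setLimiterPeers(n int) { m.limiterPeers.Set(float64(n)) }

func main() {
	// Example: register against a fresh registry and record one rejection.
	m := newProxyMetrics(prometheus.NewRegistry())
	m.recordLimiterReject("rate", "range:100000-165535")
}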

Related to security audit 2025-11-07.

Co-authored-by: Codex <codex@openai.com>
2025-11-07 17:08:45 +00:00


package main

import (
	"fmt"
	"sync"
	"time"

	"golang.org/x/time/rate"
)

// peerID identifies a connecting principal (grouped by UID or ID range).
type peerID struct {
	uid      uint32
	uidRange *idRange
}

func (p peerID) String() string {
	if p.uidRange != nil {
		end := p.uidRange.start + p.uidRange.length - 1
		return fmt.Sprintf("range:%d-%d", p.uidRange.start, end)
	}
	return fmt.Sprintf("uid:%d", p.uid)
}

// limiterEntry holds rate limiting and concurrency controls for a peer.
type limiterEntry struct {
	limiter   *rate.Limiter
	semaphore chan struct{}
	lastSeen  time.Time
}

// limiterPolicy captures the per-peer and global limits applied by the rate limiter.
type limiterPolicy struct {
	perPeerLimit       rate.Limit
	perPeerBurst       int
	perPeerConcurrency int
	globalConcurrency  int
	penaltyDuration    time.Duration
}

// rateLimiter manages per-peer rate limits and concurrency.
type rateLimiter struct {
	mu        sync.Mutex
	entries   map[string]*limiterEntry
	quitChan  chan struct{}
	globalSem chan struct{}
	policy    limiterPolicy
	metrics   *ProxyMetrics
	uidRanges []idRange
	gidRanges []idRange
}

const (
	defaultPerPeerBurst       = 5 // Allow burst of 5 requests for multi-node polling
	defaultPerPeerConcurrency = 2
	defaultGlobalConcurrency  = 8
)

var (
	defaultPerPeerRateInterval = 1 * time.Second // 1 qps (60/min) - supports 5-10 node deployments
	defaultPenaltyDuration     = 2 * time.Second
	defaultPerPeerLimit        = rate.Every(defaultPerPeerRateInterval)
)
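
// Editorial note on the defaults above (not load-bearing code): rate.Every(1s)
// refills one token per second and the burst of 5 fills the bucket, so an idle
// peer can send 5 back-to-back requests and then sustain roughly 1 request per
// second (60/min); at most 2 of its requests run concurrently, and at most 8
// requests run concurrently across all peers.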
// newRateLimiter creates a new rate limiter with a cleanup loop.
// If rateLimitCfg is provided, it overrides the default rate limit settings.
func newRateLimiter(metrics *ProxyMetrics, rateLimitCfg *RateLimitConfig, uidRanges, gidRanges []idRange) *rateLimiter {
	// Use defaults
	perPeerLimit := defaultPerPeerLimit
	perPeerBurst := defaultPerPeerBurst

	// Override with config if provided
	if rateLimitCfg != nil {
		if rateLimitCfg.PerPeerIntervalMs > 0 {
			interval := time.Duration(rateLimitCfg.PerPeerIntervalMs) * time.Millisecond
			perPeerLimit = rate.Every(interval)
		}
		if rateLimitCfg.PerPeerBurst > 0 {
			perPeerBurst = rateLimitCfg.PerPeerBurst
		}
	}

	rl := &rateLimiter{
		entries:   make(map[string]*limiterEntry),
		quitChan:  make(chan struct{}),
		globalSem: make(chan struct{}, defaultGlobalConcurrency),
		policy: limiterPolicy{
			perPeerLimit:       perPeerLimit,
			perPeerBurst:       perPeerBurst,
			perPeerConcurrency: defaultPerPeerConcurrency,
			globalConcurrency:  defaultGlobalConcurrency,
			penaltyDuration:    defaultPenaltyDuration,
		},
		metrics:   metrics,
		uidRanges: append([]idRange(nil), uidRanges...),
		gidRanges: append([]idRange(nil), gidRanges...),
	}
	if rl.metrics != nil {
		rl.metrics.setLimiterPeers(0)
	}
	go rl.cleanupLoop()
	return rl
}

// allow checks if a peer is allowed to make a request and reserves concurrency.
// Returns a release function, rejection reason (if any), and whether the request is allowed.
func (rl *rateLimiter) allow(id peerID) (release func(), reason string, allowed bool) {
	key := id.String()

	rl.mu.Lock()
	entry := rl.entries[key]
	if entry == nil {
		entry = &limiterEntry{
			limiter:   rate.NewLimiter(rl.policy.perPeerLimit, rl.policy.perPeerBurst),
			semaphore: make(chan struct{}, rl.policy.perPeerConcurrency),
		}
		rl.entries[key] = entry
		if rl.metrics != nil {
			rl.metrics.setLimiterPeers(len(rl.entries))
		}
	}
	entry.lastSeen = time.Now()
	rl.mu.Unlock()

	// Check rate limit
	if !entry.limiter.Allow() {
		rl.recordRejection("rate", key)
		return nil, "rate", false
	}

	// Acquire global concurrency
	select {
	case rl.globalSem <- struct{}{}:
		if rl.metrics != nil {
			rl.metrics.incGlobalConcurrency()
		}
	default:
		rl.recordRejection("global_concurrency", key)
		return nil, "global_concurrency", false
	}

	// Try to acquire per-peer concurrency slot
	select {
	case entry.semaphore <- struct{}{}:
		return func() {
			<-entry.semaphore
			<-rl.globalSem
			if rl.metrics != nil {
				rl.metrics.decGlobalConcurrency()
			}
		}, "", true
	default:
		<-rl.globalSem
		if rl.metrics != nil {
			rl.metrics.decGlobalConcurrency()
		}
		rl.recordRejection("peer_concurrency", key)
		return nil, "peer_concurrency", false
	}
}
// cleanupLoop periodically removes idle peer entries.
func (rl *rateLimiter) cleanupLoop() {
	ticker := time.NewTicker(5 * time.Minute)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			rl.mu.Lock()
			for key, entry := range rl.entries {
				if time.Since(entry.lastSeen) > 10*time.Minute {
					delete(rl.entries, key)
				}
			}
			if rl.metrics != nil {
				rl.metrics.setLimiterPeers(len(rl.entries))
			}
			rl.mu.Unlock()
		case <-rl.quitChan:
			return
		}
	}
}

// shutdown stops the cleanup loop.
func (rl *rateLimiter) shutdown() {
	close(rl.quitChan)
}

// penalize delays the caller by the configured penalty duration and records the penalty metric.
func (rl *rateLimiter) penalize(peerLabel, reason string) {
	if rl.policy.penaltyDuration <= 0 {
		return
	}
	time.Sleep(rl.policy.penaltyDuration)
	if rl.metrics != nil {
		rl.metrics.recordPenalty(reason, peerLabel)
	}
}

// recordRejection increments the limiter rejection metric for the given reason and peer.
func (rl *rateLimiter) recordRejection(reason, peerLabel string) {
	if rl.metrics != nil {
		rl.metrics.recordLimiterReject(reason, peerLabel)
	}
}

// identifyPeer maps peer credentials to a peerID. If both the UID and GID fall
// inside ID-mapped ranges, the whole UID range becomes the identity; otherwise
// the peer is keyed by UID alone.
func (rl *rateLimiter) identifyPeer(cred *peerCredentials) peerID {
	if cred == nil {
		return peerID{}
	}
	if rl == nil {
		return peerID{uid: cred.uid}
	}
	if len(rl.uidRanges) == 0 || len(rl.gidRanges) == 0 {
		return peerID{uid: cred.uid}
	}
	uidRange := findRange(rl.uidRanges, cred.uid)
	gidRange := findRange(rl.gidRanges, cred.gid)
	if uidRange != nil && gidRange != nil {
		return peerID{uid: cred.uid, uidRange: uidRange}
	}
	return peerID{uid: cred.uid}
}

// findRange returns the range containing value, or nil if none does.
func findRange(ranges []idRange, value uint32) *idRange {
	for i := range ranges {
		if ranges[i].contains(value) {
			return &ranges[i]
		}
	}
	return nil
}
// nodeGate controls per-node concurrency for temperature requests.
type nodeGate struct {
	mu       sync.Mutex
	inFlight map[string]*nodeLock
}

// nodeLock tracks in-flight requests for a specific node.
type nodeLock struct {
	refCount int
	guard    chan struct{}
}

// newNodeGate creates a new node concurrency gate.
func newNodeGate() *nodeGate {
	return &nodeGate{
		inFlight: make(map[string]*nodeLock),
	}
}

// acquire gets exclusive access to make requests to a node.
// Returns a release function that must be called when done.
func (g *nodeGate) acquire(node string) func() {
	g.mu.Lock()
	lock := g.inFlight[node]
	if lock == nil {
		lock = &nodeLock{
			guard: make(chan struct{}, 1), // single slot = only one SSH fetch per node
		}
		g.inFlight[node] = lock
	}
	lock.refCount++
	g.mu.Unlock()

	// Wait for exclusive access
	lock.guard <- struct{}{}

	// Return release function
	return func() {
		<-lock.guard
		g.mu.Lock()
		lock.refCount--
		if lock.refCount == 0 {
			delete(g.inFlight, node)
		}
		g.mu.Unlock()
	}
}
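
For orientation, here is an illustrative sketch (not part of throttle.go) of how a request handler in the same package might tie the pieces together: resolve the peer identity, reserve rate and concurrency budget, apply the penalty on rejection, and serialize per-node SSH fetches through the node gate. The handler name and the placeholder fetch step are assumptions.

// handleTemperatureRequest is a hypothetical handler showing the intended call
// sequence; it is not part of throttle.go.
func handleTemperatureRequest(rl *rateLimiter, gate *nodeGate, cred *peerCredentials, node string) error {
	peer := rl.identifyPeer(cred)

	release, reason, ok := rl.allow(peer)
	if !ok {
		// Slow the offending peer down before reporting the rejection.
		rl.penalize(peer.String(), reason)
		return fmt.Errorf("request rejected (%s) for %s", reason, peer)
	}
	defer release()

	// Only one SSH temperature fetch per node at a time.
	releaseNode := gate.acquire(node)
	defer releaseNode()

	// ... perform the actual SSH temperature fetch for node here ...
	return nil
}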