mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-02-18 00:17:39 +01:00
fix(sensor-proxy): critical Phase 2 locking and validation fixes
Fixes critical issues found by Codex code review:
**1. Fixed file locking race condition (CRITICAL)**
- Lock file was being replaced by atomic rename, invalidating the lock
- New approach: lock a separate `.lock` file that persists across renames
- Ensures concurrent writers (installer + self-heal timer) are properly serialized
- Without this fix, corruption was still possible despite Phase 2
**2. Fixed validation to honor configured allowed_nodes_file path**
- validate command now uses loadConfig() to read actual config
- Respects allowed_nodes_file setting instead of assuming default path
- Prevents false positives/negatives when path is customized
**3. Allow empty allowed_nodes lists**
- Empty lists are valid (admin may clear for security, or rely on IPC validation)
- validate no longer fails on empty lists
- set-allowed-nodes --replace with zero nodes now supported
- Critical for operational flexibility
**4. Installer error propagation**
- update_allowed_nodes failures now exit installer with error
- Prevents silent failures that leave stale allowlists
- Self-heal will abort instead of masking CLI errors
**Technical Details:**
- withLockedFile() now locks `<path>.lock` instead of target file
- Lock held for entire duration of read-modify-write-rename
- atomicWriteFile() completes while lock is still held
- Empty lists represented as `allowed_nodes: []` in YAML
**Testing:**
✅ Lock file created and persists across operations
✅ Empty list can be written with --replace
✅ Validation passes with empty lists
✅ Config path from allowed_nodes_file honored
✅ Concurrent operations properly serialized
These fixes ensure Phase 2 actually eliminates corruption by design.
Identified by Codex code review
Related to Phase 2 commit 3dc073a28
This commit is contained in:
@@ -34,17 +34,26 @@ var validateCmd = &cobra.Command{
|
||||
cfgPath = defaultConfigPath
|
||||
}
|
||||
|
||||
allowedNodesPath := allowedNodesPathFlag
|
||||
if allowedNodesPath == "" {
|
||||
allowedNodesPath = filepath.Join(filepath.Dir(cfgPath), "allowed_nodes.yaml")
|
||||
}
|
||||
|
||||
if err := validateConfigFile(cfgPath); err != nil {
|
||||
// Use loadConfig to parse config properly and get the actual allowed_nodes_file path
|
||||
cfg, err := loadConfig(cfgPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Config validation failed: %v\n", err)
|
||||
return err
|
||||
}
|
||||
|
||||
// Determine allowed_nodes path from config (honors allowed_nodes_file setting)
|
||||
allowedNodesPath := allowedNodesPathFlag
|
||||
if allowedNodesPath == "" {
|
||||
if cfg.AllowedNodesFile != "" {
|
||||
allowedNodesPath = cfg.AllowedNodesFile
|
||||
} else {
|
||||
// Default fallback
|
||||
allowedNodesPath = filepath.Join(filepath.Dir(cfgPath), "allowed_nodes.yaml")
|
||||
}
|
||||
}
|
||||
|
||||
// Check if allowed_nodes.yaml exists and validate it
|
||||
// Empty lists are allowed (admin may clear for security or cluster relies on IPC validation)
|
||||
if _, err := os.Stat(allowedNodesPath); err == nil {
|
||||
if err := validateAllowedNodesFile(allowedNodesPath); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Allowed nodes validation failed: %v\n", err)
|
||||
@@ -76,8 +85,9 @@ Examples:
|
||||
allowedNodesPath = "/etc/pulse-sensor-proxy/allowed_nodes.yaml"
|
||||
}
|
||||
|
||||
if len(mergeNodesFlag) == 0 {
|
||||
return fmt.Errorf("no nodes specified (use --merge flag)")
|
||||
// Allow zero nodes when using --replace (admin may want to clear the list)
|
||||
if len(mergeNodesFlag) == 0 && !replaceMode {
|
||||
return fmt.Errorf("no nodes specified (use --merge flag, or --replace to clear)")
|
||||
}
|
||||
|
||||
if err := setAllowedNodes(allowedNodesPath, mergeNodesFlag, replaceMode); err != nil {
|
||||
@@ -155,7 +165,8 @@ func validateAllowedNodesFile(path string) error {
|
||||
return fmt.Errorf("failed to parse allowed_nodes YAML: %w", err)
|
||||
}
|
||||
|
||||
// Extract nodes
|
||||
// Extract nodes (empty list is valid - admin may clear for security)
|
||||
// Clusters can also rely on IPC-based validation instead of static allowlists
|
||||
var nodes []string
|
||||
switch v := result.(type) {
|
||||
case map[string]interface{}:
|
||||
@@ -174,12 +185,12 @@ func validateAllowedNodesFile(path string) error {
|
||||
nodes = append(nodes, s)
|
||||
}
|
||||
}
|
||||
case nil:
|
||||
// Empty file is valid
|
||||
return nil
|
||||
}
|
||||
|
||||
if len(nodes) == 0 {
|
||||
return fmt.Errorf("allowed_nodes file is empty or invalid")
|
||||
}
|
||||
|
||||
// Empty list is allowed - don't enforce minimum
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -191,8 +202,9 @@ func setAllowedNodes(path string, newNodes []string, replace bool) error {
|
||||
return fmt.Errorf("failed to create config directory: %w", err)
|
||||
}
|
||||
|
||||
// Use file locking to prevent concurrent writes
|
||||
return withLockedFile(path, func(f *os.File) error {
|
||||
// Use a separate lock file that persists across renames
|
||||
lockPath := path + ".lock"
|
||||
return withLockedFile(lockPath, func(f *os.File) error {
|
||||
var existing []string
|
||||
|
||||
// Read existing nodes if not in replace mode
|
||||
@@ -205,6 +217,7 @@ func setAllowedNodes(path string, newNodes []string, replace bool) error {
|
||||
// Merge and deduplicate
|
||||
merged := normalizeNodes(append(existing, newNodes...))
|
||||
|
||||
// Allow empty lists (admin may want to clear the file)
|
||||
// Serialize to YAML
|
||||
output := map[string]interface{}{
|
||||
"allowed_nodes": merged,
|
||||
@@ -218,17 +231,17 @@ func setAllowedNodes(path string, newNodes []string, replace bool) error {
|
||||
header := "# Managed by pulse-sensor-proxy config CLI\n# Do not edit manually while service is running\n"
|
||||
finalData := []byte(header + string(data))
|
||||
|
||||
// Write atomically
|
||||
// Write atomically while holding lock
|
||||
return atomicWriteFile(path, finalData, 0644)
|
||||
})
|
||||
}
|
||||
|
||||
// withLockedFile opens a file with exclusive locking and runs a callback
|
||||
func withLockedFile(path string, fn func(f *os.File) error) error {
|
||||
// Open or create the file
|
||||
f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0644)
|
||||
// withLockedFile opens a lock file with exclusive locking and runs a callback
|
||||
func withLockedFile(lockPath string, fn func(f *os.File) error) error {
|
||||
// Open or create the lock file (never deleted, persists across renames)
|
||||
f, err := os.OpenFile(lockPath, os.O_RDWR|os.O_CREATE, 0644)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open file: %w", err)
|
||||
return fmt.Errorf("failed to open lock file: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
@@ -238,7 +251,7 @@ func withLockedFile(path string, fn func(f *os.File) error) error {
|
||||
}
|
||||
defer unix.Flock(int(f.Fd()), unix.LOCK_UN) //nolint:errcheck
|
||||
|
||||
// Run callback
|
||||
// Run callback while holding lock
|
||||
return fn(f)
|
||||
}
|
||||
|
||||
|
||||
@@ -2830,7 +2830,10 @@ if command -v pvecm >/dev/null 2>&1; then
|
||||
all_nodes+=("${CONTROL_PLANE_ALLOWED_NODE_LIST[@]}")
|
||||
fi
|
||||
# Use helper function to safely update allowed_nodes (prevents duplicates on re-run)
|
||||
update_allowed_nodes "Cluster nodes (auto-discovered during installation)" "${all_nodes[@]}"
|
||||
if ! update_allowed_nodes "Cluster nodes (auto-discovered during installation)" "${all_nodes[@]}"; then
|
||||
print_error "Failed to update allowed_nodes list"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
# No cluster found - configure standalone node
|
||||
print_info "No cluster detected, configuring standalone node..."
|
||||
@@ -2858,7 +2861,10 @@ if command -v pvecm >/dev/null 2>&1; then
|
||||
all_nodes+=("${CONTROL_PLANE_ALLOWED_NODE_LIST[@]}")
|
||||
fi
|
||||
# Use helper function to safely update allowed_nodes (prevents duplicates on re-run)
|
||||
update_allowed_nodes "Standalone node configuration (auto-configured during installation)" "${all_nodes[@]}"
|
||||
if ! update_allowed_nodes "Standalone node configuration (auto-configured during installation)" "${all_nodes[@]}"; then
|
||||
print_error "Failed to update allowed_nodes list"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
else
|
||||
# Proxmox host but pvecm not available (shouldn't happen, but handle it)
|
||||
@@ -2885,7 +2891,10 @@ else
|
||||
all_nodes+=("${CONTROL_PLANE_ALLOWED_NODE_LIST[@]}")
|
||||
fi
|
||||
# Use helper function to safely update allowed_nodes (prevents duplicates on re-run)
|
||||
update_allowed_nodes "Localhost fallback configuration (pvecm unavailable)" "${all_nodes[@]}"
|
||||
if ! update_allowed_nodes "Localhost fallback configuration (pvecm unavailable)" "${all_nodes[@]}"; then
|
||||
print_error "Failed to update allowed_nodes list"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
cleanup_inline_allowed_nodes
|
||||
|
||||
Reference in New Issue
Block a user