fix(sensor-proxy): eliminate allowed_nodes config corruption

Phase 1 hotfix to address recurring config file corruption that causes
99% of temperature monitoring failures. The root cause was the installer
oscillating between inline and file-based allowlist modes, creating
duplicate `allowed_nodes:` keys in config.yaml.

Changes:
- Force file-based allowlist mode exclusively (refuse versions < v4.31.1)
- Add automatic migration from inline to file-based config
- Remove inline mode code path from update_allowed_nodes()
- Migration runs on every install/self-heal to clean up existing corruption

The self-heal timer runs every 5 minutes and was the primary source of
corruption when version detection failed or encountered edge cases.

This eliminates the dual code paths and ensures config.yaml is never
edited for allowlist changes - only /etc/pulse-sensor-proxy/allowed_nodes.yaml
is modified.

Phase 2 (next release) will implement proper Go-based config management
with atomic writes, locking, and systemd validation to prevent corruption
by design.

Related to recurring temperature monitoring outages
This commit is contained in:
rcourtman
2025-11-19 09:21:54 +00:00
parent f4fd4924d6
commit 53dec6010c

View File

@@ -90,7 +90,7 @@ determine_allowlist_mode() {
INSTALLED_PROXY_VERSION="$(detect_proxy_version "$BINARY_PATH")"
if [[ -z "$INSTALLED_PROXY_VERSION" ]]; then
print_warn "Unable to detect installed pulse-sensor-proxy version; assuming allowed_nodes_file is supported"
# During initial install, version detection fails - that's expected
ALLOWLIST_MODE="file"
return
fi
@@ -103,8 +103,11 @@ determine_allowlist_mode() {
return
fi
ALLOWLIST_MODE="inline"
print_warn "pulse-sensor-proxy ${INSTALLED_PROXY_VERSION} does not support allowed_nodes_file; using inline allow list updates"
# Refuse to install/upgrade on unsupported versions
print_error "pulse-sensor-proxy ${INSTALLED_PROXY_VERSION} is too old (< ${MIN_ALLOWED_NODES_FILE_VERSION})"
print_error "File-based allowlist is now required. Please upgrade the proxy binary first."
print_error "Download latest from: https://github.com/rcourtman/Pulse/releases/latest"
exit 1
}
record_pending_control_plane() {
@@ -308,6 +311,61 @@ PY
fi
}
#######################################
# Move any inline `allowed_nodes` entries found in config.yaml into the
# file-based allowlist, then strip the inline block from config.yaml.
# Called during install/upgrade so inline blocks never linger.
# Globals:
#   CONFIG_FILE (read)        - path to the proxy config.yaml
#   ALLOWED_NODES_FILE (read) - used in the progress message only
# Arguments:
#   None
# Outputs:
#   Progress and warnings through print_info / print_warn / print_success.
# Returns:
#   0 when there is nothing to migrate or when extraction is not possible
#   (best effort); otherwise the status of the final helper call.
#######################################
migrate_inline_allowed_nodes_to_file() {
  # No config yet (fresh install): nothing to migrate.
  [[ -f "$CONFIG_FILE" ]] || return 0

  if ! command -v python3 >/dev/null 2>&1; then
    print_warn "python3 not available; skipping inline allowed_nodes migration"
    return
  fi

  # Extract inline entries, one per line. The helper exits non-zero when it
  # cannot do its job (for example PyYAML is missing); that is reported as a
  # warning instead of leaking a traceback or aborting the installer.
  local inline_nodes
  if ! inline_nodes=$(python3 - "$CONFIG_FILE" <<'PY'
import sys
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("PyYAML is not installed\n")
    sys.exit(3)

path = Path(sys.argv[1])
if not path.exists():
    sys.exit(0)

try:
    data = yaml.safe_load(path.read_text())
except yaml.YAMLError:
    # Best effort: an unparseable config yields no entries to migrate.
    sys.exit(0)

nodes = data.get('allowed_nodes') if isinstance(data, dict) else None
if isinstance(nodes, list):
    for node in nodes:
        # Skip nulls and nested structures; they are not node names.
        if node is None or isinstance(node, (dict, list)):
            continue
        text = str(node).strip()
        if text:
            print(text)
PY
  ); then
    print_warn "Unable to read inline allowed_nodes from ${CONFIG_FILE}; skipping migration"
    return 0
  fi

  # Collect non-blank lines only, so an empty or whitespace-only line can
  # never become an allowlist entry.
  local -a nodes_array=()
  local entry
  while IFS= read -r entry; do
    if [[ -n "${entry//[[:space:]]/}" ]]; then
      nodes_array+=("$entry")
    fi
  done <<<"$inline_nodes"

  if (( ${#nodes_array[@]} == 0 )); then
    return 0
  fi

  print_info "Migrating ${#nodes_array[@]} inline allowed_nodes entries to ${ALLOWED_NODES_FILE}"
  # Add them to the file (update_allowed_nodes will merge with existing)
  update_allowed_nodes "Migrated from inline config" "${nodes_array[@]}"
  # Now remove the inline block from config.yaml
  normalize_allowed_nodes_section
  print_success "Migration complete: inline allowed_nodes moved to file"
}
write_inline_allowed_nodes() {
local comment_line="$1"
shift || true
@@ -591,11 +649,7 @@ update_allowed_nodes() {
shift
local nodes=("$@")
if [[ "$ALLOWLIST_MODE" == "inline" ]]; then
write_inline_allowed_nodes "$comment_line" "${nodes[@]}"
return
fi
# File mode is now required - inline mode has been removed
ensure_allowed_nodes_file_reference
remove_allowed_nodes_block
@@ -1662,6 +1716,10 @@ for entry in nodes:
}
# Resolve the allowlist mode first. In the visible parts of
# determine_allowlist_mode this sets ALLOWLIST_MODE, and exits with status 1
# when the installed proxy version is reported as too old.
determine_allowlist_mode
# Migrate any existing inline allowed_nodes to file (Phase 1 hotfix for config corruption)
migrate_inline_allowed_nodes_to_file
# NOTE(review): cleanup_inline_allowed_nodes is defined outside this view;
# presumably it removes any inline block still left in config.yaml - confirm.
cleanup_inline_allowed_nodes
# Create base config file if it doesn't exist