mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-02-18 23:41:48 +01:00
Dramatically improve temperature proxy installation robustness
Users were abandoning Pulse due to catastrophic temperature monitoring setup failures. This commit addresses the root causes:

**Problem 1: Silent Failures**
- Installations reported "SUCCESS" even when proxy never started
- UI showed green checkmarks with no temperature data
- Zero feedback when things went wrong

**Problem 2: Missing Diagnostics**
- Service failures logged only in journald
- Users saw "Something going on with the proxy" with no actionable guidance
- No way to troubleshoot from error messages

**Problem 3: Standalone Node Issues**
- Proxy daemon logged continuous pvecm errors as warnings
- "ipcc_send_rec" and "Unknown error -1" messages confused users
- These are expected for non-clustered/LXC setups

**Solutions Implemented:**

1. **Health Gate in install.sh (lines 1588-1629)**
   - Verify service is running after installation
   - Check socket exists on host
   - Confirm socket visible inside container via bind mount
   - Fail loudly with specific diagnostics if any check fails

2. **Actionable Error Messages in install-sensor-proxy.sh (lines 822-877)**
   - When service fails to start: dump full systemctl status + 40 lines of logs
   - When socket missing: show permissions, service status, and remediation command
   - Include common issues checklist (missing user, permission errors, lm-sensors, etc.)
   - Direct link to troubleshooting docs

3. **Better Standalone Node Detection in ssh.go (lines 585-595)**
   - Recognize "Unknown error -1" and "Unable to load access control list" as LXC indicators
   - Log at INFO level (not WARN) since this is expected behavior
   - Clarify message: "using localhost for temperature collection"

**Impact:**
- Eliminates "green checkmark but no temps" scenario
- Users get immediate actionable feedback on failures
- Standalone/LXC installations work silently without error spam
- Reduces support burden from #571 (15+ comments of user frustration)

Related to #571
This commit is contained in:
@@ -582,15 +582,20 @@ func discoverClusterNodes() ([]string, error) {
|
||||
// Check if this is a standalone node or LXC container
|
||||
// - "does not exist" or "not part of a cluster": standalone node
|
||||
// - "ipcc_send_rec": running in LXC container without corosync access
|
||||
// - "Unknown error -1": LXC container corosync communication failure
|
||||
// - "Unable to load access control list": Permission/access issues in containers
|
||||
// Note: Some Proxmox versions write these messages to stdout, others to stderr
|
||||
if strings.Contains(combinedOutput, "does not exist") ||
|
||||
strings.Contains(combinedOutput, "not part of a cluster") ||
|
||||
strings.Contains(combinedOutput, "ipcc_send_rec") {
|
||||
log.Info().Msg("Standalone Proxmox node or LXC container detected - discovering local host addresses")
|
||||
strings.Contains(combinedOutput, "ipcc_send_rec") ||
|
||||
strings.Contains(combinedOutput, "Unknown error -1") ||
|
||||
strings.Contains(combinedOutput, "Unable to load access control list") {
|
||||
// Log at INFO level since this is expected for standalone/container scenarios
|
||||
log.Info().Msg("Standalone Proxmox node or LXC container detected - using localhost for temperature collection")
|
||||
return discoverLocalHostAddresses()
|
||||
}
|
||||
// For other errors, fail
|
||||
log.Warn().Str("stderr", stderrStr).Str("stdout", stdoutStr).Msg("pvecm status failed")
|
||||
// For other unexpected errors, fail with details
|
||||
log.Warn().Str("stderr", stderrStr).Str("stdout", stdoutStr).Msg("pvecm status failed with unexpected error")
|
||||
return nil, fmt.Errorf("failed to get cluster status: %w (stderr: %s, stdout: %s)", err, stderrStr, stdoutStr)
|
||||
}
|
||||
|
||||
|
||||
45
install.sh
45
install.sh
@@ -1583,7 +1583,50 @@ fi'; then
|
||||
fi
|
||||
|
||||
if bash "$proxy_script" "${proxy_install_args[@]}" 2>&1 | tee /tmp/proxy-install-${CTID}.log; then
|
||||
print_info "Temperature proxy installed successfully"
|
||||
print_info "Temperature proxy installation script completed"
|
||||
|
||||
# Verify proxy is actually working
|
||||
echo
|
||||
print_info "Verifying temperature proxy health..."
|
||||
local proxy_health_ok=true
|
||||
|
||||
# Check 1: Service is running
|
||||
if ! systemctl is-active --quiet pulse-sensor-proxy 2>/dev/null; then
|
||||
print_error "✗ Service not running"
|
||||
proxy_health_ok=false
|
||||
else
|
||||
print_info "✓ Service running"
|
||||
fi
|
||||
|
||||
# Check 2: Socket exists
|
||||
if [[ ! -S /run/pulse-sensor-proxy/pulse-sensor-proxy.sock ]]; then
|
||||
print_error "✗ Socket not found at /run/pulse-sensor-proxy/pulse-sensor-proxy.sock"
|
||||
proxy_health_ok=false
|
||||
else
|
||||
print_info "✓ Socket exists"
|
||||
fi
|
||||
|
||||
# Check 3: Socket is accessible from container
|
||||
if ! pct exec $CTID -- test -S /mnt/pulse-proxy/pulse-sensor-proxy.sock 2>/dev/null; then
|
||||
print_error "✗ Socket not visible inside container at /mnt/pulse-proxy/pulse-sensor-proxy.sock"
|
||||
print_error " Bind mount may not be configured correctly"
|
||||
proxy_health_ok=false
|
||||
else
|
||||
print_info "✓ Socket accessible from container"
|
||||
fi
|
||||
|
||||
if [[ "$proxy_health_ok" != "true" ]]; then
|
||||
echo
|
||||
print_error "Temperature proxy health check failed"
|
||||
print_error "See diagnostics above and logs: /tmp/proxy-install-${CTID}.log"
|
||||
print_error ""
|
||||
print_error "Check: systemctl status pulse-sensor-proxy"
|
||||
print_error "Check: journalctl -u pulse-sensor-proxy -n 50"
|
||||
echo
|
||||
exit 1
|
||||
fi
|
||||
|
||||
print_success "Temperature proxy is healthy and ready"
|
||||
# Clean up temporary binary if it was copied
|
||||
[[ -f "$local_proxy_binary" ]] && rm -f "$local_proxy_binary"
|
||||
else
|
||||
|
||||
@@ -821,8 +821,26 @@ fi
|
||||
|
||||
if ! systemctl restart pulse-sensor-proxy.service; then
|
||||
print_error "Failed to start pulse-sensor-proxy service"
|
||||
print_error "Check service logs:"
|
||||
journalctl -u pulse-sensor-proxy -n 20 --no-pager
|
||||
print_error ""
|
||||
print_error "═══════════════════════════════════════════════════════"
|
||||
print_error "Service Status:"
|
||||
print_error "═══════════════════════════════════════════════════════"
|
||||
systemctl status pulse-sensor-proxy --no-pager --lines=0 2>&1 || true
|
||||
print_error ""
|
||||
print_error "═══════════════════════════════════════════════════════"
|
||||
print_error "Recent Logs (last 40 lines):"
|
||||
print_error "═══════════════════════════════════════════════════════"
|
||||
journalctl -u pulse-sensor-proxy -n 40 --no-pager 2>&1 || true
|
||||
print_error ""
|
||||
print_error "═══════════════════════════════════════════════════════"
|
||||
print_error "Common Issues:"
|
||||
print_error "═══════════════════════════════════════════════════════"
|
||||
print_error "1. Missing user: Run 'useradd --system --no-create-home --group pulse-sensor-proxy'"
|
||||
print_error "2. Permission errors: Check ownership of /var/lib/pulse-sensor-proxy"
|
||||
print_error "3. lm-sensors not installed: Run 'apt-get install lm-sensors && sensors-detect --auto'"
|
||||
print_error "4. Standalone node detection: If you see 'pvecm' errors, this is expected for non-clustered hosts"
|
||||
print_error ""
|
||||
print_error "For more help: https://github.com/rcourtman/Pulse/blob/main/docs/TROUBLESHOOTING.md"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -836,8 +854,26 @@ for i in {1..10}; do
|
||||
done
|
||||
|
||||
if [[ ! -S "$SOCKET_PATH" ]]; then
|
||||
print_error "Socket did not appear after 10 seconds"
|
||||
print_info "Check service status: systemctl status pulse-sensor-proxy"
|
||||
print_error "Socket did not appear at $SOCKET_PATH after 10 seconds"
|
||||
print_error ""
|
||||
print_error "═══════════════════════════════════════════════════════"
|
||||
print_error "Diagnostics:"
|
||||
print_error "═══════════════════════════════════════════════════════"
|
||||
print_error "Service Status:"
|
||||
systemctl status pulse-sensor-proxy --no-pager 2>&1 || true
|
||||
print_error ""
|
||||
print_error "Socket Directory Permissions:"
|
||||
ls -la /run/pulse-sensor-proxy/ 2>&1 || echo "Directory does not exist"
|
||||
print_error ""
|
||||
print_error "Recent Logs:"
|
||||
journalctl -u pulse-sensor-proxy -n 20 --no-pager 2>&1 || true
|
||||
print_error ""
|
||||
print_error "Common Causes:"
|
||||
print_error " • Service failed to start (check logs above)"
|
||||
print_error " • RuntimeDirectory permissions issue"
|
||||
print_error " • Systemd socket creation failed"
|
||||
print_error ""
|
||||
print_error "Try: systemctl restart pulse-sensor-proxy && watch -n 0.5 'ls -la /run/pulse-sensor-proxy/'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
Reference in New Issue
Block a user