diff --git a/cmd/pulse-sensor-proxy/ssh.go b/cmd/pulse-sensor-proxy/ssh.go index 8134bde89..2730f7a01 100644 --- a/cmd/pulse-sensor-proxy/ssh.go +++ b/cmd/pulse-sensor-proxy/ssh.go @@ -582,15 +582,20 @@ func discoverClusterNodes() ([]string, error) { // Check if this is a standalone node or LXC container // - "does not exist" or "not part of a cluster": standalone node // - "ipcc_send_rec": running in LXC container without corosync access + // - "Unknown error -1": LXC container corosync communication failure + // - "Unable to load access control list": Permission/access issues in containers // Note: Some Proxmox versions write these messages to stdout, others to stderr if strings.Contains(combinedOutput, "does not exist") || strings.Contains(combinedOutput, "not part of a cluster") || - strings.Contains(combinedOutput, "ipcc_send_rec") { - log.Info().Msg("Standalone Proxmox node or LXC container detected - discovering local host addresses") + strings.Contains(combinedOutput, "ipcc_send_rec") || + strings.Contains(combinedOutput, "Unknown error -1") || + strings.Contains(combinedOutput, "Unable to load access control list") { + // Log at INFO level since this is expected for standalone/container scenarios + log.Info().Msg("Standalone Proxmox node or LXC container detected - using localhost for temperature collection") return discoverLocalHostAddresses() } - // For other errors, fail - log.Warn().Str("stderr", stderrStr).Str("stdout", stdoutStr).Msg("pvecm status failed") + // For other unexpected errors, fail with details + log.Warn().Str("stderr", stderrStr).Str("stdout", stdoutStr).Msg("pvecm status failed with unexpected error") return nil, fmt.Errorf("failed to get cluster status: %w (stderr: %s, stdout: %s)", err, stderrStr, stdoutStr) } diff --git a/install.sh b/install.sh index cf04cde10..91f9138a8 100755 --- a/install.sh +++ b/install.sh @@ -1583,7 +1583,50 @@ fi'; then fi if bash "$proxy_script" "${proxy_install_args[@]}" 2>&1 | tee 
/tmp/proxy-install-${CTID}.log; then - print_info "Temperature proxy installed successfully" + print_info "Temperature proxy installation script completed" + + # Verify proxy is actually working + echo + print_info "Verifying temperature proxy health..." + local proxy_health_ok=true + + # Check 1: Service is running + if ! systemctl is-active --quiet pulse-sensor-proxy 2>/dev/null; then + print_error "✗ Service not running" + proxy_health_ok=false + else + print_info "✓ Service running" + fi + + # Check 2: Socket exists + if [[ ! -S /run/pulse-sensor-proxy/pulse-sensor-proxy.sock ]]; then + print_error "✗ Socket not found at /run/pulse-sensor-proxy/pulse-sensor-proxy.sock" + proxy_health_ok=false + else + print_info "✓ Socket exists" + fi + + # Check 3: Socket is accessible from container + if ! pct exec $CTID -- test -S /mnt/pulse-proxy/pulse-sensor-proxy.sock 2>/dev/null; then + print_error "✗ Socket not visible inside container at /mnt/pulse-proxy/pulse-sensor-proxy.sock" + print_error " Bind mount may not be configured correctly" + proxy_health_ok=false + else + print_info "✓ Socket accessible from container" + fi + + if [[ "$proxy_health_ok" != "true" ]]; then + echo + print_error "Temperature proxy health check failed" + print_error "See diagnostics above and logs: /tmp/proxy-install-${CTID}.log" + print_error "" + print_error "Check: systemctl status pulse-sensor-proxy" + print_error "Check: journalctl -u pulse-sensor-proxy -n 50" + echo + exit 1 + fi + + print_success "Temperature proxy is healthy and ready" # Clean up temporary binary if it was copied [[ -f "$local_proxy_binary" ]] && rm -f "$local_proxy_binary" else diff --git a/scripts/install-sensor-proxy.sh b/scripts/install-sensor-proxy.sh index 0ae591afe..385950b27 100755 --- a/scripts/install-sensor-proxy.sh +++ b/scripts/install-sensor-proxy.sh @@ -821,8 +821,26 @@ fi if ! 
systemctl restart pulse-sensor-proxy.service; then print_error "Failed to start pulse-sensor-proxy service" - print_error "Check service logs:" - journalctl -u pulse-sensor-proxy -n 20 --no-pager + print_error "" + print_error "═══════════════════════════════════════════════════════" + print_error "Service Status:" + print_error "═══════════════════════════════════════════════════════" + systemctl status pulse-sensor-proxy --no-pager --lines=0 2>&1 || true + print_error "" + print_error "═══════════════════════════════════════════════════════" + print_error "Recent Logs (last 40 lines):" + print_error "═══════════════════════════════════════════════════════" + journalctl -u pulse-sensor-proxy -n 40 --no-pager 2>&1 || true + print_error "" + print_error "═══════════════════════════════════════════════════════" + print_error "Common Issues:" + print_error "═══════════════════════════════════════════════════════" + print_error "1. Missing user: Run 'useradd --system --no-create-home --user-group pulse-sensor-proxy'" + print_error "2. Permission errors: Check ownership of /var/lib/pulse-sensor-proxy" + print_error "3. lm-sensors not installed: Run 'apt-get install lm-sensors && sensors-detect --auto'" + print_error "4. Standalone node detection: If you see 'pvecm' errors, this is expected for non-clustered hosts" + print_error "" + print_error "For more help: https://github.com/rcourtman/Pulse/blob/main/docs/TROUBLESHOOTING.md" exit 1 fi @@ -836,8 +854,26 @@ for i in {1..10}; do done if [[ ! 
-S "$SOCKET_PATH" ]]; then - print_error "Socket did not appear after 10 seconds" - print_info "Check service status: systemctl status pulse-sensor-proxy" + print_error "Socket did not appear at $SOCKET_PATH after 10 seconds" + print_error "" + print_error "═══════════════════════════════════════════════════════" + print_error "Diagnostics:" + print_error "═══════════════════════════════════════════════════════" + print_error "Service Status:" + systemctl status pulse-sensor-proxy --no-pager 2>&1 || true + print_error "" + print_error "Socket Directory Permissions:" + ls -la /run/pulse-sensor-proxy/ 2>&1 || echo "Directory does not exist" + print_error "" + print_error "Recent Logs:" + journalctl -u pulse-sensor-proxy -n 20 --no-pager 2>&1 || true + print_error "" + print_error "Common Causes:" + print_error " • Service failed to start (check logs above)" + print_error " • RuntimeDirectory permissions issue" + print_error " • Systemd socket creation failed" + print_error "" + print_error "Try: systemctl restart pulse-sensor-proxy && watch -n 0.5 'ls -la /run/pulse-sensor-proxy/'" exit 1 fi