Add guest agent caching and update doc hints (refs #560)

rcourtman
2025-10-16 08:15:49 +00:00
parent 605512aa6b
commit 3a4fc044ea
15 changed files with 211 additions and 59 deletions

View File

@@ -30,7 +30,7 @@ run: build
# Development - rebuild everything and restart service
dev: frontend backend
sudo systemctl restart pulse-backend
sudo systemctl restart pulse-hot-dev
dev-hot:
./scripts/dev-hot.sh
@@ -42,7 +42,7 @@ clean:
# Quick rebuild and restart for development
restart: frontend backend
sudo systemctl restart pulse-backend
sudo systemctl restart pulse-hot-dev
# Run linters for both backend and frontend
lint: lint-backend lint-frontend

View File

@@ -362,7 +362,7 @@ Configure persistent alert policies in **Settings → Alerts → Custom Rules**:
### HTTPS/TLS Configuration
Enable HTTPS by setting these environment variables:
```bash
# Systemd: sudo systemctl edit pulse-backend
# Systemd (service: pulse; legacy installs may use pulse-backend): sudo systemctl edit pulse
Environment="HTTPS_ENABLED=true"
Environment="TLS_CERT_FILE=/etc/pulse/cert.pem"
Environment="TLS_KEY_FILE=/etc/pulse/key.pem"
@@ -379,7 +379,7 @@ docker run -d -p 7655:7655 \
For deployment overrides (ports, etc.), use environment variables:
```bash
# Systemd: sudo systemctl edit pulse-backend
# Systemd (service: pulse; legacy installs may use pulse-backend): sudo systemctl edit pulse
Environment="FRONTEND_PORT=8080"
# Docker: -e FRONTEND_PORT=8080

View File

@@ -8,12 +8,14 @@ Pulse provides a REST API for monitoring and managing Proxmox VE and PBS instanc
Pulse supports multiple authentication methods that can be used independently or together:
> **Service name note:** Systemd deployments use `pulse.service`. If your host still uses the legacy `pulse-backend.service`, substitute that name in the commands below.
### Password Authentication
Set a username and password for web UI access. Passwords are hashed with bcrypt (cost 12) for security.
```bash
# Systemd
sudo systemctl edit pulse-backend
sudo systemctl edit pulse
# Add:
[Service]
Environment="PULSE_AUTH_USER=admin"
@@ -32,7 +34,7 @@ For programmatic API access and automation. Manage tokens via **Settings → Sec
```bash
# Systemd
sudo systemctl edit pulse-backend
sudo systemctl edit pulse
# Add:
[Service]
Environment="API_TOKENS=token-a,token-b"

View File

@@ -69,7 +69,7 @@ Keeping application configuration separate from authentication credentials:
```bash
systemctl list-units | grep pulse
```
It might be `pulse` or `pulse-backend` depending on your installation method.
It might be `pulse` (default), `pulse-backend` (legacy), or `pulse-hot-dev` (dev environment) depending on your installation method.
2. Verify the configuration is loaded:
```bash

View File

@@ -767,7 +767,7 @@ EOF
# Should show socket
# Check Pulse logs for connection success
journalctl -u pulse-backend -n 50 | grep -i temperature
journalctl -u pulse -n 50 | grep -i temperature
```
**Phase 4: End-to-End Validation**

View File

@@ -285,7 +285,7 @@ ws.onerror = (e) => console.error('WebSocket error:', e);
### 502 Bad Gateway
- Pulse not running on expected port (default 7655)
- Check with: `curl http://localhost:7655/api/health`
- Verify Pulse service: `systemctl status pulse-backend`
- Verify Pulse service: `systemctl status pulse` (use `pulse-backend` if you're on a legacy unit)
### WebSocket closes immediately
- Timeout too short in proxy configuration
@@ -315,6 +315,6 @@ ws.onerror = (e) => console.error('WebSocket error:', e);
If WebSockets still don't work after following this guide:
1. Check browser console for errors (F12)
2. Verify Pulse logs: `journalctl -u pulse-backend -f`
2. Verify Pulse logs: `journalctl -u pulse -f`
3. Test without proxy first: `http://your-server:7655`
4. Report issues: https://github.com/rcourtman/Pulse/issues
4. Report issues: https://github.com/rcourtman/Pulse/issues

View File

@@ -4,6 +4,8 @@
**Starting with v4.5.0, authentication setup is prompted for all new Pulse installations.** This protects your Proxmox API credentials from unauthorized access.
> **Service name note:** Systemd deployments use `pulse.service`. If you're upgrading from an older install that still registers `pulse-backend.service`, substitute that name in the commands below.
### First-Run Security Setup
When you first access Pulse, you'll be guided through a mandatory security setup:
- Create your admin username and password
@@ -28,7 +30,7 @@ Legacy configuration (no longer applicable):
PULSE_TRUSTED_NETWORKS=192.168.1.0/24,10.0.0.0/24
# Or in systemd
sudo systemctl edit pulse-backend
sudo systemctl edit pulse
[Service]
Environment="PULSE_TRUSTED_NETWORKS=192.168.1.0/24,10.0.0.0/24"
```
@@ -78,14 +80,14 @@ By default, configuration export/import is blocked for security. You have two op
### Option 1: Set API Tokens (Recommended)
```bash
# Using systemd (secure)
sudo systemctl edit pulse-backend
sudo systemctl edit pulse
# Add:
[Service]
Environment="API_TOKENS=ansible-token,docker-agent-token"
Environment="API_TOKEN=legacy-token" # Optional fallback
# Then restart:
sudo systemctl restart pulse-backend
sudo systemctl restart pulse
# Docker
docker run -e API_TOKENS=ansible-token,docker-agent-token rcourtman/pulse:latest
@@ -94,7 +96,7 @@ docker run -e API_TOKENS=ansible-token,docker-agent-token rcourtman/pulse:latest
### Option 2: Allow Unprotected Export (Homelab)
```bash
# Using systemd
sudo systemctl edit pulse-backend
sudo systemctl edit pulse
# Add:
[Service]
Environment="ALLOW_UNPROTECTED_EXPORT=true"
@@ -187,7 +189,7 @@ This automatically:
#### Manual Setup (Advanced)
```bash
# Using systemd (password will be hashed automatically)
sudo systemctl edit pulse-backend
sudo systemctl edit pulse
# Add:
[Service]
Environment="PULSE_AUTH_USER=admin"
@@ -223,7 +225,7 @@ The Quick Security Setup automatically:
#### Manual Token Setup
```bash
# Using systemd (plain text values are auto-hashed on startup)
sudo systemctl edit pulse-backend
sudo systemctl edit pulse
# Add:
[Service]
Environment="API_TOKENS=ansible-token,docker-agent-token"
@@ -278,7 +280,7 @@ If you need to access Pulse API from a different domain:
docker run -e ALLOWED_ORIGINS="https://app.example.com" rcourtman/pulse:latest
# systemd
sudo systemctl edit pulse-backend
sudo systemctl edit pulse
[Service]
Environment="ALLOWED_ORIGINS=https://app.example.com"

View File

@@ -123,13 +123,16 @@ sudo systemctl restart pulse
#### Service name confusion
Pulse uses different service names depending on installation method:
- **ProxmoxVE Script**: `pulse`
- **Manual Install**: `pulse-backend`
- **Default systemd install**: `pulse`
- **Legacy installs (pre-v4.7)**: `pulse-backend`
- **Hot dev environment**: `pulse-hot-dev`
- **Docker**: N/A (container name)
To check which you have:
```bash
systemctl status pulse 2>/dev/null || systemctl status pulse-backend
systemctl status pulse 2>/dev/null \
|| systemctl status pulse-backend 2>/dev/null \
|| systemctl status pulse-hot-dev
```
### Notification Issues

View File

@@ -81,7 +81,7 @@ pveum acl modify /nodes -user pulse-monitor@pam -role PVEAuditor
Disable ZFS monitoring if not needed:
```bash
echo "PULSE_DISABLE_ZFS_MONITORING=true" >> /opt/pulse/.env
systemctl restart pulse-backend
systemctl restart pulse
```
## Example Alert
@@ -95,4 +95,4 @@ Errors: 12 read, 0 write, 3 checksum
Device sdb2: DEGRADED with 12 read errors
```
This helps administrators identify failing drives before complete failure occurs.
This helps administrators identify failing drives before complete failure occurs.

View File

@@ -3543,7 +3543,7 @@ const Settings: Component<SettingsProps> = (props) => {
Restart the development server:
</p>
<code class="block text-xs bg-gray-100 dark:bg-gray-700 p-2 rounded mt-1">
sudo systemctl restart pulse-backend
sudo systemctl restart pulse-hot-dev
</code>
</div>
</Show>

View File

@@ -281,6 +281,8 @@ type Monitor struct {
removedDockerHosts map[string]time.Time // Track deliberately removed Docker hosts (ID -> removal time)
dockerCommands map[string]*dockerHostCommand
dockerCommandIndex map[string]string
guestMetadataMu sync.RWMutex
guestMetadataCache map[string]guestMetadataCacheEntry
}
type rrdMemCacheEntry struct {
@@ -323,8 +325,17 @@ const (
dockerMaximumHealthWindow = 10 * time.Minute
nodeRRDCacheTTL = 30 * time.Second
nodeRRDRequestTimeout = 2 * time.Second
guestMetadataCacheTTL = 5 * time.Minute
)
type guestMetadataCacheEntry struct {
ipAddresses []string
networkInterfaces []models.GuestNetworkInterface
osName string
osVersion string
fetchedAt time.Time
}
func (m *Monitor) getNodeRRDMemAvailable(ctx context.Context, client PVEClientInterface, nodeName string) (uint64, error) {
if client == nil || nodeName == "" {
return 0, fmt.Errorf("invalid arguments for RRD lookup")
@@ -813,14 +824,33 @@ func sortContent(content string) string {
return strings.Join(parts, ",")
}
func fetchGuestAgentMetadata(ctx context.Context, client PVEClientInterface, instanceName, nodeName, vmName string, vmid int, vmStatus *proxmox.VMStatus) ([]string, []models.GuestNetworkInterface, string, string) {
if vmStatus == nil {
func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientInterface, instanceName, nodeName, vmName string, vmid int, vmStatus *proxmox.VMStatus) ([]string, []models.GuestNetworkInterface, string, string) {
if vmStatus == nil || client == nil {
m.clearGuestMetadataCache(instanceName, nodeName, vmid)
return nil, nil, "", ""
}
var ipAddresses []string
var networkIfaces []models.GuestNetworkInterface
var osName, osVersion string
if vmStatus.Agent <= 0 {
m.clearGuestMetadataCache(instanceName, nodeName, vmid)
return nil, nil, "", ""
}
key := guestMetadataCacheKey(instanceName, nodeName, vmid)
now := time.Now()
m.guestMetadataMu.RLock()
cached, ok := m.guestMetadataCache[key]
m.guestMetadataMu.RUnlock()
if ok && now.Sub(cached.fetchedAt) < guestMetadataCacheTTL {
return cloneStringSlice(cached.ipAddresses), cloneGuestNetworkInterfaces(cached.networkInterfaces), cached.osName, cached.osVersion
}
// Start with cached values as fallback in case new calls fail
ipAddresses := cloneStringSlice(cached.ipAddresses)
networkIfaces := cloneGuestNetworkInterfaces(cached.networkInterfaces)
osName := cached.osName
osVersion := cached.osVersion
ifaceCtx, cancelIface := context.WithTimeout(ctx, 5*time.Second)
interfaces, err := client.GetVMNetworkInterfaces(ifaceCtx, nodeName, vmid)
@@ -834,27 +864,86 @@ func fetchGuestAgentMetadata(ctx context.Context, client PVEClientInterface, ins
Msg("Guest agent network interfaces unavailable")
} else if len(interfaces) > 0 {
ipAddresses, networkIfaces = processGuestNetworkInterfaces(interfaces)
} else {
ipAddresses = nil
networkIfaces = nil
}
if vmStatus.Agent > 0 {
osCtx, cancelOS := context.WithTimeout(ctx, 3*time.Second)
agentInfo, err := client.GetVMAgentInfo(osCtx, nodeName, vmid)
cancelOS()
if err != nil {
log.Debug().
Str("instance", instanceName).
Str("vm", vmName).
Int("vmid", vmid).
Err(err).
Msg("Guest agent OS info unavailable")
} else if len(agentInfo) > 0 {
osName, osVersion = extractGuestOSInfo(agentInfo)
}
osCtx, cancelOS := context.WithTimeout(ctx, 3*time.Second)
agentInfo, err := client.GetVMAgentInfo(osCtx, nodeName, vmid)
cancelOS()
if err != nil {
log.Debug().
Str("instance", instanceName).
Str("vm", vmName).
Int("vmid", vmid).
Err(err).
Msg("Guest agent OS info unavailable")
} else if len(agentInfo) > 0 {
osName, osVersion = extractGuestOSInfo(agentInfo)
} else {
osName = ""
osVersion = ""
}
entry := guestMetadataCacheEntry{
ipAddresses: cloneStringSlice(ipAddresses),
networkInterfaces: cloneGuestNetworkInterfaces(networkIfaces),
osName: osName,
osVersion: osVersion,
fetchedAt: time.Now(),
}
m.guestMetadataMu.Lock()
if m.guestMetadataCache == nil {
m.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
}
m.guestMetadataCache[key] = entry
m.guestMetadataMu.Unlock()
return ipAddresses, networkIfaces, osName, osVersion
}
func guestMetadataCacheKey(instanceName, nodeName string, vmid int) string {
return fmt.Sprintf("%s|%s|%d", instanceName, nodeName, vmid)
}
func (m *Monitor) clearGuestMetadataCache(instanceName, nodeName string, vmid int) {
if m == nil {
return
}
key := guestMetadataCacheKey(instanceName, nodeName, vmid)
m.guestMetadataMu.Lock()
if m.guestMetadataCache != nil {
delete(m.guestMetadataCache, key)
}
m.guestMetadataMu.Unlock()
}
func cloneStringSlice(src []string) []string {
if len(src) == 0 {
return nil
}
dst := make([]string, len(src))
copy(dst, src)
return dst
}
func cloneGuestNetworkInterfaces(src []models.GuestNetworkInterface) []models.GuestNetworkInterface {
if len(src) == 0 {
return nil
}
dst := make([]models.GuestNetworkInterface, len(src))
for i, iface := range src {
dst[i] = iface
if len(iface.Addresses) > 0 {
dst[i].Addresses = cloneStringSlice(iface.Addresses)
}
}
return dst
}
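
The clone helpers above matter because cached entries are handed back to callers on every cache hit: if the `Addresses` slices were shared, a caller mutating its copy would silently rewrite the cached entry. A small illustration of the difference, using a hypothetical simplified type rather than the real `models.GuestNetworkInterface`:

```go
package main

import "fmt"

// iface stands in for models.GuestNetworkInterface (hypothetical, simplified).
type iface struct {
	Addresses []string
}

// shallowClone copies the slice of structs but shares each Addresses backing array.
func shallowClone(src []iface) []iface {
	dst := make([]iface, len(src))
	copy(dst, src)
	return dst
}

// deepClone also copies each Addresses slice, as cloneGuestNetworkInterfaces does.
func deepClone(src []iface) []iface {
	dst := make([]iface, len(src))
	for i, it := range src {
		dst[i] = it
		dst[i].Addresses = append([]string(nil), it.Addresses...)
	}
	return dst
}

func main() {
	cached := []iface{{Addresses: []string{"192.168.1.10"}}}

	s := shallowClone(cached)
	s[0].Addresses[0] = "10.0.0.99"
	fmt.Println(cached[0].Addresses[0]) // "10.0.0.99" — the cached entry was corrupted

	cached = []iface{{Addresses: []string{"192.168.1.10"}}}
	d := deepClone(cached)
	d[0].Addresses[0] = "10.0.0.99"
	fmt.Println(cached[0].Addresses[0]) // "192.168.1.10" — the cached entry stays intact
}
```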
func processGuestNetworkInterfaces(raw []proxmox.VMNetworkInterface) ([]string, []models.GuestNetworkInterface) {
ipSet := make(map[string]struct{})
ipAddresses := make([]string, 0)
@@ -1194,9 +1283,10 @@ func New(cfg *config.Config) (*Monitor, error) {
nodeSnapshots: make(map[string]NodeMemorySnapshot),
guestSnapshots: make(map[string]GuestMemorySnapshot),
nodeRRDMemCache: make(map[string]rrdMemCacheEntry),
removedDockerHosts: make(map[string]time.Time),
dockerCommands: make(map[string]*dockerHostCommand),
dockerCommandIndex: make(map[string]string),
removedDockerHosts: make(map[string]time.Time),
dockerCommands: make(map[string]*dockerHostCommand),
dockerCommandIndex: make(map[string]string),
guestMetadataCache: make(map[string]guestMetadataCacheEntry),
}
// Load saved configurations
@@ -3108,7 +3198,7 @@ func (m *Monitor) pollVMsAndContainersEfficient(ctx context.Context, instanceNam
}
// Gather guest metadata from the agent when available
guestIPs, guestIfaces, guestOSName, guestOSVersion := fetchGuestAgentMetadata(ctx, client, instanceName, res.Node, res.Name, res.VMID, detailedStatus)
guestIPs, guestIfaces, guestOSName, guestOSVersion := m.fetchGuestAgentMetadata(ctx, client, instanceName, res.Node, res.Name, res.VMID, detailedStatus)
if len(guestIPs) > 0 {
ipAddresses = guestIPs
}
@@ -3796,7 +3886,7 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, cli
memUsed = memTotal
}
guestIPs, guestIfaces, guestOSName, guestOSVersion := fetchGuestAgentMetadata(ctx, client, instanceName, node.Node, vm.Name, vm.VMID, status)
guestIPs, guestIfaces, guestOSName, guestOSVersion := m.fetchGuestAgentMetadata(ctx, client, instanceName, node.Node, vm.Name, vm.VMID, status)
if len(guestIPs) > 0 {
ipAddresses = guestIPs
}

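
The hunks above turn `fetchGuestAgentMetadata` into a method so its results can be memoized on the `Monitor`. The shape is a read-through cache: return the entry if it is younger than `guestMetadataCacheTTL` (5 minutes), otherwise query the guest agent again, keeping the stale values as a fallback when the refresh fails. Below is a minimal, self-contained sketch of that pattern — simplified, hypothetical names (`cache`, `entry`), not the actual Monitor code — keyed the same way as `guestMetadataCacheKey` (`instance|node|vmid`):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

type entry struct {
	osName    string
	fetchedAt time.Time
}

type cache struct {
	mu  sync.RWMutex
	m   map[string]entry
	ttl time.Duration
}

// get returns the cached value if it is still fresh, otherwise refreshes it.
func (c *cache) get(key string, fetch func() string) string {
	now := time.Now()
	c.mu.RLock()
	e, ok := c.m[key]
	c.mu.RUnlock()
	if ok && now.Sub(e.fetchedAt) < c.ttl {
		return e.osName // fresh hit: skip the guest agent round-trip
	}
	v := fetch() // miss or stale: ask the agent again
	c.mu.Lock()
	if c.m == nil {
		c.m = make(map[string]entry)
	}
	c.m[key] = entry{osName: v, fetchedAt: now}
	c.mu.Unlock()
	return v
}

func main() {
	c := &cache{ttl: 5 * time.Minute}                                     // same TTL as guestMetadataCacheTTL
	key := fmt.Sprintf("%s|%s|%d", "pve1", "node1", 101)                  // instance|node|vmid key format
	fmt.Println(c.get(key, func() string { return "Debian GNU/Linux" })) // first call fetches
	fmt.Println(c.get(key, func() string { return "should not run" }))   // within TTL: served from cache
}
```

The point of the cache is to avoid the two extra agent round-trips per running VM (`GetVMNetworkInterfaces` and `GetVMAgentInfo`) on every poll cycle, while the TTL keeps IP and OS data reasonably fresh.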
View File

@@ -266,7 +266,7 @@ func (m *Monitor) pollVMsWithNodesOptimized(ctx context.Context, instanceName st
}
if vm.Status == "running" && vmStatus != nil {
guestIPs, guestIfaces, guestOSName, guestOSVersion := fetchGuestAgentMetadata(ctx, client, instanceName, n.Node, vm.Name, vm.VMID, vmStatus)
guestIPs, guestIfaces, guestOSName, guestOSVersion := m.fetchGuestAgentMetadata(ctx, client, instanceName, n.Node, vm.Name, vm.VMID, vmStatus)
if len(guestIPs) > 0 {
ipAddresses = guestIPs
}

View File

@@ -35,6 +35,8 @@ echo -e "${GREEN}✓ Backup created: $BACKUP_FILE${NC}"
# Stop backend to prevent writes during cleanup
echo "Stopping backend..."
pkill -x pulse 2>/dev/null || true
sudo systemctl stop pulse-hot-dev 2>/dev/null || true
sudo systemctl stop pulse 2>/dev/null || true
sudo systemctl stop pulse-backend 2>/dev/null || true
sleep 2
@@ -58,4 +60,5 @@ echo -e "${GREEN}✓ Mock alerts removed successfully${NC}"
echo ""
echo "To restart the backend, run:"
echo " ./scripts/hot-dev.sh (for development)"
echo " sudo systemctl start pulse-backend (for production)"
echo " sudo systemctl start pulse (systemd)"
echo " sudo systemctl start pulse-backend (legacy)"

View File

@@ -18,21 +18,56 @@ NC='\033[0m'
# STATE DETECTION
#########################################
detect_backend_service() {
local services=("pulse-hot-dev" "pulse" "pulse-backend")
for svc in "${services[@]}"; do
if systemctl list-unit-files --no-legend 2>/dev/null | grep -q "^${svc}\\.service"; then
echo "$svc"
return 0
fi
done
echo ""
}
detect_running_backend_service() {
local services=("pulse-hot-dev" "pulse" "pulse-backend")
for svc in "${services[@]}"; do
if systemctl is-active --quiet "$svc" 2>/dev/null; then
echo "$svc"
return 0
fi
done
echo ""
}
detect_backend_state() {
local state="{}"
local running_service=$(detect_running_backend_service)
# Check if pulse-backend service is running
if systemctl is-active --quiet pulse-backend 2>/dev/null; then
state=$(echo "$state" | jq '. + {backend_running: true, backend_type: "systemd"}')
if [[ -n "$running_service" ]]; then
local backend_type="systemd"
if [[ "$running_service" == "pulse-hot-dev" ]]; then
backend_type="hot-dev"
fi
state=$(echo "$state" | jq ". + {backend_running: true, backend_type: \"$backend_type\", backend_service: \"$running_service\"}")
# Check mock mode from logs (multiple possible indicators, look at last 2 minutes for reliability)
if sudo journalctl -u pulse-backend --since "2 minutes ago" | grep -qE "(Mock mode enabled|mockEnabled=true|mock mode trackedNodes)"; then
if sudo journalctl -u "$running_service" --since "2 minutes ago" | grep -qE "(Mock mode enabled|mockEnabled=true|mock mode trackedNodes)"; then
state=$(echo "$state" | jq '. + {mock_mode: true}')
else
state=$(echo "$state" | jq '. + {mock_mode: false}')
fi
else
state=$(echo "$state" | jq '. + {backend_running: false}')
local configured_service=$(detect_backend_service)
if [[ -n "$configured_service" ]]; then
local backend_type="systemd"
if [[ "$configured_service" == "pulse-hot-dev" ]]; then
backend_type="hot-dev"
fi
state=$(echo "$state" | jq ". + {backend_service: \"$configured_service\", backend_type: \"$backend_type\"}")
fi
fi
# Check what's configured in mock.env.local
@@ -82,6 +117,11 @@ get_full_state() {
switch_to_mock() {
echo -e "${YELLOW}Switching to mock mode...${NC}"
local service=$(detect_backend_service)
if [[ -z "$service" ]]; then
echo -e "${RED}✗ No Pulse systemd service detected${NC}"
return 1
fi
# Update mock.env.local (preferred) or mock.env
if [ -f "$ROOT_DIR/mock.env.local" ]; then
@@ -93,14 +133,14 @@ switch_to_mock() {
fi
# Restart backend
sudo systemctl restart pulse-backend
sudo systemctl restart "$service"
echo -e "${GREEN}✓ Backend restarted${NC}"
# Wait for backend to be ready
sleep 3
# Verify
if sudo journalctl -u pulse-backend --since "5 seconds ago" | grep -qE "(Mock mode enabled|mockEnabled=true|mock mode trackedNodes)"; then
if sudo journalctl -u "$service" --since "5 seconds ago" | grep -qE "(Mock mode enabled|mockEnabled=true|mock mode trackedNodes)"; then
echo -e "${GREEN}✓ Mock mode ACTIVE${NC}"
return 0
else
@@ -111,6 +151,11 @@ switch_to_mock() {
switch_to_production() {
echo -e "${YELLOW}Switching to production mode...${NC}"
local service=$(detect_backend_service)
if [[ -z "$service" ]]; then
echo -e "${RED}✗ No Pulse systemd service detected${NC}"
return 1
fi
# Sync production config first
if [ -f "$ROOT_DIR/scripts/sync-production-config.sh" ]; then
@@ -128,7 +173,7 @@ switch_to_production() {
fi
# Restart backend
sudo systemctl restart pulse-backend
sudo systemctl restart "$service"
echo -e "${GREEN}✓ Backend restarted${NC}"
# Wait for backend to be ready
@@ -211,7 +256,12 @@ cmd_prod() {
cmd_restart() {
echo -e "${YELLOW}Restarting backend...${NC}"
sudo systemctl restart pulse-backend
local service=$(detect_backend_service)
if [[ -z "$service" ]]; then
echo -e "${RED}✗ No Pulse systemd service detected${NC}"
return 1
fi
sudo systemctl restart "$service"
sleep 2
echo -e "${GREEN}✓ Backend restarted${NC}"
}

View File

@@ -75,6 +75,7 @@ kill_port() {
printf "[hot-dev] Cleaning up existing processes...\n"
sudo systemctl stop pulse-hot-dev 2>/dev/null || true
sudo systemctl stop pulse-backend 2>/dev/null || true
sudo systemctl stop pulse 2>/dev/null || true
sudo systemctl stop pulse-frontend 2>/dev/null || true
@@ -196,7 +197,8 @@ cleanup() {
pkill -f vite 2>/dev/null || true
pkill -f "npm run dev" 2>/dev/null || true
pkill -9 -x "pulse" 2>/dev/null || true
echo "Hot-dev stopped. To restart normal service, run: sudo systemctl start pulse-backend"
echo "Hot-dev stopped. To restart normal service, run: sudo systemctl start pulse"
echo "(Legacy installs may use: sudo systemctl start pulse-backend)"
}
trap cleanup INT TERM EXIT