From 17208cbf9de93bb2bbdbad0e059ea0fe141b5fbd Mon Sep 17 00:00:00 2001 From: rcourtman Date: Fri, 30 Jan 2026 19:00:10 +0000 Subject: [PATCH] docs: update AI evaluation matrix and approval workflow documentation --- .github/workflows/eval-model-matrix.yml | 63 +++ .gitignore | 2 +- analyze_coverage.py | 56 +++ cmd/eval/main.go | 520 ++++++++++++++++++++++-- docs/AI.md | 65 +++ docs/EVAL.md | 43 ++ docs/architecture/pulse-assistant.md | 24 ++ mock.env | 10 + scripts/dev-check.sh | 16 +- scripts/hot-dev.sh | 42 +- 10 files changed, 774 insertions(+), 67 deletions(-) create mode 100644 .github/workflows/eval-model-matrix.yml create mode 100644 analyze_coverage.py create mode 100644 mock.env diff --git a/.github/workflows/eval-model-matrix.yml b/.github/workflows/eval-model-matrix.yml new file mode 100644 index 000000000..9fcf5737e --- /dev/null +++ b/.github/workflows/eval-model-matrix.yml @@ -0,0 +1,63 @@ +name: Pulse AI Model Matrix + +on: + workflow_dispatch: + inputs: + scenario: + description: Scenario or collection to run (e.g. matrix, smoke, readonly, advanced) + required: true + default: matrix + models: + description: Comma-separated model list (e.g. gpt-4.1-mini,claude-3-5-sonnet,gemini-1.5-pro,ollama:llama3.1) + required: false + default: "" + providers: + description: Optional provider filter (e.g. openai,anthropic,gemini,ollama) + required: false + default: "" + base_url: + description: Pulse API base URL (e.g. 
http://127.0.0.1:7655) + required: true + +jobs: + eval: + name: Model Matrix Eval + runs-on: self-hosted + timeout-minutes: 60 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Run eval matrix + env: + EVAL_REPORT_DIR: tmp/eval-reports + PULSE_EVAL_USER: ${{ secrets.PULSE_EVAL_USER || 'admin' }} + PULSE_EVAL_PASS: ${{ secrets.PULSE_EVAL_PASS || 'admin' }} + run: | + MODEL_ARGS=("-auto-models") + if [ -n "${{ inputs.models }}" ]; then + MODEL_ARGS=("-models" "${{ inputs.models }}") + fi + if [ -n "${{ inputs.providers }}" ]; then + export EVAL_MODEL_PROVIDERS="${{ inputs.providers }}" + fi + go run ./cmd/eval \ + -scenario "${{ inputs.scenario }}" \ + "${MODEL_ARGS[@]}" \ + -url "${{ inputs.base_url }}" \ + -user "${PULSE_EVAL_USER}" \ + -pass "${PULSE_EVAL_PASS}" + + - name: Upload eval reports + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-reports + path: tmp/eval-reports + retention-days: 14 diff --git a/.gitignore b/.gitignore index 245379316..47f902bf8 100644 --- a/.gitignore +++ b/.gitignore @@ -206,6 +206,6 @@ scripts/safe-checkout.sh BACKUP_SYSTEM.md # Generated artifacts -eval +/eval test_output.txt coverage_summary.txt diff --git a/analyze_coverage.py b/analyze_coverage.py new file mode 100644 index 000000000..070f967a6 --- /dev/null +++ b/analyze_coverage.py @@ -0,0 +1,56 @@ + +import sys +import os + +def parse_coverage(filename): + if not os.path.exists(filename): + print(f"File {filename} not found") + return + + package_stmts = {} + package_covered = {} + + with open(filename, 'r') as f: + lines = f.readlines() + + current_mode = "" + for line in lines: + if line.startswith("mode:"): + current_mode = line.split()[1] + continue + + parts = line.strip().split(':') + if len(parts) != 2: + continue + + file_path = parts[0] + # Package is directory of file_path + package_name = os.path.dirname(file_path) + + metrics = 
parts[1].split() + if len(metrics) != 3: + continue + + # start_end = metrics[0] + num_stmts = int(metrics[1]) + count = int(metrics[2]) + + package_stmts[package_name] = package_stmts.get(package_name, 0) + num_stmts + if count > 0: + package_covered[package_name] = package_covered.get(package_name, 0) + num_stmts + + results = [] + for pkg, total in package_stmts.items(): + covered = package_covered.get(pkg, 0) + percent = (covered / total) * 100 if total > 0 else 0 + results.append((pkg, percent, covered, total)) + + # Sort by percentage (ascending) + results.sort(key=lambda x: x[1]) + + print("Package Coverage Report (Bottom 20):") + for pkg, pct, cov, tot in results[:20]: + print(f"{pct:6.2f}% ({cov}/{tot}) {pkg}") + +if __name__ == "__main__": + parse_coverage("coverage.out") diff --git a/cmd/eval/main.go b/cmd/eval/main.go index 4072d5dff..75be09987 100644 --- a/cmd/eval/main.go +++ b/cmd/eval/main.go @@ -9,27 +9,39 @@ // // Options: // -// -scenario string Scenario to run: smoke, readonly, enforce, routing, routing-recovery, logs, readonly-recovery, search-id, disambiguate, context-target, discovery, writeverify, strict, strict-block, strict-recovery, readonly-guardrails, noninteractive, approval, approval-approve, approval-deny, patrol, patrol-basic, patrol-investigation, patrol-finding-quality, all (default "smoke") +// -scenario string Scenario to run: smoke, readonly, enforce, routing, routing-recovery, logs, readonly-recovery, search-id, disambiguate, context-target, discovery, writeverify, strict, strict-block, strict-recovery, readonly-guardrails, noninteractive, approval, approval-approve, approval-deny, approval-combo, patrol, patrol-basic, patrol-investigation, patrol-finding-quality, patrol-signal-coverage, matrix, all (default "smoke") // -url string Pulse API base URL (default "http://127.0.0.1:7655") // -user string Username for auth (default "admin") // -pass string Password for auth (default "admin") +// -model string Model override for chat 
requests +// -models string Comma-separated list of models to run (overrides -model) +// -auto-models Auto-select latest models per provider // -list List available scenarios and exit // -quiet Only show summary, not step-by-step output package main import ( + "encoding/json" "flag" "fmt" + "io" + "net/http" "os" + "sort" + "strings" + "time" "github.com/rcourtman/pulse-go-rewrite/internal/ai/eval" ) func main() { - scenario := flag.String("scenario", "smoke", "Scenario to run: smoke, readonly, enforce, routing, routing-recovery, logs, readonly-recovery, search-id, disambiguate, context-target, discovery, writeverify, strict, strict-block, strict-recovery, readonly-guardrails, noninteractive, approval, approval-approve, approval-deny, patrol, patrol-basic, patrol-investigation, patrol-finding-quality, all") + scenario := flag.String("scenario", "smoke", "Scenario to run: smoke, readonly, enforce, routing, routing-recovery, logs, readonly-recovery, search-id, disambiguate, context-target, discovery, writeverify, guest-control, guest-idempotent, guest-discovery, guest-natural, guest-multi, readonly-filtering, read-loop-recovery, ambiguous-intent, strict, strict-block, strict-recovery, readonly-guardrails, noninteractive, approval, approval-approve, approval-deny, approval-combo, patrol, patrol-basic, patrol-investigation, patrol-finding-quality, patrol-signal-coverage, matrix, all") url := flag.String("url", "http://127.0.0.1:7655", "Pulse API base URL") user := flag.String("user", "admin", "Username for auth") pass := flag.String("pass", "admin", "Password for auth") + model := flag.String("model", "", "Model override for chat requests") + models := flag.String("models", "", "Comma-separated list of models to run (overrides -model)") + autoModels := flag.Bool("auto-models", false, "Auto-select latest models per provider") list := flag.Bool("list", false, "List available scenarios and exit") quiet := flag.Bool("quiet", false, "Only show summary, not step-by-step 
output") @@ -40,58 +52,113 @@ func main() { return } - config := eval.Config{ - BaseURL: *url, - Username: *user, - Password: *pass, - Verbose: !*quiet, + baseConfig := eval.DefaultConfig() + baseConfig.BaseURL = *url + baseConfig.Username = *user + baseConfig.Password = *pass + baseConfig.Verbose = !*quiet + + if value, ok := envBool("EVAL_PREFLIGHT"); ok { + baseConfig.Preflight = value + } + if value, ok := envInt("EVAL_PREFLIGHT_TIMEOUT"); ok && value > 0 { + baseConfig.PreflightTimeout = time.Duration(value) * time.Second + } else if baseConfig.PreflightTimeout == 0 { + baseConfig.PreflightTimeout = 15 * time.Second } - runner := eval.NewRunner(config) - - // Check for patrol scenarios first - patrolScenarios := getPatrolScenarios(*scenario) - if len(patrolScenarios) > 0 { - allPassed := true - for _, ps := range patrolScenarios { - fmt.Printf("\n>>> Running patrol scenario: %s\n", ps.Name) - fmt.Printf(">>> %s\n", ps.Description) - - result := runner.RunPatrolScenario(ps) - runner.PrintPatrolSummary(result) - - if !result.Success { - allPassed = false - } - } - - if allPassed { - fmt.Printf("\n>>> ALL PATROL SCENARIOS PASSED\n") - os.Exit(0) - } else { - fmt.Printf("\n>>> SOME PATROL SCENARIOS FAILED\n") + modelList := parseModelList(*models) + if len(modelList) == 0 && *autoModels { + autoList, details, stats, err := fetchAutoModels(baseConfig.BaseURL, baseConfig.Username, baseConfig.Password) + if err != nil { + fmt.Fprintf(os.Stderr, "Failed to auto-select models: %v\n", err) os.Exit(1) } - return + modelList = autoList + fmt.Printf(">>> Auto-selected models: %s\n", strings.Join(modelList, ", ")) + if len(stats) > 0 { + fmt.Println(">>> Auto-selection provider summary:") + providers := sortedProviders(stats) + for _, provider := range providers { + stat := stats[provider] + fmt.Printf(" - %s: %d models (%d notable)\n", provider, stat.Total, stat.Notable) + } + } + if len(details) > 0 { + fmt.Println(">>> Auto-selection details:") + for _, detail := range 
details { + meta := detail.Reason + if meta == "" { + meta = "selected" + } + fmt.Printf(" - %s: %s (%s)\n", detail.Provider, detail.ID, meta) + } + } + } + if len(modelList) == 0 { + modelList = []string{strings.TrimSpace(*model)} + } + if len(modelList) == 0 { + modelList = []string{""} } - // Standard chat scenarios + patrolScenarios := getPatrolScenarios(*scenario) scenarios := getScenarios(*scenario) - if len(scenarios) == 0 { + if len(patrolScenarios) == 0 && len(scenarios) == 0 { fmt.Fprintf(os.Stderr, "Unknown scenario: %s\n", *scenario) fmt.Fprintf(os.Stderr, "Use -list to see available scenarios\n") os.Exit(1) } allPassed := true - for _, s := range scenarios { - fmt.Printf("\n>>> Running scenario: %s\n", s.Name) - fmt.Printf(">>> %s\n", s.Description) + for _, modelID := range modelList { + config := baseConfig + config.Model = strings.TrimSpace(modelID) - result := runner.RunScenario(s) - runner.PrintSummary(result) + if config.Model != "" { + fmt.Printf("\n>>> Using model: %s\n", config.Model) + } - if !result.Passed { + if config.Preflight { + fmt.Printf(">>> Preflight enabled (timeout %s)\n", config.PreflightTimeout) + } + + runner := eval.NewRunner(config) + + if len(patrolScenarios) > 0 { + modelPassed := true + for _, ps := range patrolScenarios { + fmt.Printf("\n>>> Running patrol scenario: %s\n", ps.Name) + fmt.Printf(">>> %s\n", ps.Description) + + result := runner.RunPatrolScenario(ps) + runner.PrintPatrolSummary(result) + + if !result.Success { + modelPassed = false + } + } + + if !modelPassed { + allPassed = false + } + continue + } + + modelPassed := true + for _, s := range scenarios { + fmt.Printf("\n>>> Running scenario: %s\n", s.Name) + fmt.Printf(">>> %s\n", s.Description) + + result := runner.RunScenario(s) + runner.PrintSummary(result) + + if !result.Passed { + modelPassed = false + } + } + + if !modelPassed { allPassed = false } } @@ -121,6 +188,18 @@ func listScenarios() { fmt.Println(" context-target - Context target carryover (2 
steps)") fmt.Println(" discovery - Infrastructure discovery test (2 steps)") fmt.Println() + fmt.Println(" Guest Control:") + fmt.Println(" guest-control - Stop + start a guest via @mentions (2 steps)") + fmt.Println(" guest-idempotent - Idempotent stop (stop twice + start, 3 steps)") + fmt.Println(" guest-discovery - Stop without @mentions (discovery path, 2 steps)") + fmt.Println(" guest-natural - Natural language variations (turn off, shut down, 4 steps)") + fmt.Println(" guest-multi - Multi-mention status query (2 resources, 1 step)") + fmt.Println() + fmt.Println(" Safety & Filtering:") + fmt.Println(" readonly-filtering - Control tools excluded from read-only queries (3 steps)") + fmt.Println(" read-loop-recovery - Model produces text after budget blocks (2 steps)") + fmt.Println(" ambiguous-intent - Ambiguous requests default to read-only (3 steps)") + fmt.Println() fmt.Println(" Advanced:") fmt.Println(" troubleshoot - Multi-step troubleshooting workflow (4 steps)") fmt.Println(" deepdive - Deep investigation of a service (4 steps)") @@ -138,15 +217,18 @@ func listScenarios() { fmt.Println(" approval - Approval flow (1 step, opt-in)") fmt.Println(" approval-approve - Approval approve flow (1 step, opt-in)") fmt.Println(" approval-deny - Approval deny flow (1 step, opt-in)") + fmt.Println(" approval-combo - Approval approve + deny in one session (2 steps, opt-in)") fmt.Println() fmt.Println(" Patrol:") fmt.Println(" patrol - Run all patrol scenarios") fmt.Println(" patrol-basic - Basic patrol run (completion, tools, findings)") fmt.Println(" patrol-investigation - Investigation quality (investigate before report)") fmt.Println(" patrol-finding-quality - Finding validation (well-formed findings)") + fmt.Println(" patrol-signal-coverage - Signal-to-finding coverage scoring") fmt.Println() fmt.Println(" Collections:") fmt.Println(" all - Run all basic scenarios") + fmt.Println(" matrix - Model matrix quick run (smoke + readonly)") fmt.Println(" advanced - Run 
all advanced scenarios") fmt.Println(" full - Run everything") fmt.Println() @@ -165,6 +247,8 @@ func getPatrolScenarios(name string) []eval.PatrolScenario { return []eval.PatrolScenario{eval.PatrolInvestigationScenario()} case "patrol-finding-quality": return []eval.PatrolScenario{eval.PatrolFindingQualityScenario()} + case "patrol-signal-coverage", "patrol-quality": + return []eval.PatrolScenario{eval.PatrolSignalCoverageScenario()} default: return nil } @@ -196,6 +280,26 @@ func getScenarios(name string) []eval.Scenario { case "discovery": return []eval.Scenario{eval.DiscoveryScenario()} + // Guest control scenarios + case "guest-control": + return []eval.Scenario{eval.GuestControlStopScenario()} + case "guest-idempotent": + return []eval.Scenario{eval.GuestControlIdempotentScenario()} + case "guest-discovery": + return []eval.Scenario{eval.GuestControlDiscoveryScenario()} + case "guest-natural": + return []eval.Scenario{eval.GuestControlNaturalLanguageScenario()} + case "guest-multi": + return []eval.Scenario{eval.GuestControlMultiMentionScenario()} + + // Safety & filtering scenarios + case "readonly-filtering": + return []eval.Scenario{eval.ReadOnlyToolFilteringScenario()} + case "read-loop-recovery": + return []eval.Scenario{eval.ReadLoopRecoveryScenario()} + case "ambiguous-intent": + return []eval.Scenario{eval.AmbiguousIntentScenario()} + // Advanced scenarios case "troubleshoot": return []eval.Scenario{eval.TroubleshootingScenario()} @@ -229,6 +333,8 @@ func getScenarios(name string) []eval.Scenario { return []eval.Scenario{eval.ApprovalApproveScenario()} case "approval-deny": return []eval.Scenario{eval.ApprovalDenyScenario()} + case "approval-combo": + return []eval.Scenario{eval.ApprovalComboScenario()} // Collections case "all": @@ -245,6 +351,11 @@ func getScenarios(name string) []eval.Scenario { eval.ContextTargetCarryoverScenario(), eval.DiscoveryScenario(), } + case "matrix": + return []eval.Scenario{ + eval.QuickSmokeTest(), + 
eval.ReadOnlyInfrastructureScenario(), + } case "advanced": return []eval.Scenario{ eval.TroubleshootingScenario(), @@ -255,13 +366,20 @@ func getScenarios(name string) []eval.Scenario { eval.DockerInDockerScenario(), eval.ContextChainScenario(), eval.WriteVerifyScenario(), + eval.GuestControlStopScenario(), + eval.GuestControlIdempotentScenario(), + eval.GuestControlDiscoveryScenario(), + eval.GuestControlNaturalLanguageScenario(), + eval.GuestControlMultiMentionScenario(), + eval.ReadOnlyToolFilteringScenario(), + eval.ReadLoopRecoveryScenario(), + eval.AmbiguousIntentScenario(), eval.StrictResolutionScenario(), eval.StrictResolutionBlockScenario(), eval.StrictResolutionRecoveryScenario(), eval.ReadOnlyEnforcementScenario(), eval.NonInteractiveGuardrailScenario(), - eval.ApprovalApproveScenario(), - eval.ApprovalDenyScenario(), + eval.ApprovalComboScenario(), } case "full": return []eval.Scenario{ @@ -284,15 +402,331 @@ func getScenarios(name string) []eval.Scenario { eval.DockerInDockerScenario(), eval.ContextChainScenario(), eval.WriteVerifyScenario(), + eval.GuestControlStopScenario(), + eval.GuestControlIdempotentScenario(), + eval.GuestControlDiscoveryScenario(), + eval.GuestControlNaturalLanguageScenario(), + eval.GuestControlMultiMentionScenario(), + eval.ReadOnlyToolFilteringScenario(), + eval.ReadLoopRecoveryScenario(), + eval.AmbiguousIntentScenario(), eval.StrictResolutionScenario(), eval.StrictResolutionBlockScenario(), eval.StrictResolutionRecoveryScenario(), eval.ReadOnlyEnforcementScenario(), eval.NonInteractiveGuardrailScenario(), - eval.ApprovalApproveScenario(), - eval.ApprovalDenyScenario(), + eval.ApprovalComboScenario(), } default: return nil } } + +func envBool(key string) (bool, bool) { + value, ok := os.LookupEnv(key) + if !ok { + return false, false + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "1", "true", "yes", "y", "on": + return true, true + case "0", "false", "no", "n", "off": + return false, true + default: + 
return false, false + } +} + +func envInt(key string) (int, bool) { + value, ok := os.LookupEnv(key) + if !ok { + return 0, false + } + var parsed int + if _, err := fmt.Sscanf(strings.TrimSpace(value), "%d", &parsed); err != nil { + return 0, false + } + return parsed, true +} + +func parseModelList(raw string) []string { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil + } + parts := strings.Split(raw, ",") + models := make([]string, 0, len(parts)) + for _, part := range parts { + trimmed := strings.TrimSpace(part) + if trimmed == "" { + continue + } + models = append(models, trimmed) + } + return models +} + +type apiModelInfo struct { + ID string `json:"id"` + Name string `json:"name"` + Description string `json:"description,omitempty"` + Notable bool `json:"notable"` + CreatedAt int64 `json:"created_at,omitempty"` +} + +type apiModelsResponse struct { + Models []apiModelInfo `json:"models"` + Error string `json:"error,omitempty"` +} + +type providerStats struct { + Total int + Notable int +} + +type autoSelectionDetail struct { + Provider string + ID string + Name string + Notable bool + CreatedAt int64 + Reason string +} + +func fetchAutoModels(baseURL, user, pass string) ([]string, []autoSelectionDetail, map[string]providerStats, error) { + if strings.TrimSpace(baseURL) == "" { + return nil, nil, nil, fmt.Errorf("base URL is required") + } + + req, err := http.NewRequest("GET", strings.TrimRight(baseURL, "/")+"/api/ai/models", nil) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to build models request: %w", err) + } + req.SetBasicAuth(user, pass) + + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Do(req) + if err != nil { + return nil, nil, nil, fmt.Errorf("models request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return nil, nil, nil, fmt.Errorf("models request returned %d: %s", resp.StatusCode, 
strings.TrimSpace(string(body))) + } + + var payload apiModelsResponse + if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { + return nil, nil, nil, fmt.Errorf("failed to decode models response: %w", err) + } + if payload.Error != "" { + return nil, nil, nil, fmt.Errorf(payload.Error) + } + + providerFilter := parseProviderFilterWithDefault(os.Getenv("EVAL_MODEL_PROVIDERS")) + excludeKeywords := parseExcludeKeywords(os.Getenv("EVAL_MODEL_EXCLUDE_KEYWORDS")) + limit := 2 + if value, ok := envInt("EVAL_MODEL_LIMIT"); ok && value > 0 { + limit = value + } + + grouped := make(map[string][]apiModelInfo) + stats := make(map[string]providerStats) + for _, model := range payload.Models { + if model.ID == "" { + continue + } + parts := strings.SplitN(model.ID, ":", 2) + provider := parts[0] + if provider == "" { + continue + } + if len(providerFilter) > 0 && !providerFilter[provider] { + continue + } + if len(excludeKeywords) > 0 && hasAnyKeyword(model, excludeKeywords) { + continue + } + grouped[provider] = append(grouped[provider], model) + stat := stats[provider] + stat.Total++ + if model.Notable { + stat.Notable++ + } + stats[provider] = stat + } + + if len(grouped) == 0 { + return nil, nil, stats, fmt.Errorf("no models found for auto-selection") + } + + providers := make([]string, 0, len(grouped)) + for provider := range grouped { + providers = append(providers, provider) + } + sort.Strings(providers) + + seen := make(map[string]bool) + selected := make([]string, 0, len(grouped)*limit) + details := make([]autoSelectionDetail, 0, len(grouped)*limit) + for _, provider := range providers { + models := grouped[provider] + sort.Slice(models, func(i, j int) bool { + if models[i].Notable != models[j].Notable { + return models[i].Notable + } + if models[i].CreatedAt != models[j].CreatedAt { + return models[i].CreatedAt > models[j].CreatedAt + } + return models[i].ID < models[j].ID + }) + for _, model := range models { + if len(selected) >= len(grouped)*limit { 
+ break + } + if seen[model.ID] { + continue + } + seen[model.ID] = true + selected = append(selected, model.ID) + details = append(details, autoSelectionDetail{ + Provider: provider, + ID: model.ID, + Name: model.Name, + Notable: model.Notable, + CreatedAt: model.CreatedAt, + Reason: selectionReason(model, stats[provider]), + }) + if countProvider(selected, provider) >= limit { + break + } + } + } + + if len(selected) == 0 { + return nil, nil, stats, fmt.Errorf("auto-selection produced no models") + } + return selected, details, stats, nil +} + +func parseProviderFilter(raw string) map[string]bool { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil + } + out := make(map[string]bool) + for _, part := range strings.Split(raw, ",") { + trimmed := strings.TrimSpace(part) + if trimmed == "" { + continue + } + out[trimmed] = true + } + return out +} + +func parseProviderFilterWithDefault(raw string) map[string]bool { + raw = strings.TrimSpace(raw) + if raw == "" { + return map[string]bool{ + "openai": true, + "anthropic": true, + "deepseek": true, + "gemini": true, + "ollama": true, + } + } + return parseProviderFilter(raw) +} + +func parseExcludeKeywords(raw string) []string { + raw = strings.TrimSpace(raw) + if raw == "" { + return []string{ + "codex", + "openai:gpt-5.2-pro", + "image", + "vision", + "video", + "audio", + "speech", + "embed", + "embedding", + "moderation", + "rerank", + "tts", + "realtime", + "transcribe", + } + } + switch strings.ToLower(raw) { + case "0", "false", "off", "none": + return nil + } + parts := strings.Split(raw, ",") + out := make([]string, 0, len(parts)) + for _, part := range parts { + trimmed := strings.TrimSpace(part) + if trimmed == "" { + continue + } + out = append(out, strings.ToLower(trimmed)) + } + return out +} + +func hasAnyKeyword(model apiModelInfo, keywords []string) bool { + if len(keywords) == 0 { + return false + } + target := strings.ToLower(model.ID + " " + model.Name + " " + model.Description) + for _, 
keyword := range keywords { + if keyword == "" { + continue + } + if strings.Contains(target, keyword) { + return true + } + } + return false +} + +func countProvider(models []string, provider string) int { + if provider == "" { + return 0 + } + count := 0 + prefix := provider + ":" + for _, model := range models { + if strings.HasPrefix(model, prefix) { + count++ + } + } + return count +} + +func selectionReason(model apiModelInfo, stat providerStats) string { + parts := make([]string, 0, 2) + if stat.Notable == 0 { + parts = append(parts, "no notable models") + } + if model.Notable { + parts = append(parts, "notable") + } else if model.CreatedAt > 0 { + created := time.Unix(model.CreatedAt, 0).UTC().Format("2006-01-02") + parts = append(parts, "created_at="+created) + } else { + parts = append(parts, "fallback") + } + return strings.Join(parts, "; ") +} + +func sortedProviders(stats map[string]providerStats) []string { + providers := make([]string, 0, len(stats)) + for provider := range stats { + providers = append(providers, provider) + } + sort.Strings(providers) + return providers +} diff --git a/docs/AI.md b/docs/AI.md index 0896f5e4f..1ac2b3807 100644 --- a/docs/AI.md +++ b/docs/AI.md @@ -125,6 +125,35 @@ Alert-triggered analysis runs attach a timeline event to the alert, so investiga > **License note**: Kubernetes AI analysis is gated by the `kubernetes_ai` Pulse Pro feature. +## Pulse Assistant (Chat): How It Works + +Pulse Assistant is **tool-driven**. It does not "guess" system state — it calls live tools and reports their outputs. + +### The Model's Workflow (Discover → Investigate → Act) +- **Discover**: Uses `pulse_query` (or `pulse_discovery`) to find real resources and IDs. +- **Investigate**: Uses `pulse_read` to run bounded, read-only commands and check status/logs. +- **Act** (optional): Uses `pulse_control` for changes, then verifies with a read. 
+ +### Safety Gates That Make It Trustworthy +- **Strict Resolution (optional)**: When enabled, the assistant must discover a resource before it can act on it. This prevents fabricated IDs. +- **Read/Write separation**: Read-only commands go through `pulse_read`; write actions go through `pulse_control`. This keeps the workflow state machine honest. +- **Verification after writes**: After any write, the assistant must perform a read check before it can finish the response. +- **Non‑interactive guardrails**: Commands that could hang (e.g., `tail -f`) are rewritten into bounded, safe forms. +- **Approval mode**: In Controlled mode, every write requires explicit user approval. Autonomous mode is available only with Pro. + +### What You See As a User +- **Clear tool usage**: Each step shows which tool ran and what it returned. +- **Structured recovery**: If a tool is blocked, the assistant adapts (e.g., runs discovery, switches tools, or asks for approval). +- **Verified outcomes**: Changes are followed by a read check before the assistant claims success. + +## Why It's Impressive (and Reliable) + +Pulse Assistant behaves like a careful operator: +- It **grounds answers in live data** instead of assumptions. +- It **adapts** when guardrails block an action. +- It **verifies** changes before reporting success. +- It **keeps you in control** with explicit approval gates. + ## Configuration Configure in the UI: **Settings → System → AI Assistant** @@ -149,6 +178,34 @@ You can set separate models for: - Patrol (`patrol_model`) - Auto-fix remediation (`auto_fix_model`) +## Model Matrix (Pulse Assistant) + +This table summarizes the most recent **Pulse Assistant** eval runs per model. Patrol is still in development and is not scored yet. +Time/tokens reflect the combined **Smoke + Read-only** matrix run. +Transient provider errors (rate limits, unavailable chat endpoints) are skipped when rendering the table. 
+ +Update the table from eval reports: +``` +EVAL_REPORT_DIR=tmp/eval-reports go run ./cmd/eval -scenario matrix -auto-models +python3 scripts/eval/render_model_matrix.py tmp/eval-reports --write-doc docs/AI.md +``` +Or use the helper script: +``` +scripts/eval/run_model_matrix.sh +``` + + +| Model | Smoke | Read-only | Time (matrix) | Tokens (matrix) | Last run (UTC) | +| --- | --- | --- | --- | --- | --- | +| anthropic:claude-3-haiku-20240307 | ✅ | ❌ | 2m 42s | — | 2026-01-29 | +| anthropic:claude-haiku-4-5-20251001 | ✅ | ✅ | 8s | 18,923 | 2026-01-29 | +| anthropic:claude-opus-4-5-20251101 | ✅ | ✅ | 9m 31s | 1,120,530 | 2026-01-29 | +| gemini:gemini-3-flash-preview | ✅ | ✅ | 7m 4s | — | 2026-01-29 | +| gemini:gemini-3-pro-preview | ✅ | ✅ | 3m 54s | 1,914 | 2026-01-29 | +| openai:gpt-5.2 | ✅ | ✅ | 5s | 12,363 | 2026-01-29 | +| openai:gpt-5.2-chat-latest | ✅ | ✅ | 8s | 12,595 | 2026-01-29 | + + ### Testing - Test provider connectivity: `POST /api/ai/test` and `POST /api/ai/test/{provider}` @@ -202,6 +259,14 @@ Pulse uses three AI permission levels for infrastructure control: - **Controlled**: AI asks for approval before executing commands or control actions. - **Autonomous (Pro)**: AI executes actions without prompting. +### Using Approvals (Controlled Mode) + +When control level is **Controlled**, write actions pause for approval: + +- In chat, you’ll see an approval card with the proposed command. +- **Approve** to execute and verify the change, or **Deny** to cancel it. +- Only users with admin privileges can approve/deny. + ### Advanced Network Restrictions Pulse blocks AI tool HTTP fetches to loopback and link-local addresses by default. 
For local development, you can allow loopback targets: diff --git a/docs/EVAL.md b/docs/EVAL.md index f4744cf36..292d9b305 100644 --- a/docs/EVAL.md +++ b/docs/EVAL.md @@ -20,6 +20,16 @@ Run a single scenario: go run ./cmd/eval -scenario readonly ``` +Run the model matrix quick set: +``` +go run ./cmd/eval -scenario matrix +``` + +Auto-select models (latest per provider): +``` +go run ./cmd/eval -scenario matrix -auto-models +``` + ## Environment Overrides These env vars let you align the evals with your infrastructure naming: @@ -35,6 +45,10 @@ EVAL_HOMEASSISTANT_CONTAINER EVAL_MQTT_CONTAINER EVAL_ZIGBEE_CONTAINER EVAL_FRIGATE_CONTAINER +EVAL_MODEL (optional model override) +EVAL_MODEL_PROVIDERS (optional comma-separated provider filter for auto selection; defaults to openai,anthropic,deepseek,gemini,ollama) +EVAL_MODEL_LIMIT (optional per-provider limit for auto selection, default 2) +EVAL_MODEL_EXCLUDE_KEYWORDS (optional comma-separated keywords to skip models; default filters image/video/audio, codex, and specific pre-release IDs like openai:gpt-5.2-pro until chat support is live; set to "none" to disable) ``` Write/verify and strict-resolution controls: @@ -51,12 +65,15 @@ EVAL_EXPECT_APPROVAL (set to 1 to assert approval_needed event) Retry controls and reports: ``` +EVAL_HTTP_TIMEOUT (seconds, default 300) EVAL_STEP_RETRIES (default 2) EVAL_RETRY_ON_PHANTOM (default 1) EVAL_RETRY_ON_EXPLICIT_TOOL (default 1) EVAL_RETRY_ON_STREAM_FAILURE (default 1) EVAL_RETRY_ON_EMPTY_RESPONSE (default 1) EVAL_RETRY_ON_TOOL_ERRORS (default 1) +EVAL_RETRY_ON_RATE_LIMIT (default 0) +EVAL_RATE_LIMIT_COOLDOWN (seconds, optional backoff before retry) EVAL_PREFLIGHT (set to 1 to run a quick chat preflight) EVAL_PREFLIGHT_TIMEOUT (seconds, default 15) EVAL_REPORT_DIR (write JSON report per scenario) @@ -106,12 +123,38 @@ EVAL_EXPECT_APPROVAL=1 \ go run ./cmd/eval -scenario approval-deny ``` +Approval combo flow (approve + deny in one session): +``` +EVAL_EXPECT_APPROVAL=1 \ +go run 
./cmd/eval -scenario approval-combo +``` + Write then verify (safe no-op command by default): ``` EVAL_REQUIRE_WRITE_VERIFY=1 \ go run ./cmd/eval -scenario writeverify ``` +## Model Matrix Workflow + +Run the matrix and update the docs table in one step: +``` +scripts/eval/run_model_matrix.sh +``` + +Key overrides: +``` +PULSE_BASE_URL=http://127.0.0.1:7655 +PULSE_EVAL_USER=admin +PULSE_EVAL_PASS=admin +EVAL_MODEL_PROVIDERS=openai,anthropic,gemini +EVAL_MODEL_LIMIT=2 +EVAL_MODELS=anthropic:claude-haiku-4-5-20251001 +EVAL_SCENARIO=matrix +EVAL_REPORT_DIR=tmp/eval-reports +EVAL_WRITE_DOC=1 +``` + ## Notes - The evals run against live infrastructure. Use safe commands or keep the default `EVAL_WRITE_COMMAND=true`. diff --git a/docs/architecture/pulse-assistant.md b/docs/architecture/pulse-assistant.md index 68037a36a..d3a4eefca 100644 --- a/docs/architecture/pulse-assistant.md +++ b/docs/architecture/pulse-assistant.md @@ -57,6 +57,18 @@ Pulse Assistant is a **protocol-driven, safety-gated AI system** for infrastruct 3. **Writes must be verified.** FSM enforces read-after-write before final answer. 4. **Errors are recoverable.** Structured error responses enable self-correction without prompt engineering. +## 1.1 User-Visible Behavior (What Feels "Impressive") + +When you use Pulse Assistant in chat, these behaviors are deliberate and enforced by the backend: + +- **Grounded answers**: The assistant uses live tools and surfaces their outputs. +- **Discover → Investigate → Act**: It queries resources first, reads status/logs, and only then acts. +- **Verified changes**: After a write, it performs a read check before concluding. +- **Approval gates**: In Controlled mode, write actions emit approvals and wait for a decision. +- **Self‑recovery**: If blocked (routing mismatch, read‑only violation, strict resolution), it adapts and retries with a safe path. + +These are not prompt conventions — they are enforced by the FSM + tool executor. + --- ## 2. 
Core Design Principles (Invariants) @@ -88,6 +100,18 @@ Resolved resources are **session-scoped** and **in-memory only**. They are never **Enforcement:** `ResolvedContext` not serialized, rebuilt each session in `chat/session.go` +### Approval Flow (Controlled Mode) + +When `control_level=controlled`, write tools emit an approval request instead of executing: + +1. Tool returns `APPROVAL_REQUIRED: { approval_id, command, ... }` +2. Agentic loop emits `approval_needed` SSE event +3. UI or API approves/denies via `/api/ai/approvals/{id}/approve|deny` +4. On approve, the tool re-executes with `_approval_id` and proceeds +5. On deny, the assistant returns `Command denied: <reason>` + +This keeps the LLM in a proposer role while letting users explicitly authorize actions. + ### Invariant 6: Read/Write Tool Separation > **This is the most commonly violated invariant.** Read it carefully. diff --git a/mock.env b/mock.env new file mode 100644 index 000000000..5d34edd80 --- /dev/null +++ b/mock.env @@ -0,0 +1,10 @@ +# Mock Mode Configuration +PULSE_MOCK_MODE=false +PULSE_MOCK_NODES=7 +PULSE_MOCK_VMS_PER_NODE=5 +PULSE_MOCK_LXCS_PER_NODE=8 +PULSE_MOCK_DOCKER_HOSTS=3 +PULSE_MOCK_DOCKER_CONTAINERS=12 +PULSE_MOCK_RANDOM_METRICS=true +PULSE_MOCK_STOPPED_PERCENT=20 +PULSE_LICENSE_PUBLIC_KEY="OzbVzmg+TaSGt0eWzDVpn0QkqhOzJqUbOFvSF3AmuRU=" diff --git a/scripts/dev-check.sh b/scripts/dev-check.sh index 29b061a11..04f46ef91 100755 --- a/scripts/dev-check.sh +++ b/scripts/dev-check.sh @@ -17,7 +17,7 @@ if [[ "${1:-}" == "--kill" ]]; then pkill -9 -f "bin/pulse$" 2>/dev/null || true pkill -9 -f "^\./pulse$" 2>/dev/null || true pkill -f "node.*vite" 2>/dev/null || true - pkill -f "watch-backup.sh" 2>/dev/null || true + pkill -f "watch-snapshot.sh" 2>/dev/null || true sleep 2 echo -e "${GREEN}✓${NC} All dev processes stopped" exit 0 @@ -72,15 +72,15 @@ else echo -e "${YELLOW}⚠ Not running (enable in settings)${NC}" fi -# Check file backup watcher -echo -n "File backup watcher: " 
-BACKUP_PID=$(pgrep -f "watch-backup.sh" 2>/dev/null | head -1) -if [[ -n "$BACKUP_PID" ]]; then - BACKUP_COUNT=$(ls ~/.pulse-backups 2>/dev/null | wc -l | tr -d ' ') - echo -e "${GREEN}✓ Running (PID: $BACKUP_PID, $BACKUP_COUNT backups)${NC}" +# Check snapshot watcher +echo -n "Snapshot watcher: " +SNAPSHOT_PID=$(pgrep -f "watch-snapshot.sh" 2>/dev/null | head -1) +if [[ -n "$SNAPSHOT_PID" ]]; then + SNAPSHOT_COUNT=$(git -C ~/.pulse-snapshots rev-list --count HEAD 2>/dev/null || echo 0) + echo -e "${GREEN}✓ Running (PID: $SNAPSHOT_PID, $SNAPSHOT_COUNT snapshots)${NC}" else echo -e "${YELLOW}⚠ Not running (optional - protects against accidental file loss)${NC}" - echo " Start: ./scripts/watch-backup.sh &" + echo " Start: ./scripts/watch-snapshot.sh &" fi # Show recent errors diff --git a/scripts/hot-dev.sh b/scripts/hot-dev.sh index 58f973a97..6aefdc7b9 100755 --- a/scripts/hot-dev.sh +++ b/scripts/hot-dev.sh @@ -5,7 +5,7 @@ # - Go backend with auto-rebuild on file changes (via inotifywait) # - Vite frontend dev server with HMR # - Auto-detection of pulse-pro module for Pro features -# - File backup watcher (if scripts/watch-backup.sh exists) +# - Snapshot watcher (if scripts/watch-snapshot.sh exists) # # Environment Variables: # HOT_DEV_USE_PROD_DATA=true Use /etc/pulse for data (sessions, config, etc.) 
@@ -204,10 +204,14 @@ pkill -x "pulse" 2>/dev/null || true sleep 1 pkill -9 -x "pulse" 2>/dev/null || true + kill_port "${FRONTEND_DEV_PORT}" kill_port "${PULSE_DEV_API_PORT}" kill_port "${EXTRA_CLEANUP_PORT}" +# Truncate debug log +:> /tmp/pulse-debug.log + sleep 2 # Verify ports are free @@ -387,7 +391,7 @@ else fi fi -LOG_LEVEL=debug \ +LOG_LEVEL="${LOG_LEVEL:-debug}" \ FRONTEND_PORT="${PULSE_DEV_API_PORT:-7655}" \ PORT="${PULSE_DEV_API_PORT:-7655}" \ PULSE_DATA_DIR="${PULSE_DATA_DIR:-}" \ @@ -397,7 +401,9 @@ PULSE_DEV="${PULSE_DEV:-true}" \ PULSE_AUTH_USER="${PULSE_AUTH_USER:-}" \ PULSE_AUTH_PASS="${PULSE_AUTH_PASS:-}" \ ALLOWED_ORIGINS="${ALLOWED_ORIGINS:-}" \ -./pulse >> /tmp/pulse-debug.log 2>&1 & +LOG_FILE="/tmp/pulse-debug.log" \ +LOG_MAX_SIZE="50" \ +./pulse > /dev/null 2>&1 & BACKEND_PID=$! sleep 2 @@ -418,7 +424,7 @@ log_info "Starting backend health monitor..." if [[ "$PULSE_COUNT" -eq 0 ]]; then log_warn "⚠️ Pulse died unexpectedly, restarting..." - LOG_LEVEL=debug \ + LOG_LEVEL="${LOG_LEVEL:-debug}" \ FRONTEND_PORT="${PULSE_DEV_API_PORT:-7655}" \ PORT="${PULSE_DEV_API_PORT:-7655}" \ PULSE_DATA_DIR="${PULSE_DATA_DIR:-}" \ @@ -428,7 +434,9 @@ log_info "Starting backend health monitor..." PULSE_AUTH_USER="${PULSE_AUTH_USER:-}" \ PULSE_AUTH_PASS="${PULSE_AUTH_PASS:-}" \ ALLOWED_ORIGINS="${ALLOWED_ORIGINS:-}" \ - ./pulse >> /tmp/pulse-debug.log 2>&1 & + LOG_FILE="/tmp/pulse-debug.log" \ + LOG_MAX_SIZE="50" \ + ./pulse > /dev/null 2>&1 & NEW_PID=$! sleep 2 if kill -0 "$NEW_PID" 2>/dev/null; then @@ -440,7 +448,7 @@ log_info "Starting backend health monitor..." log_error "⚠️ Multiple Pulse processes detected ($PULSE_COUNT), killing all and restarting..." pkill -9 -f "^\./pulse$" 2>/dev/null || true sleep 2 - LOG_LEVEL=debug \ + LOG_LEVEL="${LOG_LEVEL:-debug}" \ FRONTEND_PORT="${PULSE_DEV_API_PORT:-7655}" \ PORT="${PULSE_DEV_API_PORT:-7655}" \ PULSE_DATA_DIR="${PULSE_DATA_DIR:-}" \ @@ -450,7 +458,9 @@ log_info "Starting backend health monitor..." 
PULSE_AUTH_USER="${PULSE_AUTH_USER:-}" \ PULSE_AUTH_PASS="${PULSE_AUTH_PASS:-}" \ ALLOWED_ORIGINS="${ALLOWED_ORIGINS:-}" \ - ./pulse >> /tmp/pulse-debug.log 2>&1 & + LOG_FILE="/tmp/pulse-debug.log" \ + LOG_MAX_SIZE="50" \ + ./pulse > /dev/null 2>&1 & NEW_PID=$! sleep 2 if kill -0 "$NEW_PID" 2>/dev/null; then @@ -479,7 +489,7 @@ log_info "Starting backend file watcher..." pkill -9 -f "^\./pulse$" 2>/dev/null || true sleep 1 - LOG_LEVEL=debug \ + LOG_LEVEL="${LOG_LEVEL:-debug}" \ FRONTEND_PORT="${PULSE_DEV_API_PORT:-7655}" \ PORT="${PULSE_DEV_API_PORT:-7655}" \ PULSE_DATA_DIR="${PULSE_DATA_DIR:-}" \ @@ -489,7 +499,9 @@ log_info "Starting backend file watcher..." PULSE_AUTH_USER="${PULSE_AUTH_USER:-}" \ PULSE_AUTH_PASS="${PULSE_AUTH_PASS:-}" \ ALLOWED_ORIGINS="${ALLOWED_ORIGINS:-}" \ - ./pulse >> /tmp/pulse-debug.log 2>&1 & + LOG_FILE="/tmp/pulse-debug.log" \ + LOG_MAX_SIZE="50" \ + ./pulse > /dev/null 2>&1 & NEW_PID=$! sleep 1 @@ -612,7 +624,7 @@ cleanup() { # Fallback cleanup pkill -f "inotifywait.*pulse" 2>/dev/null || true pkill -f "fswatch.*pulse" 2>/dev/null || true - pkill -f "watch-backup.sh" 2>/dev/null || true + pkill -f "watch-snapshot.sh" 2>/dev/null || true log_info "Hot-dev stopped." } @@ -620,12 +632,12 @@ trap cleanup INT TERM EXIT # --- Start File Backup Watcher (optional) --- -BACKUP_SCRIPT="${ROOT_DIR}/scripts/watch-backup.sh" -if [[ -x "${BACKUP_SCRIPT}" ]]; then - log_info "Starting file backup watcher..." - "${BACKUP_SCRIPT}" > /tmp/pulse-watch-backup.log 2>&1 & +SNAPSHOT_SCRIPT="${ROOT_DIR}/scripts/watch-snapshot.sh" +if [[ -x "${SNAPSHOT_SCRIPT}" ]]; then + log_info "Starting snapshot watcher..." + "${SNAPSHOT_SCRIPT}" > /tmp/pulse-watch-snapshot.log 2>&1 & BACKUP_WATCHER_PID=$! - log_info "File backups: ~/.pulse-backups (PID: ${BACKUP_WATCHER_PID})" + log_info "Snapshots: ~/.pulse-snapshots (PID: ${BACKUP_WATCHER_PID})" fi # --- Start Frontend ---