Mirror of https://github.com/rcourtman/Pulse.git, synced 2026-02-18 00:17:39 +01:00
test(ai): add unit tests for eval runner
- Add unit tests for the internal/ai/eval package
- Validate configuration, retry logic, and custom SSE parsing
- Enable coverage of the eval framework without requiring a live Pulse server
internal/ai/eval/runner_test.go (new file, 173 lines)
@@ -0,0 +1,173 @@
package eval

import (
	"context"
	"fmt"
	"net/http"
	"net/http/httptest"
	"os"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

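// TestDefaultConfig pins the out-of-the-box eval configuration: the local
// Pulse endpoint, the admin user, and two retries per step.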
func TestDefaultConfig(t *testing.T) {
	cfg := DefaultConfig()
	assert.Equal(t, "http://127.0.0.1:7655", cfg.BaseURL)
	assert.Equal(t, "admin", cfg.Username)
	assert.Equal(t, 2, cfg.StepRetries)
}

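// TestSanitizeFilename covers the inputs most likely to break transcript file
// names: spaces, path separators, colons, and surrounding whitespace.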
func TestSanitizeFilename(t *testing.T) {
	tests := []struct {
		input    string
		expected string
	}{
		{"Test Scenario", "test-scenario"},
		{"Test/Scenario", "test-scenario"},
		{"Test:Scenario", "test-scenario"},
		{" Test ", "test"},
	}

	for _, tc := range tests {
		assert.Equal(t, tc.expected, sanitizeFilename(tc.input))
	}
}

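// TestRequiresExplicitTool exercises the heuristic that decides whether a
// prompt demands an explicit tool invocation (naming a pulse_* tool or asking
// for a read-only/control tool) rather than a free-form answer.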
func TestRequiresExplicitTool(t *testing.T) {
	tests := []struct {
		prompt   string
		expected bool
	}{
		{"use pulse_read please", true},
		{"check the system", false},
		{"use a read-only tool", true},
		{"use a control tool", true},
	}

	for _, tc := range tests {
		assert.Equal(t, tc.expected, requiresExplicitTool(tc.prompt), "Prompt: %s", tc.prompt)
	}
}

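// TestApplyEvalEnvOverrides verifies that EVAL_* environment variables
// override the defaults produced by DefaultConfig.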
func TestApplyEvalEnvOverrides(t *testing.T) {
	os.Setenv("EVAL_STEP_RETRIES", "5")
	os.Setenv("EVAL_RETRY_ON_PHANTOM", "false")
	defer os.Unsetenv("EVAL_STEP_RETRIES")
	defer os.Unsetenv("EVAL_RETRY_ON_PHANTOM")

	cfg := DefaultConfig()
	applyEvalEnvOverrides(&cfg)

	assert.Equal(t, 5, cfg.StepRetries)
	assert.False(t, cfg.RetryOnPhantom)
}

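// TestRunner_RunScenario drives a full single-step scenario against a mock
// Pulse server: the handler validates the request (path, Accept header, basic
// auth) and replies with a canned SSE stream covering the tool_start,
// tool_end, content, and done event types the runner must parse.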
func TestRunner_RunScenario(t *testing.T) {
	// Mock server validating the request and returning a fake SSE stream
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		assert.Equal(t, "/api/ai/chat", r.URL.Path)
		assert.Equal(t, "text/event-stream", r.Header.Get("Accept"))

		// Check basic auth
		u, p, ok := r.BasicAuth()
		assert.True(t, ok)
		assert.Equal(t, "admin", u)
		assert.Equal(t, "admin", p)

		w.Header().Set("Content-Type", "text/event-stream")

		// Send some events
		// 1. Tool call
		// Pulse internal protocol expects data to be a JSON object with "type" and "data" fields
		fmt.Fprintf(w, "data: {\"type\":\"tool_start\",\"data\":{\"id\":\"call_1\",\"name\":\"pulse_read\",\"input\":\"\"}}\n\n")

		// 2. Tool output
		fmt.Fprintf(w, "data: {\"type\":\"tool_end\",\"data\":{\"id\":\"call_1\",\"name\":\"pulse_read\",\"output\":\"output\",\"success\":true}}\n\n")

		// 3. Content
		fmt.Fprintf(w, "data: {\"type\":\"content\",\"data\":{\"text\":\"Hello world\"}}\n\n")

		// 4. Done
		fmt.Fprintf(w, "data: {\"type\":\"done\",\"data\":{}}\n\n")
	}))
	defer server.Close()

	cfg := DefaultConfig()
	cfg.BaseURL = server.URL
	cfg.Verbose = false
	runner := NewRunner(cfg)

	scenario := Scenario{
		Name: "Test Scenario",
		Steps: []Step{
			{Name: "Step 1", Prompt: "Hello"},
		},
	}

	result := runner.RunScenario(scenario)

	assert.True(t, result.Passed)
	require.Len(t, result.Steps, 1)
	step := result.Steps[0]
	assert.Equal(t, "Hello world", step.Content)
	require.Len(t, step.ToolCalls, 1)
	assert.Equal(t, "pulse_read", step.ToolCalls[0].Name)
}

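// TestRunner_ShouldRetry exercises phantom detection: a step whose response
// claims the tools were unavailable while no tool calls were recorded should
// be retried; a normal response should not.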
func TestRunner_ShouldRetry(t *testing.T) {
	cfg := DefaultConfig()
	cfg.RetryOnPhantom = true
	runner := NewRunner(cfg)

	// Case 1: Phantom detection
	res := &StepResult{
		Content:   "I apologize, but I wasn't able to access the infrastructure tools needed to complete that request",
		ToolCalls: []ToolCallEvent{},
	}
	retry, reason := runner.shouldRetryStep(res, Step{})
	assert.True(t, retry)
	assert.Equal(t, "phantom_detection", reason)

	// Case 2: Success
	res = &StepResult{
		Content: "OK",
	}
	retry, _ = runner.shouldRetryStep(res, Step{})
	assert.False(t, retry)
}

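// TestRunner_UpdateAISettings round-trips the AI settings endpoints against a
// stub server: a GET to read the current patrol model and a PUT to update it.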
func TestRunner_UpdateAISettings(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.Method == http.MethodGet {
			w.Write([]byte(`{"patrol_model": "old-model"}`))
			return
		}
		if r.Method == http.MethodPut {
			w.Write([]byte("{}"))
			return
		}
	}))
	defer server.Close()

	cfg := DefaultConfig()
	cfg.BaseURL = server.URL
	runner := NewRunner(cfg)

	// Test Get
	settings, err := runner.getAISettings(context.Background())
	require.NoError(t, err)
	assert.Equal(t, "old-model", settings.PatrolModel)

	// Test Update
	update := "new-model"
	err = runner.updateAISettings(context.Background(), aiSettingsUpdateRequest{PatrolModel: &update})
	require.NoError(t, err)
}

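// TestNormalizeModelString checks that blank input normalizes to the empty
// string and that a bare model name gains a provider prefix.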
func TestNormalizeModelString(t *testing.T) {
	// ParseModelString is expected to default to the openai provider when no
	// provider prefix is given
	assert.Equal(t, "", normalizeModelString(" "))
	assert.Equal(t, "openai:gpt-4", normalizeModelString("gpt-4"))
}