Skip to content

Commit d01ae4d

Browse files
committed
test: add smoke test runner
Signed-off-by: Nick Hale <[email protected]>
1 parent a875d29 commit d01ae4d

12 files changed

+14019
-1
lines changed

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ require (
3232
golang.org/x/sync v0.7.0
3333
golang.org/x/term v0.20.0
3434
gopkg.in/yaml.v3 v3.0.1
35+
gotest.tools/v3 v3.5.1
3536
sigs.k8s.io/yaml v1.4.0
3637
)
3738

@@ -107,6 +108,5 @@ require (
107108
golang.org/x/sys v0.20.0 // indirect
108109
golang.org/x/text v0.15.0 // indirect
109110
golang.org/x/tools v0.20.0 // indirect
110-
gotest.tools/v3 v3.5.1 // indirect
111111
mvdan.cc/gofumpt v0.6.0 // indirect
112112
)

pkg/tests/judge/judge.go

+129
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
package judge
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
8+
"github.com/getkin/kin-openapi/openapi3gen"
9+
openai "github.com/gptscript-ai/chat-completion-client"
10+
)
11+
12+
// instructions is the system-prompt template given to the judging model.
//
// It contains exactly one %s verb, which New fills with the JSONSchema that
// describes the comparison payload. The prompt tells the model to answer with
// a JSON object matching the ruling type (fields "equal" and "reasoning").
const instructions = `When given JSON objects that conform to the following JSONSchema:

%s

Determine if "actual" is equal to "expected" based on the comparison constraints described by "criteria".
"actual" is considered equal to "expected" if and only if the all of the constraints described by "criteria" are satisfied.

After making a determination, respond with a JSON object that conforms to the following JSONSchema:

{
"name": "ruling",
"type": "object",
"properties": {
"equal": {
"type": "boolean",
"description": "Set to true if and only if actual is considered equal to expected."
},
"reasoning": {
"type": "string",
"description": "The reasoning used to come to the determination, that points out all instances where the given criteria was violated"
}
},
"required": [
"equal",
"reasoning"
]
}

Your responses are concise and include only the json object described above.
`
42+
43+
type Judge[T any] struct {
44+
client *openai.Client
45+
instructions string
46+
}
47+
48+
// comparison is the JSON payload sent to the judging model as the user
// message: the two values to compare plus the criteria for comparing them.
type comparison[T any] struct {
	Expected T      `json:"expected"`
	Actual   T      `json:"actual"`
	Criteria string `json:"criteria"`
}
53+
54+
// ruling is the JSON object the judging model is instructed to respond with.
type ruling struct {
	// Equal reports whether the model considered actual equal to expected.
	Equal bool `json:"equal"`
	// Reasoning is the model's explanation for its determination.
	Reasoning string `json:"reasoning"`
}
58+
59+
func New[T any](client *openai.Client) (*Judge[T], error) {
60+
schema, err := openapi3gen.NewSchemaRefForValue(
61+
new(comparison[T]),
62+
nil,
63+
openapi3gen.CreateComponentSchemas(
64+
openapi3gen.ExportComponentSchemasOptions{
65+
ExportComponentSchemas: true,
66+
// ExportTopLevelSchema: true,
67+
// ExportGenerics: false,
68+
ExportGenerics: false,
69+
}),
70+
)
71+
if err != nil {
72+
return nil, fmt.Errorf("failed to generate JSONSchema for %T: %w", new(T), err)
73+
}
74+
75+
schemaJSON, err := json.MarshalIndent(schema, "", " ")
76+
if err != nil {
77+
return nil, fmt.Errorf("failed to marshal JSONSchema for %T: %w", new(T), err)
78+
}
79+
80+
return &Judge[T]{
81+
client: client,
82+
instructions: fmt.Sprintf(instructions, schemaJSON),
83+
}, nil
84+
}
85+
86+
func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
87+
comparisonJSON, err := json.MarshalIndent(&comparison[T]{
88+
Expected: expected,
89+
Actual: actual,
90+
Criteria: criteria,
91+
}, "", " ")
92+
if err != nil {
93+
return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
94+
}
95+
96+
request := openai.ChatCompletionRequest{
97+
Model: openai.GPT4o,
98+
Temperature: new(float32),
99+
N: 1,
100+
ResponseFormat: &openai.ChatCompletionResponseFormat{
101+
Type: openai.ChatCompletionResponseFormatTypeJSONObject,
102+
},
103+
Messages: []openai.ChatCompletionMessage{
104+
{
105+
Role: openai.ChatMessageRoleSystem,
106+
Content: j.instructions,
107+
},
108+
{
109+
Role: openai.ChatMessageRoleUser,
110+
Content: string(comparisonJSON),
111+
},
112+
},
113+
}
114+
response, err := j.client.CreateChatCompletion(ctx, request)
115+
if err != nil {
116+
return false, "", fmt.Errorf("failed to make judge chat completion request: %w", err)
117+
}
118+
119+
if len(response.Choices) < 1 {
120+
return false, "", fmt.Errorf("judge chat completion request returned no choices")
121+
}
122+
123+
var equality ruling
124+
if err := json.Unmarshal([]byte(response.Choices[0].Message.Content), &equality); err != nil {
125+
return false, "", fmt.Errorf("failed to unmarshal judge ruling: %w", err)
126+
}
127+
128+
return equality.Equal, equality.Reasoning, nil
129+
}

pkg/tests/smoke/smoke_test.go

+182
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
//go:build smoke
2+
3+
package smoke
4+
5+
import (
6+
"bufio"
7+
"context"
8+
"encoding/json"
9+
"fmt"
10+
"os"
11+
"path/filepath"
12+
"strings"
13+
"testing"
14+
15+
openai "github.com/gptscript-ai/chat-completion-client"
16+
"github.com/gptscript-ai/gptscript/pkg/runner"
17+
"github.com/gptscript-ai/gptscript/pkg/tests/judge"
18+
"github.com/gptscript-ai/gptscript/pkg/types"
19+
"github.com/samber/lo"
20+
"github.com/stretchr/testify/assert"
21+
"github.com/stretchr/testify/require"
22+
"gotest.tools/v3/icmd"
23+
)
24+
25+
// defaultModelEnvVar names the environment variable whose value is passed to
// gptscript as --default-model and used to derive the event file names.
const defaultModelEnvVar = "GPTSCRIPT_DEFAULT_MODEL"
26+
27+
func TestSmoke(t *testing.T) {
28+
// TODO(njhale): Configure this client correctly
29+
client := openai.NewClient(os.Getenv("OPENAI_API_KEY"))
30+
smokeJudge, err := judge.New[[]event](client)
31+
require.NoError(t, err, "error initializing smoke test judge")
32+
33+
for _, tc := range getTestcases(t) {
34+
t.Run(tc.name, func(t *testing.T) {
35+
cmd := icmd.Command(
36+
"gptscript",
37+
"--color=false",
38+
"--disable-cache",
39+
"--events-stream-to",
40+
tc.actualEventsFile,
41+
"--default-model",
42+
tc.defaultModel,
43+
tc.gptFile,
44+
)
45+
46+
result := icmd.RunCmd(cmd)
47+
defer func() {
48+
t.Helper()
49+
assert.NoError(t, os.Remove(tc.actualEventsFile))
50+
}()
51+
52+
require.NoError(t, result.Error, "stderr: %q", result.Stderr())
53+
require.Zero(t, result.ExitCode)
54+
55+
var (
56+
actualEvents = getActualEvents(t, tc.actualEventsFile)
57+
expectedEvents = make([]event, 0)
58+
)
59+
f, err := os.Open(tc.expectedEventsFile)
60+
if os.IsNotExist(err) {
61+
// No expected events found, store the results of the latest call as the golden file for future tests runs
62+
f, err := os.Create(tc.expectedEventsFile)
63+
require.NoError(t, err)
64+
defer f.Close()
65+
66+
encoder := json.NewEncoder(f)
67+
encoder.SetIndent("", " ")
68+
require.NoError(t, encoder.Encode(actualEvents))
69+
t.Skipf("Generated initial golden file %q, skipping test", tc.expectedEventsFile)
70+
} else {
71+
require.NoError(t, err)
72+
defer f.Close()
73+
74+
decoder := json.NewDecoder(f)
75+
require.NoError(t, decoder.Decode(&expectedEvents))
76+
}
77+
78+
ctx, cancel := context.WithCancel(context.Background())
79+
defer cancel()
80+
81+
equal, reasoning, err := smokeJudge.Equal(
82+
ctx,
83+
expectedEvents,
84+
actualEvents,
85+
`Actual and expected must have semantically equivalent elements.
86+
Ignore fields with timestamp values, except in json strings.
87+
The final elements must have semantically equivalent output values.
88+
`,
89+
)
90+
require.NoError(t, err, "error getting judge ruling on output")
91+
require.True(t, equal, reasoning)
92+
t.Logf("reasoning: %q", reasoning)
93+
})
94+
}
95+
}
96+
97+
// testcase describes one smoke test: a gptscript file to run and the files
// that hold the events it produced and the events it is expected to produce.
type testcase struct {
	// name is the testcase directory name; it is used as the subtest name.
	name string
	// dir is the testcase directory, relative to the package directory.
	dir string
	// gptFile is the path of the .gpt file to run.
	gptFile string
	// defaultModel is the value passed to gptscript as --default-model.
	defaultModel string
	// modelName is the part of defaultModel before its first space.
	modelName string
	// env is not populated or read by the visible code.
	// NOTE(review): presumably extra environment for the command; confirm or remove.
	env []string
	// actualEventsFile is where gptscript streams the events of the current run.
	actualEventsFile string
	// expectedEventsFile is the golden file the actual events are judged against.
	expectedEventsFile string
}
107+
108+
func getTestcases(t *testing.T) []testcase {
109+
t.Helper()
110+
111+
defaultModel := os.Getenv(defaultModelEnvVar)
112+
modelName := strings.Split(defaultModel, " ")[0]
113+
114+
var testcases []testcase
115+
for _, d := range lo.Must(os.ReadDir("testdata")) {
116+
if !d.IsDir() {
117+
continue
118+
}
119+
var (
120+
dirName = d.Name()
121+
dir = filepath.Join("testdata", dirName)
122+
)
123+
124+
files, err := os.ReadDir(dir)
125+
require.NoError(t, err, "failed to get testdata dir %q", dir)
126+
127+
for _, f := range files {
128+
if f.IsDir() || filepath.Ext(f.Name()) != ".gpt" {
129+
continue
130+
}
131+
132+
testcases = append(testcases, testcase{
133+
name: dirName,
134+
dir: dir,
135+
gptFile: filepath.Join(dir, f.Name()),
136+
defaultModel: defaultModel,
137+
modelName: modelName,
138+
expectedEventsFile: filepath.Join(dir, fmt.Sprintf("%s-expected.json", modelName)),
139+
actualEventsFile: filepath.Join(dir, fmt.Sprintf("%s.json", modelName)),
140+
})
141+
142+
// Only take the first .gpt file in each testcase directory
143+
break
144+
}
145+
}
146+
147+
return testcases
148+
}
149+
150+
type event struct {
151+
runner.Event
152+
ChatRequest *openai.ChatCompletionRequest `json:"chatRequest,omitempty"`
153+
ChatResponse *types.CompletionMessage `json:"chatResponse,omitempty"`
154+
}
155+
156+
func getActualEvents(t *testing.T, eventsFile string) []event {
157+
t.Helper()
158+
159+
f, err := os.Open(eventsFile)
160+
require.NoError(t, err)
161+
defer f.Close()
162+
163+
var (
164+
events []event
165+
scanner = bufio.NewScanner(f)
166+
)
167+
for scanner.Scan() {
168+
line := scanner.Text()
169+
// Skip blank lines
170+
if strings.TrimSpace(line) == "" {
171+
continue
172+
}
173+
174+
var e event
175+
require.NoError(t, json.Unmarshal([]byte(line), &e))
176+
events = append(events, e)
177+
}
178+
179+
require.NoError(t, scanner.Err())
180+
181+
return events
182+
}

pkg/tests/smoke/testdata/.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Ignore intermediate event stream JSON files
2+
*-events.json
3+

0 commit comments

Comments
 (0)