Commit 7ff4d3b

test: add smoke test runner and initial test cases
Signed-off-by: Nick Hale <[email protected]>
1 parent a875d29 commit 7ff4d3b

12 files changed: +14016 -1 lines

go.mod

+1 -1
@@ -32,6 +32,7 @@ require (
     golang.org/x/sync v0.7.0
     golang.org/x/term v0.20.0
     gopkg.in/yaml.v3 v3.0.1
+    gotest.tools/v3 v3.5.1
     sigs.k8s.io/yaml v1.4.0
 )

@@ -107,6 +108,5 @@ require (
     golang.org/x/sys v0.20.0 // indirect
     golang.org/x/text v0.15.0 // indirect
     golang.org/x/tools v0.20.0 // indirect
-    gotest.tools/v3 v3.5.1 // indirect
     mvdan.cc/gofumpt v0.6.0 // indirect
 )

pkg/tests/judge/judge.go

+127
@@ -0,0 +1,127 @@
package judge

import (
    "context"
    "encoding/json"
    "fmt"

    "github.com/getkin/kin-openapi/openapi3gen"
    openai "github.com/gptscript-ai/chat-completion-client"
)

const instructions = `When given JSON objects that conform to the following JSONSchema:

%s

Determine if "actual" is equal to "expected" based on the comparison constraints described by "criteria".
"actual" is considered equal to "expected" if and only if all of the constraints described by "criteria" are satisfied.

After making a determination, respond with a JSON object that conforms to the following JSONSchema:

{
  "name": "ruling",
  "type": "object",
  "properties": {
    "equal": {
      "type": "boolean",
      "description": "Set to true if and only if actual is considered equal to expected."
    },
    "reasoning": {
      "type": "string",
      "description": "The reasoning used to come to the determination, that points out all instances where the given criteria was violated"
    }
  },
  "required": [
    "equal",
    "reasoning"
  ]
}

Your responses are concise and include only the json object described above.
`

// Judge uses an OpenAI-compatible chat model to decide whether two values of
// type T are equal under a given set of comparison criteria.
type Judge[T any] struct {
    client       *openai.Client
    instructions string
}

type comparison[T any] struct {
    Expected T      `json:"expected"`
    Actual   T      `json:"actual"`
    Criteria string `json:"criteria"`
}

type ruling struct {
    Equal     bool   `json:"equal"`
    Reasoning string `json:"reasoning"`
}

// New builds a Judge for type T, embedding a generated JSONSchema for the
// comparison payload into the judge's system instructions.
func New[T any](client *openai.Client) (*Judge[T], error) {
    schema, err := openapi3gen.NewSchemaRefForValue(
        new(comparison[T]),
        nil,
        openapi3gen.CreateComponentSchemas(
            openapi3gen.ExportComponentSchemasOptions{
                ExportComponentSchemas: true,
                ExportGenerics:         false,
            }),
    )
    if err != nil {
        return nil, fmt.Errorf("failed to generate JSONSchema for %T: %w", new(T), err)
    }

    schemaJSON, err := json.MarshalIndent(schema, "", " ")
    if err != nil {
        return nil, fmt.Errorf("failed to marshal JSONSchema for %T: %w", new(T), err)
    }

    return &Judge[T]{
        client:       client,
        instructions: fmt.Sprintf(instructions, schemaJSON),
    }, nil
}

// Equal asks the model to compare expected and actual under the given criteria,
// returning its ruling along with the reasoning behind it.
func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
    comparisonJSON, err := json.MarshalIndent(&comparison[T]{
        Expected: expected,
        Actual:   actual,
        Criteria: criteria,
    }, "", " ")
    if err != nil {
        return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
    }

    request := openai.ChatCompletionRequest{
        Model:       openai.GPT4o,
        Temperature: new(float32),
        N:           1,
        ResponseFormat: &openai.ChatCompletionResponseFormat{
            Type: openai.ChatCompletionResponseFormatTypeJSONObject,
        },
        Messages: []openai.ChatCompletionMessage{
            {
                Role:    openai.ChatMessageRoleSystem,
                Content: j.instructions,
            },
            {
                Role:    openai.ChatMessageRoleUser,
                Content: string(comparisonJSON),
            },
        },
    }
    response, err := j.client.CreateChatCompletion(ctx, request)
    if err != nil {
        return false, "", fmt.Errorf("failed to make judge chat completion request: %w", err)
    }

    if len(response.Choices) < 1 {
        return false, "", fmt.Errorf("judge chat completion request returned no choices")
    }

    var equality ruling
    if err := json.Unmarshal([]byte(response.Choices[0].Message.Content), &equality); err != nil {
        return false, "", fmt.Errorf("failed to unmarshal judge ruling: %w", err)
    }

    return equality.Equal, equality.Reasoning, nil
}
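
For orientation, a minimal sketch of how this judge could be exercised on its own. The answer type, its sample values, and the criteria string here are hypothetical, invented for the example; openai.NewClient, judge.New, and Judge.Equal are used exactly as in the code above and in the smoke test below.

package main

import (
    "context"
    "fmt"
    "os"

    openai "github.com/gptscript-ai/chat-completion-client"
    "github.com/gptscript-ai/gptscript/pkg/tests/judge"
)

// answer is a hypothetical type, defined only for this sketch.
type answer struct {
    Text string `json:"text"`
}

func main() {
    // Client construction mirrors the smoke test below.
    client := openai.NewClient(os.Getenv("OPENAI_API_KEY"))

    // New generates a JSONSchema for comparison[answer] and bakes it into the judge's instructions.
    j, err := judge.New[answer](client)
    if err != nil {
        panic(err)
    }

    // Equal returns the model's ruling plus the reasoning behind it.
    equal, reasoning, err := j.Equal(
        context.Background(),
        answer{Text: "The capital of France is Paris."},
        answer{Text: "Paris is the capital of France."},
        "Actual and expected must convey the same meaning; exact wording may differ.",
    )
    if err != nil {
        panic(err)
    }
    fmt.Printf("equal=%v reasoning=%q\n", equal, reasoning)
}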

pkg/tests/smoke/smoke_test.go

+181
@@ -0,0 +1,181 @@
//go:build smoke

package smoke

import (
    "bufio"
    "context"
    "encoding/json"
    "fmt"
    "os"
    "path/filepath"
    "strings"
    "testing"

    openai "github.com/gptscript-ai/chat-completion-client"
    "github.com/gptscript-ai/gptscript/pkg/runner"
    "github.com/gptscript-ai/gptscript/pkg/tests/judge"
    "github.com/gptscript-ai/gptscript/pkg/types"
    "github.com/samber/lo"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"
    "gotest.tools/v3/icmd"
)

const defaultModelEnvVar = "GPTSCRIPT_DEFAULT_MODEL"

// TestSmoke runs each testdata script through the gptscript CLI and asks the
// judge to compare the streamed events against a per-model golden file.
func TestSmoke(t *testing.T) {
    client := openai.NewClient(os.Getenv("OPENAI_API_KEY"))
    smokeJudge, err := judge.New[[]event](client)
    require.NoError(t, err, "error initializing smoke test judge")

    for _, tc := range getTestcases(t) {
        t.Run(tc.name, func(t *testing.T) {
            cmd := icmd.Command(
                "gptscript",
                "--color=false",
                "--disable-cache",
                "--events-stream-to",
                tc.actualEventsFile,
                "--default-model",
                tc.defaultModel,
                tc.gptFile,
            )

            result := icmd.RunCmd(cmd)
            defer func() {
                t.Helper()
                assert.NoError(t, os.Remove(tc.actualEventsFile))
            }()

            require.NoError(t, result.Error, "stderr: %q", result.Stderr())
            require.Zero(t, result.ExitCode)

            var (
                actualEvents   = getActualEvents(t, tc.actualEventsFile)
                expectedEvents = make([]event, 0)
            )
            f, err := os.Open(tc.expectedEventsFile)
            if os.IsNotExist(err) {
                // No expected events found, store the results of the latest call as the golden file for future test runs
                f, err := os.Create(tc.expectedEventsFile)
                require.NoError(t, err)
                defer f.Close()

                encoder := json.NewEncoder(f)
                encoder.SetIndent("", " ")
                require.NoError(t, encoder.Encode(actualEvents))
                t.Skipf("Generated initial golden file %q, skipping test", tc.expectedEventsFile)
            } else {
                require.NoError(t, err)
                defer f.Close()

                decoder := json.NewDecoder(f)
                require.NoError(t, decoder.Decode(&expectedEvents))
            }

            ctx, cancel := context.WithCancel(context.Background())
            defer cancel()

            equal, reasoning, err := smokeJudge.Equal(
                ctx,
                expectedEvents,
                actualEvents,
                `Actual and expected must have semantically equivalent elements.
Ignore fields with timestamp values, except in json strings.
The final elements must have semantically equivalent output values.
`,
            )
            require.NoError(t, err, "error getting judge ruling on output")
            require.True(t, equal, reasoning)
            t.Logf("reasoning: %q", reasoning)
        })
    }
}

type testcase struct {
    name               string
    dir                string
    gptFile            string
    defaultModel       string
    modelName          string
    env                []string
    actualEventsFile   string
    expectedEventsFile string
}

func getTestcases(t *testing.T) []testcase {
    t.Helper()

    defaultModel := os.Getenv(defaultModelEnvVar)
    modelName := strings.Split(defaultModel, " ")[0]

    var testcases []testcase
    for _, d := range lo.Must(os.ReadDir("testdata")) {
        if !d.IsDir() {
            continue
        }
        var (
            dirName = d.Name()
            dir     = filepath.Join("testdata", dirName)
        )

        files, err := os.ReadDir(dir)
        require.NoError(t, err, "failed to get testdata dir %q", dir)

        for _, f := range files {
            if f.IsDir() || filepath.Ext(f.Name()) != ".gpt" {
                continue
            }

            testcases = append(testcases, testcase{
                name:               dirName,
                dir:                dir,
                gptFile:            filepath.Join(dir, f.Name()),
                defaultModel:       defaultModel,
                modelName:          modelName,
                expectedEventsFile: filepath.Join(dir, fmt.Sprintf("%s-expected.json", modelName)),
                actualEventsFile:   filepath.Join(dir, fmt.Sprintf("%s.json", modelName)),
            })

            // Only take the first .gpt file in each testcase directory
            break
        }
    }

    return testcases
}

type event struct {
    runner.Event
    ChatRequest  *openai.ChatCompletionRequest `json:"chatRequest,omitempty"`
    ChatResponse *types.CompletionMessage      `json:"chatResponse,omitempty"`
}

func getActualEvents(t *testing.T, eventsFile string) []event {
    t.Helper()

    f, err := os.Open(eventsFile)
    require.NoError(t, err)
    defer f.Close()

    var (
        events  []event
        scanner = bufio.NewScanner(f)
    )
    for scanner.Scan() {
        line := scanner.Text()
        // Skip blank lines
        if strings.TrimSpace(line) == "" {
            continue
        }

        var e event
        require.NoError(t, json.Unmarshal([]byte(line), &e))
        events = append(events, e)
    }

    require.NoError(t, scanner.Err())

    return events
}
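
Because of the //go:build smoke constraint, these tests are excluded from a plain go test run; presumably they are invoked with something like go test -v -tags smoke ./pkg/tests/smoke/..., with OPENAI_API_KEY set for the judge client and GPTSCRIPT_DEFAULT_MODEL selecting both the model under test and the <model>-expected.json golden file it is compared against.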

pkg/tests/smoke/testdata/.gitignore

+3
@@ -0,0 +1,3 @@
# Ignore intermediate event stream JSON files
*-events.json
