Skip to content

Commit 57d5f00

Browse files
authored
Merge pull request #457 from njhale/test/smoke
Add multi-model smoke tests
2 parents 9ee00d4 + 825bf28 commit 57d5f00

16 files changed

+16075
-2
lines changed

.github/workflows/smoke.yaml

+179
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,179 @@
1+
# Workflow identity and triggers.
# Runs for pull requests targeting main (in the base-repo context, via
# pull_request_target), for pushes to main that touch more than docs/, and
# for manual dispatches.
name: test

on:
  pull_request_target:
    types:
      - opened
      - synchronize
      - reopened
    branches: [main]
  push:
    branches: [main]
    paths-ignore:
      - 'docs/**'
  workflow_dispatch:
14+
15+
jobs:
16+
# Gate job: decides whether the smoke-test jobs may run and publishes the
# decision as the `run_smoke_tests` output ("true" or "false").
#   - push / workflow_dispatch: always "true".
#   - pull_request_target: "true" when the PR author is a member of the
#     organization, or when the PR carries the exact label "run-smoke".
check-label:
  runs-on: ubuntu-22.04
  outputs:
    run_smoke_tests: ${{ steps.check.outputs.run_smoke_tests }}
  steps:
    - name: Check if PR author is a member of the organization or has the run-smoke label
      id: check
      # Event data is handed to the script through environment variables
      # instead of being interpolated into the script text, so that values
      # coming from the event payload can never be parsed as shell code.
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        EVENT_NAME: ${{ github.event_name }}
        ORG: gptscript-ai
        AUTHOR: ${{ github.event.pull_request.user.login }}
        REPO: ${{ github.repository }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
      run: |
        run_smoke_tests=false

        case "$EVENT_NAME" in
          push|workflow_dispatch)
            # Pushes to, and manual runs against, the base repo always run.
            run_smoke_tests=true
            ;;
          pull_request_target)
            # Check for org membership: the endpoint answers 204 for members.
            membership_code=$(curl -s -o /dev/null -w "%{http_code}" \
              -H "Authorization: token $GITHUB_TOKEN" \
              "https://api.github.com/orgs/$ORG/members/$AUTHOR")

            if [ "$membership_code" -eq 204 ]; then
              run_smoke_tests=true
            else
              # Check for the "run-smoke" label. -F/-x require the whole
              # label name to match, so "dont-run-smoke" does not qualify.
              labels=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
                "https://api.github.com/repos/$REPO/issues/$PR_NUMBER/labels" | jq -r '.[].name')
              if echo "$labels" | grep -Fxq "run-smoke"; then
                run_smoke_tests=true
              fi
            fi
            ;;
        esac

        echo "run_smoke_tests=$run_smoke_tests" >> "$GITHUB_OUTPUT"
61+
62+
# Smoke tests with gpt-4o-2024-05-13 as the default model.
# Only runs when the check-label gate job reported run_smoke_tests == 'true'.
smoke-gpt-4o-2024-05-13:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        # The PR code runs with secrets in scope; do not leave the workflow
        # token in the local git config for it to read.
        persist-credentials: false
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit that triggered this run. A branch name is mutable
        # and could point at different code by the time this step executes.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - name: Run smoke test for gpt-4o-2024-05-13
      env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: gpt-4o-2024-05-13
      run: |
        echo "Running smoke test for model gpt-4o-2024-05-13"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke
90+
91+
# Smoke tests with gpt-4-turbo-2024-04-09 as the default model.
# Only runs when the check-label gate job reported run_smoke_tests == 'true'.
smoke-gpt-4-turbo-2024-04-09:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        # The PR code runs with secrets in scope; do not leave the workflow
        # token in the local git config for it to read.
        persist-credentials: false
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit that triggered this run. A branch name is mutable
        # and could point at different code by the time this step executes.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - name: Run smoke test for gpt-4-turbo-2024-04-09
      env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: gpt-4-turbo-2024-04-09
      run: |
        echo "Running smoke test for model gpt-4-turbo-2024-04-09"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke
119+
120+
# Smoke tests with claude-3-opus-20240229 (served through the
# claude3-anthropic-provider tool) as the default model.
# Only runs when the check-label gate job reported run_smoke_tests == 'true'.
smoke-claude-3-opus-20240229:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        # The PR code runs with secrets in scope; do not leave the workflow
        # token in the local git config for it to read.
        persist-credentials: false
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit that triggered this run. A branch name is mutable
        # and could point at different code by the time this step executes.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - name: Run smoke test for claude-3-opus-20240229
      env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: claude-3-opus-20240229 from github.com/gptscript-ai/claude3-anthropic-provider@tool-beta
        ANTHROPIC_API_KEY: ${{ secrets.SMOKE_ANTHROPIC_API_KEY }}
      run: |
        echo "Running smoke test for model claude-3-opus-20240229"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke
149+
150+
# Smoke tests with mistral-large-2402 (served from https://api.mistral.ai/v1)
# as the default model.
# Only runs when the check-label gate job reported run_smoke_tests == 'true'.
smoke-mistral-large-2402:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        # The PR code runs with secrets in scope; do not leave the workflow
        # token in the local git config for it to read.
        persist-credentials: false
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit that triggered this run. A branch name is mutable
        # and could point at different code by the time this step executes.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - name: Run smoke test for mistral-large-2402
      env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: mistral-large-2402 from https://api.mistral.ai/v1
        GPTSCRIPT_PROVIDER_API_MISTRAL_AI_API_KEY: ${{ secrets.SMOKE_GPTSCRIPT_PROVIDER_API_MISTRAL_AI_API_KEY }}
      run: |
        echo "Running smoke test for model mistral-large-2402"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke
179+

Makefile

+5-1
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,10 @@ tidy:
1414
test:
1515
go test -v ./...
1616

17+
# Run the smoke test suite (Go tests under ./pkg/tests/smoke guarded by the
# 'smoke' build tag). Depends on `build`, so that target runs first.
smoke: build
	go test -v -tags='smoke' ./pkg/tests/smoke/...
20+
1721
GOLANGCI_LINT_VERSION ?= v1.59.0
1822
lint:
1923
if ! command -v golangci-lint &> /dev/null; then \
@@ -52,4 +56,4 @@ validate-docs:
5256
echo "Encountered dirty repo!"; \
5357
git diff; \
5458
exit 1 \
55-
;fi
59+
;fi

go.mod

+1-1
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ require (
3232
golang.org/x/sync v0.7.0
3333
golang.org/x/term v0.20.0
3434
gopkg.in/yaml.v3 v3.0.1
35+
gotest.tools/v3 v3.5.1
3536
sigs.k8s.io/yaml v1.4.0
3637
)
3738

@@ -107,6 +108,5 @@ require (
107108
golang.org/x/sys v0.20.0 // indirect
108109
golang.org/x/text v0.15.0 // indirect
109110
golang.org/x/tools v0.20.0 // indirect
110-
gotest.tools/v3 v3.5.1 // indirect
111111
mvdan.cc/gofumpt v0.6.0 // indirect
112112
)

pkg/tests/judge/judge.go

+127
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,127 @@
1+
package judge
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
8+
"github.com/getkin/kin-openapi/openapi3gen"
9+
openai "github.com/gptscript-ai/chat-completion-client"
10+
)
11+
12+
// instructions is the system-prompt template given to the judge model.
// It contains a single %s verb, which New fills with the JSON schema generated
// for comparison[T]. The prompt tells the model to compare "actual" against
// "expected" under "criteria" and to reply with only a JSON object holding the
// "equal" and "reasoning" fields, which Equal decodes into a ruling.
const instructions = `When given JSON objects that conform to the following JSONSchema:
13+
14+
%s
15+
16+
Determine if "actual" is equal to "expected" based on the comparison constraints described by "criteria".
17+
"actual" is considered equal to "expected" if and only if the all of the constraints described by "criteria" are satisfied.
18+
19+
After making a determination, respond with a JSON object that conforms to the following JSONSchema:
20+
21+
{
22+
"name": "ruling",
23+
"type": "object",
24+
"properties": {
25+
"equal": {
26+
"type": "boolean",
27+
"description": "Set to true if and only if actual is considered equal to expected."
28+
},
29+
"reasoning": {
30+
"type": "string",
31+
"description": "The reasoning used to come to the determination, that points out all instances where the given criteria was violated"
32+
}
33+
},
34+
"required": [
35+
"equal",
36+
"reasoning"
37+
]
38+
}
39+
40+
Your responses are concise and include only the json object described above.
41+
`
42+
43+
type Judge[T any] struct {
44+
client *openai.Client
45+
instructions string
46+
}
47+
48+
type comparison[T any] struct {
49+
Expected T `json:"expected"`
50+
Actual T `json:"actual"`
51+
Criteria string `json:"criteria"`
52+
}
53+
54+
type ruling struct {
55+
Equal bool `json:"equal"`
56+
Reasoning string `json:"reasoning"`
57+
}
58+
59+
func New[T any](client *openai.Client) (*Judge[T], error) {
60+
schema, err := openapi3gen.NewSchemaRefForValue(
61+
new(comparison[T]),
62+
nil,
63+
openapi3gen.CreateComponentSchemas(
64+
openapi3gen.ExportComponentSchemasOptions{
65+
ExportComponentSchemas: true,
66+
ExportGenerics: false,
67+
}),
68+
)
69+
if err != nil {
70+
return nil, fmt.Errorf("failed to generate JSONSchema for %T: %w", new(T), err)
71+
}
72+
73+
schemaJSON, err := json.MarshalIndent(schema, "", " ")
74+
if err != nil {
75+
return nil, fmt.Errorf("failed to marshal JSONSchema for %T: %w", new(T), err)
76+
}
77+
78+
return &Judge[T]{
79+
client: client,
80+
instructions: fmt.Sprintf(instructions, schemaJSON),
81+
}, nil
82+
}
83+
84+
func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
85+
comparisonJSON, err := json.MarshalIndent(&comparison[T]{
86+
Expected: expected,
87+
Actual: actual,
88+
Criteria: criteria,
89+
}, "", " ")
90+
if err != nil {
91+
return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
92+
}
93+
94+
request := openai.ChatCompletionRequest{
95+
Model: openai.GPT4o,
96+
Temperature: new(float32),
97+
N: 1,
98+
ResponseFormat: &openai.ChatCompletionResponseFormat{
99+
Type: openai.ChatCompletionResponseFormatTypeJSONObject,
100+
},
101+
Messages: []openai.ChatCompletionMessage{
102+
{
103+
Role: openai.ChatMessageRoleSystem,
104+
Content: j.instructions,
105+
},
106+
{
107+
Role: openai.ChatMessageRoleUser,
108+
Content: string(comparisonJSON),
109+
},
110+
},
111+
}
112+
response, err := j.client.CreateChatCompletion(ctx, request)
113+
if err != nil {
114+
return false, "", fmt.Errorf("failed to make judge chat completion request: %w", err)
115+
}
116+
117+
if len(response.Choices) < 1 {
118+
return false, "", fmt.Errorf("judge chat completion request returned no choices")
119+
}
120+
121+
var equality ruling
122+
if err := json.Unmarshal([]byte(response.Choices[0].Message.Content), &equality); err != nil {
123+
return false, "", fmt.Errorf("failed to unmarshal judge ruling: %w", err)
124+
}
125+
126+
return equality.Equal, equality.Reasoning, nil
127+
}

0 commit comments

Comments
 (0)