Skip to content

Add multi-model smoke tests #457

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 179 additions & 0 deletions .github/workflows/smoke.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
# Smoke-test workflow. It is triggered by pull requests against main, by pushes
# to main (docs/** is listed under paths-ignore), and manually through
# workflow_dispatch.
name: test

on:
# NOTE(review): pull_request_target (not pull_request) is used here and the
# smoke jobs below read repository secrets — confirm that running PR code with
# those secrets is intended and adequately gated by the check-label job.
# NOTE(review): `labeled` is not among the listed types, so adding the
# run-smoke label by itself presumably does not start a run — confirm.
pull_request_target:
types: [opened, synchronize, reopened]
branches:
- main
push:
branches:
- main
paths-ignore:
- docs/**
workflow_dispatch:

jobs:
# Decides whether the smoke-test jobs may run for the current event and exposes
# the decision as the `run_smoke_tests` job output ("true" or "false").
#   - push / workflow_dispatch: always "true".
#   - pull_request_target: "true" when the PR author is a member of the
#     organization, or when the PR carries a label named exactly "run-smoke".
check-label:
  runs-on: ubuntu-22.04
  outputs:
    run_smoke_tests: ${{ steps.check.outputs.run_smoke_tests }}
  steps:
    - name: Check if PR author is a member of the organization or has the run-smoke label
      id: check
      # Event data and the token are handed to the script through environment
      # variables instead of being expanded into the script text, so their
      # values can never be interpreted as shell syntax.
      env:
        EVENT_NAME: ${{ github.event_name }}
        ORG: gptscript-ai
        AUTHOR: ${{ github.event.pull_request.user.login }}
        REPO: ${{ github.repository }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        case "$EVENT_NAME" in
          push)
            # Run smoke tests for push to base repo
            echo "run_smoke_tests=true" >> "$GITHUB_OUTPUT"
            exit 0
            ;;
          workflow_dispatch)
            # Run smoke tests for manual runs against base branch
            echo "run_smoke_tests=true" >> "$GITHUB_OUTPUT"
            exit 0
            ;;
          pull_request_target)
            # Check for org membership (the API answers 204 for a member)
            MEMBERSHIP_RESPONSE_CODE=$(curl -s -o /dev/null -w "%{http_code}" -H "Authorization: token $GH_TOKEN" \
              "https://api.github.com/orgs/$ORG/members/$AUTHOR")

            if [ "$MEMBERSHIP_RESPONSE_CODE" -eq 204 ]; then
              echo "run_smoke_tests=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi

            # Check for "run-smoke" label
            LABELS=$(curl -s -H "Authorization: token $GH_TOKEN" \
              "https://api.github.com/repos/$REPO/issues/$PR_NUMBER/labels" | jq -r '.[].name')
            # -F -x: compare each label name literally and as a whole line, so
            # a label such as "do-not-run-smoke" does not enable the tests.
            if echo "$LABELS" | grep -Fqx "run-smoke"; then
              # Run smoke tests for PR with the "run-smoke" label
              echo "run_smoke_tests=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi

            ;;
        esac

        echo "run_smoke_tests=false" >> "$GITHUB_OUTPUT"

# Runs `make smoke` with gpt-4o-2024-05-13 as the default model. Skipped unless
# the check-label job reported run_smoke_tests == 'true'.
smoke-gpt-4o-2024-05-13:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  # This job may build and run code from a pull request, so the workflow token
  # is limited to reading repository contents.
  permissions:
    contents: read
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        # Do not leave the workflow token in .git/config for later steps.
        persist-credentials: false
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        persist-credentials: false
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit that triggered this run; a branch name could be
        # moved to different code between the gate check and this checkout.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: gpt-4o-2024-05-13
      name: Run smoke test for gpt-4o-2024-05-13
      run: |
        echo "Running smoke test for model gpt-4o-2024-05-13"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke

# Runs `make smoke` with gpt-4-turbo-2024-04-09 as the default model. Skipped
# unless the check-label job reported run_smoke_tests == 'true'.
smoke-gpt-4-turbo-2024-04-09:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  # This job may build and run code from a pull request, so the workflow token
  # is limited to reading repository contents.
  permissions:
    contents: read
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        # Do not leave the workflow token in .git/config for later steps.
        persist-credentials: false
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        persist-credentials: false
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit that triggered this run; a branch name could be
        # moved to different code between the gate check and this checkout.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: gpt-4-turbo-2024-04-09
      name: Run smoke test for gpt-4-turbo-2024-04-09
      run: |
        echo "Running smoke test for model gpt-4-turbo-2024-04-09"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke

# Runs `make smoke` with claude-3-opus-20240229 (served through the
# claude3-anthropic-provider tool) as the default model. Skipped unless the
# check-label job reported run_smoke_tests == 'true'.
smoke-claude-3-opus-20240229:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  # This job may build and run code from a pull request, so the workflow token
  # is limited to reading repository contents.
  permissions:
    contents: read
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        # Do not leave the workflow token in .git/config for later steps.
        persist-credentials: false
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        persist-credentials: false
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit that triggered this run; a branch name could be
        # moved to different code between the gate check and this checkout.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: claude-3-opus-20240229 from github.com/gptscript-ai/claude3-anthropic-provider@tool-beta
        ANTHROPIC_API_KEY: ${{ secrets.SMOKE_ANTHROPIC_API_KEY }}
      name: Run smoke test for claude-3-opus-20240229
      run: |
        echo "Running smoke test for model claude-3-opus-20240229"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke

# Runs `make smoke` with mistral-large-2402 (served from
# https://api.mistral.ai/v1) as the default model. Skipped unless the
# check-label job reported run_smoke_tests == 'true'.
smoke-mistral-large-2402:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  # This job may build and run code from a pull request, so the workflow token
  # is limited to reading repository contents.
  permissions:
    contents: read
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        # Do not leave the workflow token in .git/config for later steps.
        persist-credentials: false
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        persist-credentials: false
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit that triggered this run; a branch name could be
        # moved to different code between the gate check and this checkout.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: mistral-large-2402 from https://api.mistral.ai/v1
        GPTSCRIPT_PROVIDER_API_MISTRAL_AI_API_KEY: ${{ secrets.SMOKE_GPTSCRIPT_PROVIDER_API_MISTRAL_AI_API_KEY }}
      name: Run smoke test for mistral-large-2402
      run: |
        echo "Running smoke test for model mistral-large-2402"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke

6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ tidy:
# test runs the Go tests of every package under the current directory with
# verbose output.
test:
go test -v ./...

# smoke runs the Go tests under ./pkg/tests/smoke with the 'smoke' build tag
# enabled, after the build target has completed.
smoke: build
	go test -v -tags='smoke' ./pkg/tests/smoke/...

GOLANGCI_LINT_VERSION ?= v1.59.0
lint:
if ! command -v golangci-lint &> /dev/null; then \
Expand Down Expand Up @@ -52,4 +56,4 @@ validate-docs:
echo "Encountered dirty repo!"; \
git diff; \
exit 1 \
;fi
;fi
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ require (
golang.org/x/sync v0.7.0
golang.org/x/term v0.20.0
gopkg.in/yaml.v3 v3.0.1
gotest.tools/v3 v3.5.1
sigs.k8s.io/yaml v1.4.0
)

Expand Down Expand Up @@ -107,6 +108,5 @@ require (
golang.org/x/sys v0.20.0 // indirect
golang.org/x/text v0.15.0 // indirect
golang.org/x/tools v0.20.0 // indirect
gotest.tools/v3 v3.5.1 // indirect
mvdan.cc/gofumpt v0.6.0 // indirect
)
127 changes: 127 additions & 0 deletions pkg/tests/judge/judge.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package judge

import (
"context"
"encoding/json"
"fmt"

"github.com/getkin/kin-openapi/openapi3gen"
openai "github.com/gptscript-ai/chat-completion-client"
)

// instructions is the system-prompt template for the judge model. It contains
// exactly one %s verb, which New fills with the JSONSchema describing the
// comparison object the model will receive. The prompt tells the model to
// answer with a JSON object carrying a boolean "equal" and a string
// "reasoning", which Equal decodes into a ruling.
const instructions = `When given JSON objects that conform to the following JSONSchema:
%s
Determine if "actual" is equal to "expected" based on the comparison constraints described by "criteria".
"actual" is considered equal to "expected" if and only if all of the constraints described by "criteria" are satisfied.
After making a determination, respond with a JSON object that conforms to the following JSONSchema:
{
"name": "ruling",
"type": "object",
"properties": {
"equal": {
"type": "boolean",
"description": "Set to true if and only if actual is considered equal to expected."
},
"reasoning": {
"type": "string",
"description": "The reasoning used to come to the determination, that points out all instances where the given criteria was violated"
}
},
"required": [
"equal",
"reasoning"
]
}
Your responses are concise and include only the json object described above.
`

// Judge uses a chat completion model to decide whether two values of type T
// are equal under caller-supplied criteria. Create one with New.
type Judge[T any] struct {
// client is the chat completion client used to send judge requests.
client *openai.Client
// instructions is the system prompt sent with every request; New builds it
// from the instructions template and the JSONSchema of comparison[T].
instructions string
}

// comparison is the JSON payload sent to the judge model as the user message:
// the two values to compare and the criteria that define equality.
type comparison[T any] struct {
Expected T `json:"expected"`
Actual T `json:"actual"`
Criteria string `json:"criteria"`
}

// ruling is the JSON object the judge model is instructed to reply with.
type ruling struct {
// Equal is the model's verdict.
Equal bool `json:"equal"`
// Reasoning is the model's explanation for the verdict.
Reasoning string `json:"reasoning"`
}

// New builds a Judge for values of type T that sends its requests through
// client.
//
// It generates a JSONSchema for comparison[T], renders it as indented JSON and
// substitutes it into the instructions template to form the judge's system
// prompt. An error is returned if the schema cannot be generated or marshaled.
func New[T any](client *openai.Client) (*Judge[T], error) {
	exportOpts := openapi3gen.ExportComponentSchemasOptions{
		ExportComponentSchemas: true,
		ExportGenerics:         false,
	}

	schemaRef, err := openapi3gen.NewSchemaRefForValue(
		new(comparison[T]),
		nil,
		openapi3gen.CreateComponentSchemas(exportOpts),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to generate JSONSchema for %T: %w", new(T), err)
	}

	encoded, err := json.MarshalIndent(schemaRef, "", " ")
	if err != nil {
		return nil, fmt.Errorf("failed to marshal JSONSchema for %T: %w", new(T), err)
	}

	judge := &Judge[T]{
		client:       client,
		instructions: fmt.Sprintf(instructions, encoded),
	}
	return judge, nil
}

// Equal asks the judge model whether actual equals expected under criteria.
//
// expected, actual and criteria are marshaled into one JSON object and sent as
// the user message, with the judge's instructions as the system message. The
// first choice of the reply is decoded as a ruling.
//
// It returns the ruling's verdict and reasoning. On any failure (marshaling
// the input, the completion request, an empty choice list, or a reply that is
// not valid JSON) it returns false, an empty reasoning and a non-nil error.
func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
	input := comparison[T]{
		Expected: expected,
		Actual:   actual,
		Criteria: criteria,
	}
	payload, err := json.MarshalIndent(&input, "", " ")
	if err != nil {
		return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
	}

	// The request carries an explicit pointer to a zero temperature.
	var temperature float32
	messages := []openai.ChatCompletionMessage{
		{Role: openai.ChatMessageRoleSystem, Content: j.instructions},
		{Role: openai.ChatMessageRoleUser, Content: string(payload)},
	}

	resp, err := j.client.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
		Model:       openai.GPT4o,
		Temperature: &temperature,
		N:           1,
		ResponseFormat: &openai.ChatCompletionResponseFormat{
			Type: openai.ChatCompletionResponseFormatTypeJSONObject,
		},
		Messages: messages,
	})
	if err != nil {
		return false, "", fmt.Errorf("failed to make judge chat completion request: %w", err)
	}
	if len(resp.Choices) == 0 {
		return false, "", fmt.Errorf("judge chat completion request returned no choices")
	}

	var verdict ruling
	if decodeErr := json.Unmarshal([]byte(resp.Choices[0].Message.Content), &verdict); decodeErr != nil {
		return false, "", fmt.Errorf("failed to unmarshal judge ruling: %w", decodeErr)
	}

	return verdict.Equal, verdict.Reasoning, nil
}
Loading