gptscript-ai · njhale · Jun 19, 2024 · May 29, 2024 · Jun 3, 2024
diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml
@@ -0,0 +1,179 @@
+# Smoke tests. Runs for PRs targeting main, for pushes to main (except
+# docs-only changes), and on manual dispatch.
+name: test
+
+on:
+  pull_request_target:
+    # `labeled` is listed so that applying the "run-smoke" label to a PR starts
+    # a run right away, instead of only taking effect on the next push/reopen.
+    types: [opened, synchronize, reopened, labeled]
+    branches:
+      - main
+  push:
+    branches:
+      - main
+    paths-ignore:
+      - docs/**
+  workflow_dispatch:
+
+jobs:
+  # Gate job: decides whether smoke tests should run for this event and exposes
+  # the decision as the `run_smoke_tests` output ("true" or "false").
+  #   - push / workflow_dispatch: always "true"
+  #   - pull_request_target: "true" when the PR author is an org member or the
+  #     PR carries the "run-smoke" label, otherwise "false"
+  check-label:
+    runs-on: ubuntu-22.04
+    outputs:
+      run_smoke_tests: ${{ steps.check.outputs.run_smoke_tests }}
+    steps:
+      - name: Check if PR author is a member of the organization or has the run-smoke label
+        id: check
+        # Event data and the token are handed to the script as environment
+        # variables instead of being expanded into the script text, so values
+        # taken from the event payload are never parsed as shell code.
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          ORG: gptscript-ai
+          AUTHOR: ${{ github.event.pull_request.user.login }}
+          REPO: ${{ github.repository }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          case "$EVENT_NAME" in
+            push | workflow_dispatch)
+              # Run smoke tests for pushes to the base repo and for manual runs
+              echo "run_smoke_tests=true" >> "$GITHUB_OUTPUT"
+              exit 0
+              ;;
+            pull_request_target)
+              # Check for org membership; only a 204 response counts as a member
+              MEMBERSHIP_RESPONSE_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
+                -H "Authorization: token $API_TOKEN" \
+                "https://api.github.com/orgs/$ORG/members/$AUTHOR")
+
+              if [ "$MEMBERSHIP_RESPONSE_CODE" -eq 204 ]; then
+                echo "run_smoke_tests=true" >> "$GITHUB_OUTPUT"
+                exit 0
+              fi
+
+              # Check for "run-smoke" label
+              LABELS=$(curl -s -H "Authorization: token $API_TOKEN" \
+                "https://api.github.com/repos/$REPO/issues/$PR_NUMBER/labels" | jq -r '.[].name')
+              # -F -x: fixed-string, whole-line match, so only a label named
+              # exactly "run-smoke" qualifies (not e.g. "do-not-run-smoke").
+              if echo "$LABELS" | grep -Fxq "run-smoke"; then
+                # Run smoke tests for PR with the "run-smoke" label
+                echo "run_smoke_tests=true" >> "$GITHUB_OUTPUT"
+                exit 0
+              fi
+
+              ;;
+          esac
+
+          # Nothing above approved the run
+          echo "run_smoke_tests=false" >> "$GITHUB_OUTPUT"
+
+  # Runs `make smoke` with gpt-4o-2024-05-13 as the default model, but only
+  # when the check-label job reported run_smoke_tests == 'true'.
+  smoke-gpt-4o-2024-05-13:
+    needs: check-label
+    if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout base repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+      - name: Checkout PR code if running for a PR
+        if: ${{ github.event_name == 'pull_request_target' }}
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          # Pin to the exact commit that triggered this run; a branch name
+          # could be moved to different code after the gate job has passed.
+          ref: ${{ github.event.pull_request.head.sha }}
+          # PR code is executed by the steps below; do not leave the workflow
+          # token behind in the checked-out repository's git config.
+          persist-credentials: false
+      - uses: actions/setup-go@v5
+        with:
+          cache: false
+          go-version: "1.21"
+      - env:
+          OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
+          GPTSCRIPT_DEFAULT_MODEL: gpt-4o-2024-05-13
+        name: Run smoke test for gpt-4o-2024-05-13
+        run: |
+          echo "Running smoke test for model gpt-4o-2024-05-13"
+          # Put the repo's bin/ directory first on PATH for the test run
+          export PATH="$(pwd)/bin:${PATH}"
+          make smoke
+
+  # Runs `make smoke` with gpt-4-turbo-2024-04-09 as the default model, but
+  # only when the check-label job reported run_smoke_tests == 'true'.
+  smoke-gpt-4-turbo-2024-04-09:
+    needs: check-label
+    if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout base repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+      - name: Checkout PR code if running for a PR
+        if: ${{ github.event_name == 'pull_request_target' }}
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          # Pin to the exact commit that triggered this run; a branch name
+          # could be moved to different code after the gate job has passed.
+          ref: ${{ github.event.pull_request.head.sha }}
+          # PR code is executed by the steps below; do not leave the workflow
+          # token behind in the checked-out repository's git config.
+          persist-credentials: false
+      - uses: actions/setup-go@v5
+        with:
+          cache: false
+          go-version: "1.21"
+      - env:
+          OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
+          GPTSCRIPT_DEFAULT_MODEL: gpt-4-turbo-2024-04-09
+        name: Run smoke test for gpt-4-turbo-2024-04-09
+        run: |
+          echo "Running smoke test for model gpt-4-turbo-2024-04-09"
+          # Put the repo's bin/ directory first on PATH for the test run
+          export PATH="$(pwd)/bin:${PATH}"
+          make smoke
+
+  # Runs `make smoke` with claude-3-opus-20240229 (via the claude3 anthropic
+  # provider reference below) as the default model, but only when the
+  # check-label job reported run_smoke_tests == 'true'.
+  smoke-claude-3-opus-20240229:
+    needs: check-label
+    if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout base repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+      - name: Checkout PR code if running for a PR
+        if: ${{ github.event_name == 'pull_request_target' }}
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          # Pin to the exact commit that triggered this run; a branch name
+          # could be moved to different code after the gate job has passed.
+          ref: ${{ github.event.pull_request.head.sha }}
+          # PR code is executed by the steps below; do not leave the workflow
+          # token behind in the checked-out repository's git config.
+          persist-credentials: false
+      - uses: actions/setup-go@v5
+        with:
+          cache: false
+          go-version: "1.21"
+      - env:
+          OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
+          GPTSCRIPT_DEFAULT_MODEL: claude-3-opus-20240229 from github.com/gptscript-ai/claude3-anthropic-provider@tool-beta
+          ANTHROPIC_API_KEY: ${{ secrets.SMOKE_ANTHROPIC_API_KEY }}
+        name: Run smoke test for claude-3-opus-20240229
+        run: |
+          echo "Running smoke test for model claude-3-opus-20240229"
+          # Put the repo's bin/ directory first on PATH for the test run
+          export PATH="$(pwd)/bin:${PATH}"
+          make smoke
+
+  # Runs `make smoke` with mistral-large-2402 (served from the Mistral API URL
+  # below) as the default model, but only when the check-label job reported
+  # run_smoke_tests == 'true'.
+  smoke-mistral-large-2402:
+    needs: check-label
+    if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout base repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+      - name: Checkout PR code if running for a PR
+        if: ${{ github.event_name == 'pull_request_target' }}
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          # Pin to the exact commit that triggered this run; a branch name
+          # could be moved to different code after the gate job has passed.
+          ref: ${{ github.event.pull_request.head.sha }}
+          # PR code is executed by the steps below; do not leave the workflow
+          # token behind in the checked-out repository's git config.
+          persist-credentials: false
+      - uses: actions/setup-go@v5
+        with:
+          cache: false
+          go-version: "1.21"
+      - env:
+          OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
+          GPTSCRIPT_DEFAULT_MODEL: mistral-large-2402 from https://api.mistral.ai/v1
+          GPTSCRIPT_PROVIDER_API_MISTRAL_AI_API_KEY: ${{ secrets.SMOKE_GPTSCRIPT_PROVIDER_API_MISTRAL_AI_API_KEY }}
+        name: Run smoke test for mistral-large-2402
+        run: |
+          echo "Running smoke test for model mistral-large-2402"
+          # Put the repo's bin/ directory first on PATH for the test run
+          export PATH="$(pwd)/bin:${PATH}"
+          make smoke
+
diff --git a/Makefile b/Makefile
@@ -14,6 +14,10 @@ tidy:
 test:
 	go test -v ./...
 
+# Runs after the build target: executes the Go tests under ./pkg/tests/smoke
+# with the 'smoke' build tag enabled.
+smoke: build
+	go test -v -tags='smoke' ./pkg/tests/smoke/...
+
 GOLANGCI_LINT_VERSION ?= v1.59.0
 lint:
 	if ! command -v golangci-lint &> /dev/null; then \
@@ -52,4 +56,4 @@ validate-docs:
 		echo "Encountered dirty repo!"; \
 		git diff; \
 		exit 1 \
-	;fi
+	;fi
diff --git a/go.mod b/go.mod
@@ -32,6 +32,7 @@ require (
 	golang.org/x/sync v0.7.0
 	golang.org/x/term v0.20.0
 	gopkg.in/yaml.v3 v3.0.1
+	gotest.tools/v3 v3.5.1
 	sigs.k8s.io/yaml v1.4.0
 )
 
@@ -107,6 +108,5 @@ require (
 	golang.org/x/sys v0.20.0 // indirect
 	golang.org/x/text v0.15.0 // indirect
 	golang.org/x/tools v0.20.0 // indirect
-	gotest.tools/v3 v3.5.1 // indirect
 	mvdan.cc/gofumpt v0.6.0 // indirect
 )
diff --git a/pkg/tests/judge/judge.go b/pkg/tests/judge/judge.go
@@ -0,0 +1,127 @@
+package judge
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+
+	"github.com/getkin/kin-openapi/openapi3gen"
+	openai "github.com/gptscript-ai/chat-completion-client"
+)
+
+// instructions is the template for the judge's system prompt. Its single %s
+// verb is filled in by New with the generated JSONSchema describing the
+// comparison objects; the rest of the prompt asks the model to reply with a
+// JSON object carrying the "equal" and "reasoning" fields decoded into ruling.
+const instructions = `When given JSON objects that conform to the following JSONSchema:
+
+%s
+
+Determine if "actual" is equal to "expected" based on the comparison constraints described by "criteria".
+"actual" is considered equal to "expected" if and only if the all of the constraints described by "criteria" are satisfied.
+
+After making a determination, respond with a JSON object that conforms to the following JSONSchema:
+
+{
+  "name": "ruling",
+  "type": "object",
+  "properties": {
+    "equal": {
+      "type": "boolean",
+        "description": "Set to true if and only if actual is considered equal to expected."
+      },
+    "reasoning": {
+      "type": "string",
+      "description": "The reasoning used to come to the determination, that points out all instances where the given criteria was violated"
+    }
+  },
+  "required": [
+    "equal",
+    "reasoning"
+  ]
+}
+
+Your responses are concise and include only the json object described above.
+`
+
+// Judge uses a chat completion model to decide whether two values of type T
+// are equal under free-form criteria. Create one with New.
+type Judge[T any] struct {
+	// client sends the chat completion requests.
+	client       *openai.Client
+	// instructions is the system prompt with the comparison JSONSchema filled in.
+	instructions string
+}
+
+// comparison is the JSON payload sent to the judge model as the user message:
+// the two values to compare and the criteria describing how to compare them.
+type comparison[T any] struct {
+	Expected T      `json:"expected"`
+	Actual   T      `json:"actual"`
+	Criteria string `json:"criteria"`
+}
+
+// ruling is the JSON object the judge model is asked to reply with; Equal
+// decodes the model's response into it.
+type ruling struct {
+	Equal     bool   `json:"equal"`
+	Reasoning string `json:"reasoning"`
+}
+
+// New builds a Judge for values of type T that sends its requests through
+// client.
+//
+// It generates a JSONSchema for comparison[T], marshals it as indented JSON and
+// embeds it in the judge's system prompt. An error is returned if the schema
+// cannot be generated or marshaled.
+func New[T any](client *openai.Client) (*Judge[T], error) {
+	exportOpts := openapi3gen.ExportComponentSchemasOptions{
+		ExportComponentSchemas: true,
+		ExportGenerics:         false,
+	}
+
+	schemaRef, err := openapi3gen.NewSchemaRefForValue(
+		new(comparison[T]),
+		nil,
+		openapi3gen.CreateComponentSchemas(exportOpts),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("failed to generate JSONSchema for %T: %w", new(T), err)
+	}
+
+	prettySchema, err := json.MarshalIndent(schemaRef, "", "    ")
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal JSONSchema for %T: %w", new(T), err)
+	}
+
+	// Fill the prompt template's %s placeholder with the schema text.
+	judge := &Judge[T]{
+		client:       client,
+		instructions: fmt.Sprintf(instructions, prettySchema),
+	}
+	return judge, nil
+}
+
+// Equal asks the judge model whether actual equals expected under criteria.
+//
+// The three inputs are marshaled into a comparison JSON document and sent as
+// the user message, preceded by the judge's instructions as the system message.
+// The content of the first returned choice is decoded as a ruling, whose fields
+// become the equal and reasoning results. An error is returned if marshaling,
+// the request, or decoding fails, or if the response contains no choices.
+func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
+	testcase := comparison[T]{
+		Expected: expected,
+		Actual:   actual,
+		Criteria: criteria,
+	}
+	payload, err := json.MarshalIndent(&testcase, "", "    ")
+	if err != nil {
+		return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
+	}
+
+	messages := []openai.ChatCompletionMessage{
+		{Role: openai.ChatMessageRoleSystem, Content: j.instructions},
+		{Role: openai.ChatMessageRoleUser, Content: string(payload)},
+	}
+	responseFormat := &openai.ChatCompletionResponseFormat{
+		Type: openai.ChatCompletionResponseFormatTypeJSONObject,
+	}
+
+	response, err := j.client.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
+		Model:          openai.GPT4o,
+		Temperature:    new(float32), // pointer to a zero temperature
+		N:              1,
+		ResponseFormat: responseFormat,
+		Messages:       messages,
+	})
+	if err != nil {
+		return false, "", fmt.Errorf("failed to make judge chat completion request: %w", err)
+	}
+
+	if len(response.Choices) == 0 {
+		return false, "", fmt.Errorf("judge chat completion request returned no choices")
+	}
+
+	// Only the first choice is consulted.
+	var verdict ruling
+	raw := []byte(response.Choices[0].Message.Content)
+	if err := json.Unmarshal(raw, &verdict); err != nil {
+		return false, "", fmt.Errorf("failed to unmarshal judge ruling: %w", err)
+	}
+
+	return verdict.Equal, verdict.Reasoning, nil
+}