Skip to content

Commit 144ae1e

Browse files
ThomasVitale authored and markpollack committed
Support custom template in RelevancyEvaluator
The RelevancyEvaluator is key for validating RAG flows. This pull request improves it by making the PromptTemplate configurable, improving the format of the default one, introducing a Builder, and extending the documentation with more details on how to use it. I added some unit tests. The RelevancyEvaluator is used in lots of integration tests in the project to test the QuestionAnswerAdvisor and RetrievalAugmentationAdvisor, that also help assessing the evaluator itself. Signed-off-by: Thomas Vitale <[email protected]>
1 parent 66d155c commit 144ae1e

File tree

3 files changed

+191
-59
lines changed

3 files changed

+191
-59
lines changed
Lines changed: 71 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
 * Copyright 2023-2025 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.springframework.ai.chat.evaluation;

import org.springframework.ai.chat.client.ChatClient;
import org.springframework.ai.chat.prompt.PromptTemplate;
import org.springframework.ai.evaluation.EvaluationRequest;
import org.springframework.ai.evaluation.EvaluationResponse;
import org.springframework.ai.evaluation.Evaluator;
import org.springframework.lang.Nullable;
import org.springframework.util.Assert;

import java.util.Collections;
import java.util.Map;

/**
 * Evaluates the relevancy of a response to a query based on the context provided.
 *
 * <p>The evaluation delegates to a chat model: the user question, the model's answer and
 * the supporting (retrieved) context are rendered into a {@link PromptTemplate} asking
 * for a YES/NO verdict. The default template can be replaced via {@link Builder}.
 */
public class RelevancyEvaluator implements Evaluator {

	// Default YES/NO evaluation prompt. A custom template must expose the same
	// placeholders: {query}, {response}, {context}.
	private static final PromptTemplate DEFAULT_PROMPT_TEMPLATE = new PromptTemplate("""
			Your task is to evaluate if the response for the query
			is in line with the context information provided.

			You have two options to answer. Either YES or NO.

			Answer YES, if the response for the query
			is in line with context information otherwise NO.

			Query:
			{query}

			Response:
			{response}

			Context:
			{context}

			Answer:
			""");

	private final ChatClient.Builder chatClientBuilder;

	private final PromptTemplate promptTemplate;

	/**
	 * Creates an evaluator using the default prompt template.
	 * @param chatClientBuilder builder for the chat client used to run the evaluation;
	 * must not be {@code null}
	 */
	public RelevancyEvaluator(ChatClient.Builder chatClientBuilder) {
		this(chatClientBuilder, null);
	}

	/**
	 * Canonical constructor, used by {@link Builder}.
	 * @param chatClientBuilder builder for the chat client; must not be {@code null}
	 * @param promptTemplate custom evaluation template, or {@code null} to use the default
	 * @throws IllegalArgumentException if {@code chatClientBuilder} is {@code null}
	 */
	private RelevancyEvaluator(ChatClient.Builder chatClientBuilder, @Nullable PromptTemplate promptTemplate) {
		Assert.notNull(chatClientBuilder, "chatClientBuilder cannot be null");
		this.chatClientBuilder = chatClientBuilder;
		this.promptTemplate = promptTemplate != null ? promptTemplate : DEFAULT_PROMPT_TEMPLATE;
	}

	/**
	 * Asks the chat model whether the response is relevant to the query given the
	 * supporting context, and maps a "yes" verdict to a passing score of 1.
	 * @param evaluationRequest the question, retrieved context and model response
	 * @return an {@link EvaluationResponse} with {@code pass=true} and score 1 only when
	 * the model's verdict contains "yes" (case-insensitive)
	 */
	@Override
	public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
		var response = evaluationRequest.getResponseContent();
		var context = doGetSupportingData(evaluationRequest);

		// NOTE(review): Map.of rejects null values — presumably getResponseContent() and
		// the supporting data are non-null here; confirm upstream contract.
		var userMessage = this.promptTemplate
			.render(Map.of("query", evaluationRequest.getUserText(), "response", response, "context", context));

		String evaluationResponse = this.chatClientBuilder.build().prompt().user(userMessage).call().content();

		boolean passing = false;
		float score = 0;
		// The model may return null content; treat that as a failing evaluation.
		if (evaluationResponse != null && evaluationResponse.toLowerCase().contains("yes")) {
			passing = true;
			score = 1;
		}

		return new EvaluationResponse(passing, score, "", Collections.emptyMap());
	}

	/**
	 * Returns a builder for configuring a {@link RelevancyEvaluator}.
	 */
	public static Builder builder() {
		return new Builder();
	}

	/**
	 * Builder for {@link RelevancyEvaluator}.
	 */
	public static class Builder {

		private ChatClient.Builder chatClientBuilder;

		private PromptTemplate promptTemplate;

		private Builder() {
		}

		/**
		 * Sets the chat client builder used to run the evaluation. Required.
		 */
		public Builder chatClientBuilder(ChatClient.Builder chatClientBuilder) {
			this.chatClientBuilder = chatClientBuilder;
			return this;
		}

		/**
		 * Sets a custom evaluation prompt template; when {@code null}, the default
		 * template is used. The template must declare {query}, {response} and
		 * {context} placeholders.
		 */
		public Builder promptTemplate(PromptTemplate promptTemplate) {
			this.promptTemplate = promptTemplate;
			return this;
		}

		/**
		 * Builds the evaluator.
		 * @throws IllegalArgumentException if no chat client builder was provided
		 */
		public RelevancyEvaluator build() {
			return new RelevancyEvaluator(this.chatClientBuilder, this.promptTemplate);
		}

	}

}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Copyright 2023-2025 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package org.springframework.ai.chat.evaluation;
18+
19+
import org.junit.jupiter.api.Test;
20+
import org.springframework.ai.chat.client.ChatClient;
21+
import org.springframework.ai.chat.model.ChatModel;
22+
23+
import static org.assertj.core.api.Assertions.assertThat;
24+
import static org.assertj.core.api.Assertions.assertThatThrownBy;
25+
import static org.mockito.Mockito.mock;
26+
27+
/**
28+
* Unit tests for {@link RelevancyEvaluator}.
29+
*
30+
* @author Thomas Vitale
31+
*/
32+
class RelevancyEvaluatorTests {
33+
34+
@Test
35+
void whenChatClientBuilderIsNullThenThrow() {
36+
assertThatThrownBy(() -> new RelevancyEvaluator(null)).isInstanceOf(IllegalArgumentException.class)
37+
.hasMessageContaining("chatClientBuilder cannot be null");
38+
39+
assertThatThrownBy(() -> RelevancyEvaluator.builder().chatClientBuilder(null).build())
40+
.isInstanceOf(IllegalArgumentException.class)
41+
.hasMessageContaining("chatClientBuilder cannot be null");
42+
}
43+
44+
@Test
45+
void whenPromptTemplateIsNullThenUseDefault() {
46+
RelevancyEvaluator evaluator = new RelevancyEvaluator(ChatClient.builder(mock(ChatModel.class)));
47+
assertThat(evaluator).isNotNull();
48+
49+
evaluator = RelevancyEvaluator.builder()
50+
.chatClientBuilder(ChatClient.builder(mock(ChatModel.class)))
51+
.promptTemplate(null)
52+
.build();
53+
assertThat(evaluator).isNotNull();
54+
}
55+
56+
}

spring-ai-docs/src/main/antora/modules/ROOT/pages/api/testing.adoc

Lines changed: 64 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ One method to evaluate the response is to use the AI model itself for evaluation
66

77
The Spring AI interface for evaluating responses is `Evaluator`, defined as:
88

9-
10-
119
[source,java]
1210
----
1311
@FunctionalInterface
@@ -42,58 +40,88 @@ public class EvaluationRequest {
4240
* `dataList`: Contextual data, such as from Retrieval Augmented Generation, appended to the raw input.
4341
* `responseContent`: The AI model's response content as a `String`
4442
45-
== RelevancyEvaluator
43+
== Relevancy Evaluator
4644

47-
One implementation is the `RelevancyEvaluator`, which uses the AI model for evaluation. More implementations will be available in future releases.
45+
The `RelevancyEvaluator` is an implementation of the `Evaluator` interface, designed to assess the relevance of AI-generated responses against provided context. This evaluator helps assess the quality of a RAG flow by determining if the AI model's response is relevant to the user's input with respect to the retrieved context.
4846

49-
The `RelevancyEvaluator` uses the input (`userText`) and the AI model's output (`chatResponse`) to ask the question:
47+
The evaluation is based on the user input, the AI model's response, and the context information. It uses a prompt template to ask the AI model if the response is relevant to the user input and context.
5048

51-
[source, text]
49+
This is the default prompt template used by the `RelevancyEvaluator`:
50+
51+
[source,text]
5252
----
5353
Your task is to evaluate if the response for the query
54-
is in line with the context information provided.\n
55-
You have two options to answer. Either YES/ NO.\n
56-
Answer - YES, if the response for the query
57-
is in line with context information otherwise NO.\n
58-
Query: \n {query}\n
59-
Response: \n {response}\n
60-
Context: \n {context}\n
61-
Answer: "
62-
----
54+
is in line with the context information provided.
6355
64-
Here is an example of a JUnit test that performs a RAG query over a PDF document loaded into a Vector Store and then evaluates if the response is relevant to the user text.
56+
You have two options to answer. Either YES or NO.
6557
66-
[source,java]
67-
----
68-
@Test
69-
void testEvaluation() {
58+
Answer YES, if the response for the query
59+
is in line with context information otherwise NO.
7060
71-
dataController.delete();
72-
dataController.load();
61+
Query:
62+
{query}
7363
74-
String userText = "What is the purpose of Carina?";
64+
Response:
65+
{response}
7566
76-
ChatResponse response = ChatClient.builder(chatModel)
77-
.build().prompt()
78-
.advisors(new QuestionAnswerAdvisor(vectorStore))
79-
.user(userText)
80-
.call()
81-
.chatResponse();
82-
String responseContent = response.getResult().getOutput().getContent();
67+
Context:
68+
{context}
8369
84-
var relevancyEvaluator = new RelevancyEvaluator(ChatClient.builder(chatModel));
70+
Answer:
71+
----
8572

86-
EvaluationRequest evaluationRequest = new EvaluationRequest(userText,
87-
(List<Content>) response.getMetadata().get(QuestionAnswerAdvisor.RETRIEVED_DOCUMENTS), responseContent);
73+
NOTE: You can customize the prompt template by providing your own `PromptTemplate` object via the `.promptTemplate()` builder method. See xref:_custom_template[Custom Template] for details.
8874

89-
EvaluationResponse evaluationResponse = relevancyEvaluator.evaluate(evaluationRequest);
75+
== Usage in Integration Tests
9076

91-
assertTrue(evaluationResponse.isPass(), "Response is not relevant to the question");
77+
Here is an example of usage of the `RelevancyEvaluator` in an integration test, validating the result of a RAG flow using the `RetrievalAugmentationAdvisor`:
9278

79+
[source,java]
80+
----
81+
@Test
82+
void evaluateRelevancy() {
83+
String question = "Where does the adventure of Anacletus and Birba take place?";
84+
85+
RetrievalAugmentationAdvisor ragAdvisor = RetrievalAugmentationAdvisor.builder()
86+
.documentRetriever(VectorStoreDocumentRetriever.builder()
87+
.vectorStore(pgVectorStore)
88+
.build())
89+
.build();
90+
91+
ChatResponse chatResponse = ChatClient.builder(chatModel).build()
92+
.prompt(question)
93+
.advisors(ragAdvisor)
94+
.call()
95+
.chatResponse();
96+
97+
EvaluationRequest evaluationRequest = new EvaluationRequest(
98+
// The original user question
99+
question,
100+
// The retrieved context from the RAG flow
101+
chatResponse.getMetadata().get(RetrievalAugmentationAdvisor.DOCUMENT_CONTEXT),
102+
// The AI model's response
103+
chatResponse.getResult().getOutput().getText()
104+
);
105+
106+
RelevancyEvaluator evaluator = new RelevancyEvaluator(ChatClient.builder(chatModel));
107+
108+
EvaluationResponse evaluationResponse = evaluator.evaluate(evaluationRequest);
109+
110+
assertThat(evaluationResponse.isPass()).isTrue();
93111
}
94112
----
95113

96-
The code above is from the example application located https://github.com/rd-1-2022/ai-azure-rag.git[here].
114+
You can find several integration tests in the Spring AI project that use the `RelevancyEvaluator` to test the functionality of the `QuestionAnswerAdvisor` (see https://github.com/spring-projects/spring-ai/blob/main/spring-ai-integration-tests/src/test/java/org/springframework/ai/integration/tests/client/advisor/QuestionAnswerAdvisorIT.java[tests]) and `RetrievalAugmentationAdvisor` (see https://github.com/spring-projects/spring-ai/blob/main/spring-ai-integration-tests/src/test/java/org/springframework/ai/integration/tests/client/advisor/RetrievalAugmentationAdvisorIT.java[tests]).
115+
116+
=== Custom Template
117+
118+
The `RelevancyEvaluator` uses a default template to prompt the AI model for evaluation. You can customize this behavior by providing your own `PromptTemplate` object via the `.promptTemplate()` builder method.
119+
120+
The custom `PromptTemplate` can use any `TemplateRenderer` implementation (by default, it uses `StPromptTemplate` based on the https://www.stringtemplate.org/[StringTemplate] engine). The important requirement is that the template must contain the following placeholders:
121+
122+
* a `query` placeholder to receive the user question.
123+
* a `response` placeholder to receive the AI model's response.
124+
* a `context` placeholder to receive the context information.
97125

98126
== FactCheckingEvaluator
99127

0 commit comments

Comments
 (0)