From 0bb3dd97565414ca32bcd98b15c9886ff96c16f4 Mon Sep 17 00:00:00 2001 From: junya-takayama Date: Sat, 28 Mar 2026 15:05:30 +0900 Subject: [PATCH 1/3] Support reference answers in the OpenAI Messages Dataset --- flexeval/core/chat_dataset/openai_messages.py | 56 +++++++++++++- .../core/chat_dataset/test_openai_messages.py | 76 ++++++++++++++++++- 2 files changed, 127 insertions(+), 5 deletions(-) diff --git a/flexeval/core/chat_dataset/openai_messages.py b/flexeval/core/chat_dataset/openai_messages.py index 22b41bf4..bd44d114 100644 --- a/flexeval/core/chat_dataset/openai_messages.py +++ b/flexeval/core/chat_dataset/openai_messages.py @@ -27,6 +27,10 @@ class OpenAIMessagesDataset(ChatDataset): tool_definitions_key (str | None): Key used to extract the list of tool definitions from each JSON object. Set to `None` (default) for data without tool_calls. drop_if_last_from_assistant (bool): If true, when the last utterance is given by assistant, drop it. + If references_key is None and drop_if_last_from_assistant is True, + the last assistant utterance will be used as reference answer. + references_key (str | None): Key used to extract the reference answers from each JSON object. + Set to `None` (default) for data without reference answers. 
In Jsonl, each line must have a following structure: ```json @@ -77,6 +81,36 @@ class OpenAIMessagesDataset(ChatDataset): ] } ``` + + Example with reference answers: + ```json + { + 'messages': [ + { + 'role': 'user', + 'content': 'こんにちは。元気が出る言葉を教えて下さい。' + }, + ], + 'references': [ + 'こんなのはどうでしょう。どんどんやってください!', + 'こんなのはどうでしょう。頑張ってください!', + ], + } + ``` + + If there is only one reference answer for each conversation, + it can also be directly given as a string instead of a list: + ```json + { + 'messages': [ + { + 'role': 'user', + 'content': 'こんにちは。元気が出る言葉を教えて下さい。' + }, + ], + 'references': 'こんなのはどうでしょう。どんどんやってください!', + } + ``` """ def __init__( @@ -85,6 +119,7 @@ def __init__( message_key: str = "messages", tool_definitions_key: str | None = None, drop_if_last_from_assistant: bool = False, + references_key: str | None = None, ) -> None: self.conversations: list[ChatInstance] = [] with open(file_path) as f: @@ -95,9 +130,28 @@ def __init__( tool_dicts = sample.get(tool_definitions_key, None) messages: list[dict[str, Any]] = sample.pop(message_key) + last_assistant_content: str | None = None if drop_if_last_from_assistant and messages[-1]["role"] == "assistant": + last_assistant_content = messages[-1].get("content", None) messages = messages[:-1] - self.conversations.append(ChatInstance(messages=messages, tools=tool_dicts, extra_info=sample)) + + if references_key: + references = sample.pop(references_key, None) + if isinstance(references, str): + references = [references] + elif isinstance(references, list) and all(isinstance(ref, str) for ref in references): + pass + else: + msg = "Invalid format for references." 
+ raise ValueError(msg) + elif references_key is None and last_assistant_content: + references = [last_assistant_content] + else: + references = [] + + self.conversations.append( + ChatInstance(messages=messages, tools=tool_dicts, references=references, extra_info=sample) + ) def __len__(self) -> int: return len(self.conversations) diff --git a/tests/core/chat_dataset/test_openai_messages.py b/tests/core/chat_dataset/test_openai_messages.py index a1360873..eb2c9cb6 100644 --- a/tests/core/chat_dataset/test_openai_messages.py +++ b/tests/core/chat_dataset/test_openai_messages.py @@ -23,14 +23,21 @@ @pytest.fixture def jsonl_data_factory(tmp_path) -> Callable: # noqa: ANN001 def _create( - message_key: str, messages_list: list[dict], num_samples: int = 10, extra_info: dict | None = None + message_key: str, + messages_list: list[dict], + num_samples: int = 10, + extra_info: dict | None = None, + references_key: str | None = None, + references_list: list[list[str]] | None = None, ) -> str: file_path = tmp_path / f"mock_data_{message_key}.jsonl" with open(file_path, "w") as f: - for messages in messages_list * num_samples: + for i, messages in enumerate(messages_list * num_samples): sample = {message_key: messages} if extra_info is not None: sample = {**extra_info, **sample} + if references_key is not None and references_list is not None: + sample[references_key] = references_list[i % len(references_list)] f.write(json.dumps(sample) + "\n") return str(file_path) @@ -68,7 +75,8 @@ def test_load_dataset_with_drop_if_last_from_assistant(jsonl_data_factory) -> No {"role": TEST_CHAT_MESSAGES[0][0]["role"], "content": TEST_CHAT_MESSAGES[0][0]["content"]}, {"role": TEST_CHAT_MESSAGES[0][1]["role"], "content": TEST_CHAT_MESSAGES[0][1]["content"]}, {"role": TEST_CHAT_MESSAGES[0][2]["role"], "content": TEST_CHAT_MESSAGES[0][2]["content"]}, - ] + ], + references=["You're welcome!"], ) test_chat_messages_with_last_user = deepcopy(TEST_CHAT_MESSAGES) @@ -83,7 +91,7 @@ def 
test_load_dataset_with_drop_if_last_from_assistant(jsonl_data_factory) -> No {"role": TEST_CHAT_MESSAGES[0][0]["role"], "content": TEST_CHAT_MESSAGES[0][0]["content"]}, {"role": TEST_CHAT_MESSAGES[0][1]["role"], "content": TEST_CHAT_MESSAGES[0][1]["content"]}, {"role": TEST_CHAT_MESSAGES[0][2]["role"], "content": TEST_CHAT_MESSAGES[0][2]["content"]}, - ] + ], ) @@ -101,6 +109,7 @@ def test_load_dataset_with_extra_info(jsonl_data_factory) -> None: # noqa: ANN0 {"role": TEST_CHAT_MESSAGES[0][2]["role"], "content": TEST_CHAT_MESSAGES[0][2]["content"]}, ], extra_info={"extra_info": "some_info"}, + references=["You're welcome!"], ) @@ -207,3 +216,62 @@ def test_load_dataset_with_tools(mock_chat_messages_with_tools_data_path: str) - assert processed_tool_response_2 == input_tool_response_2["content"] # assistant response turn assert chat_messages[4] == {"role": "assistant", "content": messages_dicts[4]["content"]} + + +def test_load_dataset_with_references(jsonl_data_factory) -> None: # noqa: ANN001 + tmp_jsonl_path = jsonl_data_factory( + message_key="messages", + messages_list=[[{"role": "user", "content": "This is a user message."}]], + references_key="references", + references_list=[["This is a reference answer.", "This is another reference answer."]], + ) + + dataset = OpenAIMessagesDataset(file_path=tmp_jsonl_path, message_key="messages", references_key="references") + + assert len(dataset) == 10 + assert dataset[0] == ChatInstance( + messages=[{"role": "user", "content": "This is a user message."}], + references=["This is a reference answer.", "This is another reference answer."], + ) + + +def test_load_dataset_with_references_as_string(jsonl_data_factory) -> None: # noqa: ANN001 + tmp_jsonl_path = jsonl_data_factory( + message_key="messages", + messages_list=[[{"role": "user", "content": "This is a user message."}]], + references_key="references", + references_list=["This is a reference answer."], + ) + + dataset = OpenAIMessagesDataset(file_path=tmp_jsonl_path, 
message_key="messages", references_key="references") + + assert len(dataset) == 10 + assert dataset[0] == ChatInstance( + messages=[{"role": "user", "content": "This is a user message."}], + references=["This is a reference answer."], + ) + + +def test_load_dataset_with_references_and_drop_if_last_from_assistant(jsonl_data_factory) -> None: # noqa: ANN001 + tmp_jsonl_path = jsonl_data_factory( + message_key="messages", + messages_list=TEST_CHAT_MESSAGES, + references_key="references", + references_list=["This is a reference answer."], + ) + + dataset = OpenAIMessagesDataset( + file_path=tmp_jsonl_path, message_key="messages", references_key="references", drop_if_last_from_assistant=True + ) + + assert len(dataset) == 10 + # When both drop_if_last_from_assistant and references_key are specified, + # the reference is always taken from the references_key field, not from the dropped assistant message. + assert dataset[0] == ChatInstance( + messages=[ + {"role": TEST_CHAT_MESSAGES[0][0]["role"], "content": TEST_CHAT_MESSAGES[0][0]["content"]}, + {"role": TEST_CHAT_MESSAGES[0][1]["role"], "content": TEST_CHAT_MESSAGES[0][1]["content"]}, + {"role": TEST_CHAT_MESSAGES[0][2]["role"], "content": TEST_CHAT_MESSAGES[0][2]["content"]}, + ], + references=["This is a reference answer."], + ) From 114296459fada37bbc17d5d72a2ab9e84572fabd Mon Sep 17 00:00:00 2001 From: junya-takayama Date: Mon, 30 Mar 2026 09:35:35 +0900 Subject: [PATCH 2/3] lint --- flexeval/core/chat_dataset/openai_messages.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/flexeval/core/chat_dataset/openai_messages.py b/flexeval/core/chat_dataset/openai_messages.py index bd44d114..74538541 100644 --- a/flexeval/core/chat_dataset/openai_messages.py +++ b/flexeval/core/chat_dataset/openai_messages.py @@ -136,21 +136,21 @@ def __init__( messages = messages[:-1] if references_key: - references = sample.pop(references_key, None) - if isinstance(references, str): - references = 
[references] - elif isinstance(references, list) and all(isinstance(ref, str) for ref in references): - pass - else: - msg = "Invalid format for references." - raise ValueError(msg) + references = sample.pop(references_key, None) + if isinstance(references, str): + references = [references] + elif isinstance(references, list) and all(isinstance(ref, str) for ref in references): + pass + else: + msg = "Invalid format for references." + raise ValueError(msg) elif references_key is None and last_assistant_content: - references = [last_assistant_content] + references = [last_assistant_content] else: - references = [] + references = [] self.conversations.append( - ChatInstance(messages=messages, tools=tool_dicts, references=references, extra_info=sample) + ChatInstance(messages=messages, tools=tool_dicts, references=references, extra_info=sample) ) def __len__(self) -> int: From 4310c2b009161bab70d0ca14aec99beb2245cc1e Mon Sep 17 00:00:00 2001 From: junya-takayama Date: Mon, 30 Mar 2026 12:49:39 +0900 Subject: [PATCH 3/3] update doc --- flexeval/core/chat_dataset/openai_messages.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/flexeval/core/chat_dataset/openai_messages.py b/flexeval/core/chat_dataset/openai_messages.py index 74538541..91aef1b1 100644 --- a/flexeval/core/chat_dataset/openai_messages.py +++ b/flexeval/core/chat_dataset/openai_messages.py @@ -27,10 +27,8 @@ class OpenAIMessagesDataset(ChatDataset): tool_definitions_key (str | None): Key used to extract the list of tool definitions from each JSON object. Set to `None` (default) for data without tool_calls. drop_if_last_from_assistant (bool): If true, when the last utterance is given by assistant, drop it. - If references_key is None and drop_if_last_from_assistant is True, - the last assistant utterance will be used as reference answer. + And the last assistant utterance will be used as reference answer if `references_key` is not given. 
references_key (str | None): Key used to extract the reference answers from each JSON object. - Set to `None` (default) for data without reference answers. In Jsonl, each line must have a following structure: ```json