Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 53 additions & 1 deletion flexeval/core/chat_dataset/openai_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class OpenAIMessagesDataset(ChatDataset):
tool_definitions_key (str | None): Key used to extract the list of tool definitions from each JSON object.
Set to `None` (default) for data without tool_calls.
drop_if_last_from_assistant (bool): If true, drop the last utterance when it is given by the assistant.
The dropped assistant utterance is then used as the reference answer if `references_key` is not given.
references_key (str | None): Key used to extract the reference answers from each JSON object.

In Jsonl, each line must have a following structure:
```json
Expand Down Expand Up @@ -77,6 +79,36 @@ class OpenAIMessagesDataset(ChatDataset):
]
}
```

Example with reference answers:
```json
{
'<message_key>': [
{
'role': 'user',
'content': 'こんにちは。元気が出る言葉を教えて下さい。'
},
],
'<references_key>': [
'こんなのはどうでしょう。どんどんやってください!',
'こんなのはどうでしょう。頑張ってください!',
],
}
```

If there is only one reference answer for each conversation,
it can also be directly given as a string instead of a list:
```json
{
'<message_key>': [
{
'role': 'user',
'content': 'こんにちは。元気が出る言葉を教えて下さい。'
},
],
'<references_key>': 'こんなのはどうでしょう。どんどんやってください!',
}
```
"""

def __init__(
Expand All @@ -85,6 +117,7 @@ def __init__(
message_key: str = "messages",
tool_definitions_key: str | None = None,
drop_if_last_from_assistant: bool = False,
references_key: str | None = None,
) -> None:
self.conversations: list[ChatInstance] = []
with open(file_path) as f:
Expand All @@ -95,9 +128,28 @@ def __init__(
tool_dicts = sample.get(tool_definitions_key, None)

messages: list[dict[str, Any]] = sample.pop(message_key)
last_assistant_content: str | None = None
if drop_if_last_from_assistant and messages[-1]["role"] == "assistant":
last_assistant_content = messages[-1].get("content", None)
messages = messages[:-1]
self.conversations.append(ChatInstance(messages=messages, tools=tool_dicts, extra_info=sample))

if references_key:
references = sample.pop(references_key, None)
if isinstance(references, str):
references = [references]
elif isinstance(references, list) and all(isinstance(ref, str) for ref in references):
pass
else:
msg = "Invalid format for references."
raise ValueError(msg)
elif references_key is None and last_assistant_content:
references = [last_assistant_content]
else:
references = []

self.conversations.append(
ChatInstance(messages=messages, tools=tool_dicts, references=references, extra_info=sample)
)

def __len__(self) -> int:
    """Return the number of chat instances loaded from the dataset file."""
    instance_count = len(self.conversations)
    return instance_count
Expand Down
76 changes: 72 additions & 4 deletions tests/core/chat_dataset/test_openai_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,21 @@
@pytest.fixture
def jsonl_data_factory(tmp_path) -> Callable: # noqa: ANN001
def _create(
message_key: str, messages_list: list[dict], num_samples: int = 10, extra_info: dict | None = None
message_key: str,
messages_list: list[dict],
num_samples: int = 10,
extra_info: dict | None = None,
references_key: str | None = None,
references_list: list[list[str]] | None = None,
) -> str:
file_path = tmp_path / f"mock_data_{message_key}.jsonl"
with open(file_path, "w") as f:
for messages in messages_list * num_samples:
for i, messages in enumerate(messages_list * num_samples):
sample = {message_key: messages}
if extra_info is not None:
sample = {**extra_info, **sample}
if references_key is not None and references_list is not None:
sample[references_key] = references_list[i % len(references_list)]
f.write(json.dumps(sample) + "\n")
return str(file_path)

Expand Down Expand Up @@ -68,7 +75,8 @@ def test_load_dataset_with_drop_if_last_from_assistant(jsonl_data_factory) -> No
{"role": TEST_CHAT_MESSAGES[0][0]["role"], "content": TEST_CHAT_MESSAGES[0][0]["content"]},
{"role": TEST_CHAT_MESSAGES[0][1]["role"], "content": TEST_CHAT_MESSAGES[0][1]["content"]},
{"role": TEST_CHAT_MESSAGES[0][2]["role"], "content": TEST_CHAT_MESSAGES[0][2]["content"]},
]
],
references=["You're welcome!"],
)

test_chat_messages_with_last_user = deepcopy(TEST_CHAT_MESSAGES)
Expand All @@ -83,7 +91,7 @@ def test_load_dataset_with_drop_if_last_from_assistant(jsonl_data_factory) -> No
{"role": TEST_CHAT_MESSAGES[0][0]["role"], "content": TEST_CHAT_MESSAGES[0][0]["content"]},
{"role": TEST_CHAT_MESSAGES[0][1]["role"], "content": TEST_CHAT_MESSAGES[0][1]["content"]},
{"role": TEST_CHAT_MESSAGES[0][2]["role"], "content": TEST_CHAT_MESSAGES[0][2]["content"]},
]
],
)


Expand All @@ -101,6 +109,7 @@ def test_load_dataset_with_extra_info(jsonl_data_factory) -> None: # noqa: ANN0
{"role": TEST_CHAT_MESSAGES[0][2]["role"], "content": TEST_CHAT_MESSAGES[0][2]["content"]},
],
extra_info={"extra_info": "some_info"},
references=["You're welcome!"],
)


Expand Down Expand Up @@ -207,3 +216,62 @@ def test_load_dataset_with_tools(mock_chat_messages_with_tools_data_path: str) -
assert processed_tool_response_2 == input_tool_response_2["content"]
# assistant response turn
assert chat_messages[4] == {"role": "assistant", "content": messages_dicts[4]["content"]}


def test_load_dataset_with_references(jsonl_data_factory) -> None:  # noqa: ANN001
    """References given as a list of strings are loaded verbatim into each ChatInstance."""
    data_path = jsonl_data_factory(
        message_key="messages",
        messages_list=[[{"role": "user", "content": "This is a user message."}]],
        references_key="references",
        references_list=[["This is a reference answer.", "This is another reference answer."]],
    )

    dataset = OpenAIMessagesDataset(file_path=data_path, message_key="messages", references_key="references")

    expected_instance = ChatInstance(
        messages=[{"role": "user", "content": "This is a user message."}],
        references=["This is a reference answer.", "This is another reference answer."],
    )
    assert len(dataset) == 10
    assert dataset[0] == expected_instance


def test_load_dataset_with_references_as_string(jsonl_data_factory) -> None:  # noqa: ANN001
    """A single reference given as a bare string is wrapped into a one-element list."""
    data_path = jsonl_data_factory(
        message_key="messages",
        messages_list=[[{"role": "user", "content": "This is a user message."}]],
        references_key="references",
        references_list=["This is a reference answer."],
    )

    dataset = OpenAIMessagesDataset(file_path=data_path, message_key="messages", references_key="references")

    expected_instance = ChatInstance(
        messages=[{"role": "user", "content": "This is a user message."}],
        references=["This is a reference answer."],
    )
    assert len(dataset) == 10
    assert dataset[0] == expected_instance


def test_load_dataset_with_references_and_drop_if_last_from_assistant(jsonl_data_factory) -> None:  # noqa: ANN001
    """An explicit references_key takes precedence over the dropped trailing assistant message."""
    data_path = jsonl_data_factory(
        message_key="messages",
        messages_list=TEST_CHAT_MESSAGES,
        references_key="references",
        references_list=["This is a reference answer."],
    )

    dataset = OpenAIMessagesDataset(
        file_path=data_path, message_key="messages", references_key="references", drop_if_last_from_assistant=True
    )

    assert len(dataset) == 10
    # When both drop_if_last_from_assistant and references_key are specified,
    # the reference is always taken from the references_key field, not from the dropped assistant message.
    expected_messages = [
        {"role": TEST_CHAT_MESSAGES[0][turn]["role"], "content": TEST_CHAT_MESSAGES[0][turn]["content"]}
        for turn in range(3)
    ]
    assert dataset[0] == ChatInstance(messages=expected_messages, references=["This is a reference answer."])
Loading