From 0bb3dd97565414ca32bcd98b15c9886ff96c16f4 Mon Sep 17 00:00:00 2001 From: junya-takayama Date: Sat, 28 Mar 2026 15:05:30 +0900 Subject: [PATCH 1/3] Support reference answers in the OpenAI Messages Dataset --- flexeval/core/chat_dataset/openai_messages.py | 56 +++++++++++++- .../core/chat_dataset/test_openai_messages.py | 76 ++++++++++++++++++- 2 files changed, 127 insertions(+), 5 deletions(-) diff --git a/flexeval/core/chat_dataset/openai_messages.py b/flexeval/core/chat_dataset/openai_messages.py index 22b41bf4..bd44d114 100644 --- a/flexeval/core/chat_dataset/openai_messages.py +++ b/flexeval/core/chat_dataset/openai_messages.py @@ -27,6 +27,10 @@ class OpenAIMessagesDataset(ChatDataset): tool_definitions_key (str | None): Key used to extract the list of tool definitions from each JSON object. Set to `None` (default) for data without tool_calls. drop_if_last_from_assistant (bool): If true, when the last utterance is given by assistant, drop it. + If references_key is None and drop_if_last_from_assistant is True, + the last assistant utterance will be used as reference answer. + references_key (str | None): Key used to extract the reference answers from each JSON object. + Set to `None` (default) for data without reference answers. 
In Jsonl, each line must have a following structure: ```json @@ -77,6 +81,36 @@ class OpenAIMessagesDataset(ChatDataset): ] } ``` + + Example with reference answers: + ```json + { + 'messages': [ + { + 'role': 'user', + 'content': 'こんにちは。元気が出る言葉を教えて下さい。' + }, + ], + 'references': [ + 'こんなのはどうでしょう。どんどんやってください!', + 'こんなのはどうでしょう。頑張ってください!', + ], + } + ``` + + If there is only one reference answer for each conversation, + it can also be directly given as a string instead of a list: + ```json + { + 'messages': [ + { + 'role': 'user', + 'content': 'こんにちは。元気が出る言葉を教えて下さい。' + }, + ], + 'references': 'こんなのはどうでしょう。どんどんやってください!', + } + ``` """ def __init__( @@ -85,6 +119,7 @@ def __init__( message_key: str = "messages", tool_definitions_key: str | None = None, drop_if_last_from_assistant: bool = False, + references_key: str | None = None, ) -> None: self.conversations: list[ChatInstance] = [] with open(file_path) as f: @@ -95,9 +130,28 @@ def __init__( tool_dicts = sample.get(tool_definitions_key, None) messages: list[dict[str, Any]] = sample.pop(message_key) + last_assistant_content: str | None = None if drop_if_last_from_assistant and messages[-1]["role"] == "assistant": + last_assistant_content = messages[-1].get("content", None) messages = messages[:-1] - self.conversations.append(ChatInstance(messages=messages, tools=tool_dicts, extra_info=sample)) + + if references_key: + references = sample.pop(references_key, None) + if isinstance(references, str): + references = [references] + elif isinstance(references, list) and all(isinstance(ref, str) for ref in references): + pass + else: + msg = "Invalid format for references." 
+ raise ValueError(msg) + elif references_key is None and last_assistant_content: + references = [last_assistant_content] + else: + references = [] + + self.conversations.append( + ChatInstance(messages=messages, tools=tool_dicts, references=references, extra_info=sample) + ) def __len__(self) -> int: return len(self.conversations) diff --git a/tests/core/chat_dataset/test_openai_messages.py b/tests/core/chat_dataset/test_openai_messages.py index a1360873..eb2c9cb6 100644 --- a/tests/core/chat_dataset/test_openai_messages.py +++ b/tests/core/chat_dataset/test_openai_messages.py @@ -23,14 +23,21 @@ @pytest.fixture def jsonl_data_factory(tmp_path) -> Callable: # noqa: ANN001 def _create( - message_key: str, messages_list: list[dict], num_samples: int = 10, extra_info: dict | None = None + message_key: str, + messages_list: list[dict], + num_samples: int = 10, + extra_info: dict | None = None, + references_key: str | None = None, + references_list: list[list[str]] | None = None, ) -> str: file_path = tmp_path / f"mock_data_{message_key}.jsonl" with open(file_path, "w") as f: - for messages in messages_list * num_samples: + for i, messages in enumerate(messages_list * num_samples): sample = {message_key: messages} if extra_info is not None: sample = {**extra_info, **sample} + if references_key is not None and references_list is not None: + sample[references_key] = references_list[i % len(references_list)] f.write(json.dumps(sample) + "\n") return str(file_path) @@ -68,7 +75,8 @@ def test_load_dataset_with_drop_if_last_from_assistant(jsonl_data_factory) -> No {"role": TEST_CHAT_MESSAGES[0][0]["role"], "content": TEST_CHAT_MESSAGES[0][0]["content"]}, {"role": TEST_CHAT_MESSAGES[0][1]["role"], "content": TEST_CHAT_MESSAGES[0][1]["content"]}, {"role": TEST_CHAT_MESSAGES[0][2]["role"], "content": TEST_CHAT_MESSAGES[0][2]["content"]}, - ] + ], + references=["You're welcome!"], ) test_chat_messages_with_last_user = deepcopy(TEST_CHAT_MESSAGES) @@ -83,7 +91,7 @@ def 
test_load_dataset_with_drop_if_last_from_assistant(jsonl_data_factory) -> No {"role": TEST_CHAT_MESSAGES[0][0]["role"], "content": TEST_CHAT_MESSAGES[0][0]["content"]}, {"role": TEST_CHAT_MESSAGES[0][1]["role"], "content": TEST_CHAT_MESSAGES[0][1]["content"]}, {"role": TEST_CHAT_MESSAGES[0][2]["role"], "content": TEST_CHAT_MESSAGES[0][2]["content"]}, - ] + ], ) @@ -101,6 +109,7 @@ def test_load_dataset_with_extra_info(jsonl_data_factory) -> None: # noqa: ANN0 {"role": TEST_CHAT_MESSAGES[0][2]["role"], "content": TEST_CHAT_MESSAGES[0][2]["content"]}, ], extra_info={"extra_info": "some_info"}, + references=["You're welcome!"], ) @@ -207,3 +216,62 @@ def test_load_dataset_with_tools(mock_chat_messages_with_tools_data_path: str) - assert processed_tool_response_2 == input_tool_response_2["content"] # assistant response turn assert chat_messages[4] == {"role": "assistant", "content": messages_dicts[4]["content"]} + + +def test_load_dataset_with_references(jsonl_data_factory) -> None: # noqa: ANN001 + tmp_jsonl_path = jsonl_data_factory( + message_key="messages", + messages_list=[[{"role": "user", "content": "This is a user message."}]], + references_key="references", + references_list=[["This is a reference answer.", "This is another reference answer."]], + ) + + dataset = OpenAIMessagesDataset(file_path=tmp_jsonl_path, message_key="messages", references_key="references") + + assert len(dataset) == 10 + assert dataset[0] == ChatInstance( + messages=[{"role": "user", "content": "This is a user message."}], + references=["This is a reference answer.", "This is another reference answer."], + ) + + +def test_load_dataset_with_references_as_string(jsonl_data_factory) -> None: # noqa: ANN001 + tmp_jsonl_path = jsonl_data_factory( + message_key="messages", + messages_list=[[{"role": "user", "content": "This is a user message."}]], + references_key="references", + references_list=["This is a reference answer."], + ) + + dataset = OpenAIMessagesDataset(file_path=tmp_jsonl_path, 
message_key="messages", references_key="references") + + assert len(dataset) == 10 + assert dataset[0] == ChatInstance( + messages=[{"role": "user", "content": "This is a user message."}], + references=["This is a reference answer."], + ) + + +def test_load_dataset_with_references_and_drop_if_last_from_assistant(jsonl_data_factory) -> None: # noqa: ANN001 + tmp_jsonl_path = jsonl_data_factory( + message_key="messages", + messages_list=TEST_CHAT_MESSAGES, + references_key="references", + references_list=["This is a reference answer."], + ) + + dataset = OpenAIMessagesDataset( + file_path=tmp_jsonl_path, message_key="messages", references_key="references", drop_if_last_from_assistant=True + ) + + assert len(dataset) == 10 + # When both drop_if_last_from_assistant and references_key are specified, + # the reference is always taken from the references_key field, not from the dropped assistant message. + assert dataset[0] == ChatInstance( + messages=[ + {"role": TEST_CHAT_MESSAGES[0][0]["role"], "content": TEST_CHAT_MESSAGES[0][0]["content"]}, + {"role": TEST_CHAT_MESSAGES[0][1]["role"], "content": TEST_CHAT_MESSAGES[0][1]["content"]}, + {"role": TEST_CHAT_MESSAGES[0][2]["role"], "content": TEST_CHAT_MESSAGES[0][2]["content"]}, + ], + references=["This is a reference answer."], + ) From 114296459fada37bbc17d5d72a2ab9e84572fabd Mon Sep 17 00:00:00 2001 From: junya-takayama Date: Mon, 30 Mar 2026 09:35:35 +0900 Subject: [PATCH 2/3] lint --- flexeval/core/chat_dataset/openai_messages.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/flexeval/core/chat_dataset/openai_messages.py b/flexeval/core/chat_dataset/openai_messages.py index bd44d114..74538541 100644 --- a/flexeval/core/chat_dataset/openai_messages.py +++ b/flexeval/core/chat_dataset/openai_messages.py @@ -136,21 +136,21 @@ def __init__( messages = messages[:-1] if references_key: - references = sample.pop(references_key, None) - if isinstance(references, str): - references = 
[references] - elif isinstance(references, list) and all(isinstance(ref, str) for ref in references): - pass - else: - msg = "Invalid format for references." - raise ValueError(msg) + references = sample.pop(references_key, None) + if isinstance(references, str): + references = [references] + elif isinstance(references, list) and all(isinstance(ref, str) for ref in references): + pass + else: + msg = "Invalid format for references." + raise ValueError(msg) elif references_key is None and last_assistant_content: - references = [last_assistant_content] + references = [last_assistant_content] else: - references = [] + references = [] self.conversations.append( - ChatInstance(messages=messages, tools=tool_dicts, references=references, extra_info=sample) + ChatInstance(messages=messages, tools=tool_dicts, references=references, extra_info=sample) ) def __len__(self) -> int: From 4310c2b009161bab70d0ca14aec99beb2245cc1e Mon Sep 17 00:00:00 2001 From: junya-takayama Date: Mon, 30 Mar 2026 12:49:39 +0900 Subject: [PATCH 3/3] update doc --- flexeval/core/chat_dataset/openai_messages.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/flexeval/core/chat_dataset/openai_messages.py b/flexeval/core/chat_dataset/openai_messages.py index 74538541..91aef1b1 100644 --- a/flexeval/core/chat_dataset/openai_messages.py +++ b/flexeval/core/chat_dataset/openai_messages.py @@ -27,10 +27,8 @@ class OpenAIMessagesDataset(ChatDataset): tool_definitions_key (str | None): Key used to extract the list of tool definitions from each JSON object. Set to `None` (default) for data without tool_calls. drop_if_last_from_assistant (bool): If true, when the last utterance is given by assistant, drop it. - If references_key is None and drop_if_last_from_assistant is True, - the last assistant utterance will be used as reference answer. + And the last assistant utterance will be used as reference answer if `references_key` is not given. 
references_key (str | None): Key used to extract the reference answers from each JSON object. - Set to `None` (default) for data without reference answers. In Jsonl, each line must have a following structure: ```json