From a4b17efb61c13000488e0d6df8e3d7a8e2c59d9e Mon Sep 17 00:00:00 2001 From: Maik Froebe Date: Thu, 24 Apr 2025 17:00:36 +0200 Subject: [PATCH 01/15] start to develop trec-tot-2025 --- ir_datasets/datasets/__init__.py | 3 + ir_datasets/datasets/trec_tot_2025.py | 142 ++++++++++++++++++++++++ test/trec-tot-2025/test_docs_iter.py | 39 +++++++ test/trec-tot-2025/test_docs_store.py | 42 +++++++ test/trec-tot-2025/test_qrel_iter.py | 54 +++++++++ test/trec-tot-2025/test_queries_iter.py | 78 +++++++++++++ 6 files changed, 358 insertions(+) create mode 100644 ir_datasets/datasets/trec_tot_2025.py create mode 100644 test/trec-tot-2025/test_docs_iter.py create mode 100644 test/trec-tot-2025/test_docs_store.py create mode 100644 test/trec-tot-2025/test_qrel_iter.py create mode 100644 test/trec-tot-2025/test_queries_iter.py diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py index 41568aee..80a608e1 100644 --- a/ir_datasets/datasets/__init__.py +++ b/ir_datasets/datasets/__init__.py @@ -56,3 +56,6 @@ from . import hc4 from . import neuclir # must be after hc4 from . import sara +from . import trec_tot_2025 + +trec_tot_2025.register_dataset() diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py new file mode 100644 index 00000000..4b8c4f23 --- /dev/null +++ b/ir_datasets/datasets/trec_tot_2025.py @@ -0,0 +1,142 @@ +from ir_datasets import registry +from ir_datasets.datasets.base import Dataset +from ir_datasets.util.download import RequestsDownload +from ir_datasets.formats.base import BaseDocs +from ir_datasets.indices import Docstore +from ir_datasets.util import ZipExtractCache, home_path, Cache +from ir_datasets.formats import BaseDocs, TrecQrels, JsonlQueries +from ir_datasets.indices import PickleLz4FullStore +import os +import gzip +import json +from tqdm import tqdm +from typing import NamedTuple + +NAME = "trec-tot" + +def cached_tot_resource(url, md5): + streamer = RequestsDownload(url) + return Cache(streamer, home_path() / "trec-tot-2025" / url.split("/")[-1]) + +class JsonlDocumentOffset(NamedTuple): + doc_id: str + offset_start: int + offset_end: int + + +class TrecToT2025Doc(): + def __init__(self, json_doc): + parsed_doc = json.loads(json_doc) + self.doc_id = parsed_doc["id"] + self.title = parsed_doc["title"] + self.url = parsed_doc["url"] + self.text = parsed_doc["text"] + + def default_text(self): + return self.title + " " + self.text + +class JsonlWithOffsetsDocsStore(Docstore): + def __init__(self, docs, offsets): + self.__docs = docs + self.__offsets = offsets + self._docs_dict = None + self._id_field = "doc_id" + + def offsets_iter(self): + with gzip.open(self.__offsets.path(), "rt") as f: + for i in f: + i = json.loads(i) + yield JsonlDocumentOffset(doc_id=i["id"], offset_start=i["offset_start"], offset_end=i["offset_end"]) + + def docs_dict(self): + return PickleLz4FullStore( + path=str(self.__offsets.path().absolute().resolve()) + '.pklz4', + init_iter_fn=self.offsets_iter, + data_cls=JsonlDocumentOffset, + lookup_field="doc_id", + index_fields=("doc_id",) + ) + + def get_many_iter(self, doc_ids): + offsets = self.docs_dict() + + with open(self.__docs.path(), "rb") as f: + for doc in doc_ids: + doc = offsets.get(doc) + f.seek(doc.offset_start) + raw_content_bytes = f.read(doc.offset_end - doc.offset_start) + yield gzip.decompress(raw_content_bytes) + + +class TrecToT2025DocsStore(JsonlWithOffsetsDocsStore): + def get_many_iter(self, doc_ids): + for i in super().get_many_iter(doc_ids): + yield TrecToT2025Doc(i) 
+ + +class JsonlDocumentsWithOffsets(BaseDocs): + def __init__(self, docs, offsets): + self.__docs = docs + self.__offsets = offsets + + def docs_iter(self): + with gzip.open(self.__docs.path()) as f: + for l in f: + yield TrecToT2025Doc(l) + + def docs_cls(self): + return self._cls + + def docs_store(self, field='doc_id'): + return TrecToT2025DocsStore(self.__docs, self.__offsets) + + def docs_namespace(self): + raise ValueError("ToDo: Implement this") + + def docs_count(self): + return len(self.docs_dict()) + + def docs_lang(self): + raise ValueError("ToDo: Implement this") + + +class TrecToT2025Dataset(Dataset): + def __init__(self, docs_jsonl_file, offset_jsonl_file, queries=None, qrels=None): + docs = JsonlDocumentsWithOffsets(docs_jsonl_file, offset_jsonl_file) + + if queries: + queries = JsonlQueries(queries, lang='en', mapping={"text": "query", "query_id": "query_id"}) + if qrels: + qrels = TrecQrels(qrels, {0: 'Not Relevant', 1: 'Relevant'}) + + super().__init__(docs, queries, qrels) + + +def register_dataset(): + if f"{NAME}/2025" in registry: + return + + doc_offsets = cached_tot_resource("https://files.webis.de/data-in-progress/trec-tot-2025-offsets.jsonl.gz", "00678e3155d962bb244e034e6401b79b") + doc_corpus = cached_tot_resource("https://files.webis.de/data-in-progress/trec-tot-2025-corpus.jsonl.gz", "a2c82398aa86df6a68c8706b9b462bf2") + registry.register(f"{NAME}/2025", TrecToT2025Dataset(doc_corpus, doc_offsets)) + for i in ["train", "dev1", "dev2", "dev3"]: + qrels = cached_tot_resource("https://files.webis.de/data-in-progress/trec-tot-2025-queries/" + i + "-2025-qrel.txt", "TBD") + queries = cached_tot_resource("https://files.webis.de/data-in-progress/trec-tot-2025-queries/" + i + "-2025-queries.jsonl", "TBD") + registry.register(f"{NAME}/2025/{i}", TrecToT2025Dataset(doc_corpus, doc_offsets, queries, qrels)) + + +if __name__ == '__main__': + register_dataset() + import ir_datasets + dataset = ir_datasets.load("trec-tot/2025") + + cnt = 0 + for doc in dataset.docs_iter(): + print(doc.doc_id) + cnt += 1 + if cnt > 10: + break + + for doc in ["12", "39", "290", "303", "305", "307", "308", "309"]: + print(doc, "=>", dataset.docs_store().get(doc).doc_id) + diff --git a/test/trec-tot-2025/test_docs_iter.py b/test/trec-tot-2025/test_docs_iter.py new file mode 100644 index 00000000..f50c1528 --- /dev/null +++ b/test/trec-tot-2025/test_docs_iter.py @@ -0,0 +1,39 @@ +import unittest + +def load_dataset(): + import ir_datasets + return ir_datasets.load("trec-tot/2025") + +def load_doc_number(num): + index = 0 + for i in load_dataset().docs_iter(): + if num == index: + return i + index += 1 + +class TestDocsIter(unittest.TestCase): + def test_dataset_can_be_loaded(self): + actual = load_dataset() + self.assertIsNotNone(actual) + + def test_first_doc(self): + actual = load_doc_number(0) + + self.assertIsNotNone(actual) + self.assertEqual("12", actual.doc_id) + self.assertEqual("https://en.wikipedia.org/wiki/Anarchism", actual.url) + self.assertEqual("Anarchism", actual.title) + self.assertIn("a political philosophy and movement that is skeptical", actual.text) + self.assertIn("a political philosophy and movement that is skeptical", actual.default_text()) + self.assertIn("Anarchism Anarchism is a political philosophy and movement that is skeptical", actual.default_text()) + + def test_third_doc(self): + actual = load_doc_number(3) + + self.assertIsNotNone(actual) + self.assertEqual("303", actual.doc_id) + self.assertEqual("https://en.wikipedia.org/wiki/Alabama", actual.url) + 
self.assertEqual("Alabama", actual.title) + self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.text) + self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.default_text()) + diff --git a/test/trec-tot-2025/test_docs_store.py b/test/trec-tot-2025/test_docs_store.py new file mode 100644 index 00000000..276a43eb --- /dev/null +++ b/test/trec-tot-2025/test_docs_store.py @@ -0,0 +1,42 @@ +import unittest + +def load_docs_store(): + import ir_datasets + return ir_datasets.load("trec-tot/2025").docs_store() + +class TestDocsStore(unittest.TestCase): + def test_docs_store_can_be_loaded(self): + actual = load_docs_store() + self.assertIsNotNone(actual) + + def test_first_doc(self): + actual = load_docs_store().get("12") + + self.assertIsNotNone(actual) + self.assertEqual("12", actual.doc_id) + self.assertEqual("https://en.wikipedia.org/wiki/Anarchism", actual.url) + self.assertEqual("Anarchism", actual.title) + self.assertIn("a political philosophy and movement that is skeptical", actual.text) + self.assertIn("a political philosophy and movement that is skeptical", actual.default_text()) + self.assertIn("Anarchism Anarchism is a political philosophy and movement that is skeptical", actual.default_text()) + + def test_third_doc(self): + actual = load_docs_store().get("303") + + self.assertIsNotNone(actual) + self.assertEqual("303", actual.doc_id) + self.assertEqual("https://en.wikipedia.org/wiki/Alabama", actual.url) + self.assertEqual("Alabama", actual.title) + self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.text) + self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.default_text()) + + def test_some_random_doc(self): + actual = load_docs_store().get("6596604") + + self.assertIsNotNone(actual) + self.assertEqual("6596604", actual.doc_id) + self.assertEqual("https://en.wikipedia.org/wiki/Radio%20Reloj", actual.url) + self.assertEqual("Radio Reloj", actual.title) + self.assertIn("Radio Reloj (Spanish for Radio Clock) is an internationally broadcast Spanish-language radio station", actual.text) + self.assertIn("Radio Reloj (Spanish for Radio Clock) is an internationally broadcast Spanish-language radio station", actual.default_text()) + diff --git a/test/trec-tot-2025/test_qrel_iter.py b/test/trec-tot-2025/test_qrel_iter.py new file mode 100644 index 00000000..699b9127 --- /dev/null +++ b/test/trec-tot-2025/test_qrel_iter.py @@ -0,0 +1,54 @@ +import unittest + +def load_dataset(dataset_id): + import ir_datasets + return ir_datasets.load(dataset_id) + +def load_qrel_number(dataset_id, num): + index = 0 + for i in load_dataset(dataset_id).qrels_iter(): + if num == index: + return i + index += 1 + +class TestQrelIter(unittest.TestCase): + def test_train_dataset_can_be_loaded(self): + actual = load_dataset("trec-tot/2025/train") + self.assertIsNotNone(actual) + + def test_dev1_dataset_can_be_loaded(self): + actual = load_dataset("trec-tot/2025/dev1") + self.assertIsNotNone(actual) + + def test_dev2_dataset_can_be_loaded(self): + actual = load_dataset("trec-tot/2025/dev2") + self.assertIsNotNone(actual) + + def test_dev3_dataset_can_be_loaded(self): + actual = load_dataset("trec-tot/2025/dev3") + self.assertIsNotNone(actual) + + def test_train_qrel_iter(self): + actual = load_qrel_number("trec-tot/2025/train", 12) + self.assertEqual("1014", actual.query_id) + self.assertEqual("46264411", actual.doc_id) + self.assertEqual(1, actual.relevance) + + def 
test_dev1_qrel_iter(self): + actual = load_qrel_number("trec-tot/2025/dev1", 12) + self.assertEqual("898", actual.query_id) + self.assertEqual("3761238", actual.doc_id) + self.assertEqual(1, actual.relevance) + + def test_dev2_qrel_iter(self): + actual = load_qrel_number("trec-tot/2025/dev2", 12) + self.assertEqual("632", actual.query_id) + self.assertEqual("3261733", actual.doc_id) + self.assertEqual(1, actual.relevance) + + def test_dev3_qrel_iter(self): + actual = load_qrel_number("trec-tot/2025/dev3", 12) + self.assertEqual("2014", actual.query_id) + self.assertEqual("446518", actual.doc_id) + self.assertEqual(1, actual.relevance) + diff --git a/test/trec-tot-2025/test_queries_iter.py b/test/trec-tot-2025/test_queries_iter.py new file mode 100644 index 00000000..f30ab709 --- /dev/null +++ b/test/trec-tot-2025/test_queries_iter.py @@ -0,0 +1,78 @@ +import unittest + +def load_dataset(dataset_id): + import ir_datasets + return ir_datasets.load(dataset_id) + +def load_query_number(dataset_id, num): + index = 0 + for i in load_dataset(dataset_id).queries_iter(): + if num == index: + return i + index += 1 + +class TestQueriesIter(unittest.TestCase): + def test_train_dataset_can_be_loaded(self): + actual = load_dataset("trec-tot/2025/train") + self.assertIsNotNone(actual) + + def test_dev1_dataset_can_be_loaded(self): + actual = load_dataset("trec-tot/2025/dev1") + self.assertIsNotNone(actual) + + def test_dev2_dataset_can_be_loaded(self): + actual = load_dataset("trec-tot/2025/dev2") + self.assertIsNotNone(actual) + + def test_dev3_dataset_can_be_loaded(self): + actual = load_dataset("trec-tot/2025/dev3") + self.assertIsNotNone(actual) + + def test_query_from_train_dataset_can_be_loaded_01(self): + actual = load_query_number("trec-tot/2025/train", 2) + self.assertIsNotNone(actual) + self.assertEqual("950", actual.query_id) + self.assertIn("two girls who run away", actual.default_text()) + + def test_query_from_train_dataset_can_be_loaded_02(self): + actual = load_query_number("trec-tot/2025/train", 25) + self.assertIsNotNone(actual) + self.assertEqual("484", actual.query_id) + self.assertIn("Main character is a famous person like a celebrity", actual.default_text()) + + def test_query_from_dev1_dataset_can_be_loaded_01(self): + actual = load_query_number("trec-tot/2025/dev1", 2) + self.assertIsNotNone(actual) + self.assertEqual("473", actual.query_id) + self.assertIn("possibly a ghost killing in an old house", actual.default_text()) + + def test_query_from_dev1_dataset_can_be_loaded_02(self): + actual = load_query_number("trec-tot/2025/dev1", 25) + self.assertIsNotNone(actual) + self.assertEqual("153", actual.query_id) + self.assertIn("Martial arts movie where the human is fighting aliens", actual.default_text()) + + def test_query_from_dev2_dataset_can_be_loaded_01(self): + actual = load_query_number("trec-tot/2025/dev2", 2) + self.assertIsNotNone(actual) + self.assertEqual("477", actual.query_id) + self.assertIn("Pretty sure it was a comedy", actual.default_text()) + + def test_query_from_dev2_dataset_can_be_loaded_02(self): + actual = load_query_number("trec-tot/2025/dev2", 25) + self.assertIsNotNone(actual) + self.assertEqual("873", actual.query_id) + self.assertIn("I remember there were 2 siblings involved in the movie", actual.default_text()) + + def test_query_from_dev3_dataset_can_be_loaded_01(self): + actual = load_query_number("trec-tot/2025/dev3", 2) + self.assertIsNotNone(actual) + self.assertEqual("2003", actual.query_id) + self.assertIn("I remember a scene where the bell tower 
guy and the soldier had to sneak into this hidden place", actual.default_text()) + + def test_query_from_dev3_dataset_can_be_loaded_02(self): + actual = load_query_number("trec-tot/2025/dev3", 25) + self.assertIsNotNone(actual) + self.assertEqual("2028", actual.query_id) + self.assertIn("The place had this weird energy source", actual.default_text()) + From 8ae421b2694e481db4e9da3814b90e706012ce09 Mon Sep 17 00:00:00 2001 From: Maik Froebe Date: Thu, 24 Apr 2025 17:02:35 +0200 Subject: [PATCH 02/15] start to develop trec-tot-2025 --- ir_datasets/datasets/trec_tot_2025.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py index 4b8c4f23..02d3d3ed 100644 --- a/ir_datasets/datasets/trec_tot_2025.py +++ b/ir_datasets/datasets/trec_tot_2025.py @@ -97,7 +97,7 @@ def docs_count(self): return len(self.docs_dict()) def docs_lang(self): - raise ValueError("ToDo: Implement this") + raise "en" class TrecToT2025Dataset(Dataset): From 9265cdcd8a6e1a7c24f30f19d32f2d00b3edb232 Mon Sep 17 00:00:00 2001 From: Maik Froebe Date: Thu, 24 Apr 2025 17:04:48 +0200 Subject: [PATCH 03/15] start to develop trec-tot-2025 --- ir_datasets/datasets/trec_tot_2025.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py index 02d3d3ed..a88f8c2a 100644 --- a/ir_datasets/datasets/trec_tot_2025.py +++ b/ir_datasets/datasets/trec_tot_2025.py @@ -97,7 +97,7 @@ def docs_count(self): return len(self.docs_dict()) def docs_lang(self): - raise "en" + return "en" class TrecToT2025Dataset(Dataset): From d9b3049192ac61940ab0e1bb44e339cae95382e4 Mon Sep 17 00:00:00 2001 From: Maik Froebe Date: Thu, 24 Apr 2025 17:11:04 +0200 Subject: [PATCH 04/15] start to develop trec-tot-2025 --- ir_datasets/datasets/trec_tot_2025.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py index a88f8c2a..2abbe11c 100644 --- a/ir_datasets/datasets/trec_tot_2025.py +++ b/ir_datasets/datasets/trec_tot_2025.py @@ -94,7 +94,7 @@ def docs_namespace(self): raise ValueError("ToDo: Implement this") def docs_count(self): - return len(self.docs_dict()) + return 6407814 def docs_lang(self): return "en" From b5f93f87f413d066d74cd9ce5441758bfe6e4021 Mon Sep 17 00:00:00 2001 From: Maik Froebe Date: Thu, 24 Apr 2025 17:28:30 +0200 Subject: [PATCH 05/15] start to develop trec-tot-2025 --- ir_datasets/datasets/trec_tot_2025.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py index 2abbe11c..116d7351 100644 --- a/ir_datasets/datasets/trec_tot_2025.py +++ b/ir_datasets/datasets/trec_tot_2025.py @@ -35,6 +35,9 @@ def __init__(self, json_doc): def default_text(self): return self.title + " " + self.text + def _asdict(self): + return {"docno": self.doc_id, "text": self.default_text()} + class JsonlWithOffsetsDocsStore(Docstore): def __init__(self, docs, offsets): self.__docs = docs From acd5c044375356f9876dbadd56f68f5ed4a36eec Mon Sep 17 00:00:00 2001 From: Maik Froebe Date: Thu, 24 Apr 2025 17:29:50 +0200 Subject: [PATCH 06/15] start to develop trec-tot-2025 --- ir_datasets/datasets/trec_tot_2025.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py index 116d7351..205a61e7 100644 --- a/ir_datasets/datasets/trec_tot_2025.py +++ 
b/ir_datasets/datasets/trec_tot_2025.py @@ -36,7 +36,7 @@ def default_text(self): return self.title + " " + self.text def _asdict(self): - return {"docno": self.doc_id, "text": self.default_text()} + return {"doc_id": self.doc_id, "text": self.default_text()} class JsonlWithOffsetsDocsStore(Docstore): def __init__(self, docs, offsets): From d2672eb7f3167cc07c298f65e6f77c3ea13d1bb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maik=20Fr=C3=B6be?= Date: Wed, 7 May 2025 15:46:24 +0200 Subject: [PATCH 07/15] use data from zenodo --- ir_datasets/datasets/trec_tot_2025.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py index 205a61e7..ee07d35d 100644 --- a/ir_datasets/datasets/trec_tot_2025.py +++ b/ir_datasets/datasets/trec_tot_2025.py @@ -119,12 +119,12 @@ def register_dataset(): if f"{NAME}/2025" in registry: return - doc_offsets = cached_tot_resource("https://files.webis.de/data-in-progress/trec-tot-2025-offsets.jsonl.gz", "00678e3155d962bb244e034e6401b79b") - doc_corpus = cached_tot_resource("https://files.webis.de/data-in-progress/trec-tot-2025-corpus.jsonl.gz", "a2c82398aa86df6a68c8706b9b462bf2") + doc_offsets = cached_tot_resource("https://zenodo.org/records/15356599/files/trec-tot-2025-offsets.jsonl.gz", "00678e3155d962bb244e034e6401b79b") + doc_corpus = cached_tot_resource("https://zenodo.org/records/15356599/files/trec-tot-2025-corpus.jsonl.gz", "a2c82398aa86df6a68c8706b9b462bf2") registry.register(f"{NAME}/2025", TrecToT2025Dataset(doc_corpus, doc_offsets)) for i in ["train", "dev1", "dev2", "dev3"]: - qrels = cached_tot_resource("https://files.webis.de/data-in-progress/trec-tot-2025-queries/" + i + "-2025-qrel.txt", "TBD") - queries = cached_tot_resource("https://files.webis.de/data-in-progress/trec-tot-2025-queries/" + i + "-2025-queries.jsonl", "TBD") + qrels = cached_tot_resource("https://zenodo.org/records/15356599/files/" + i + "-2025-qrel.txt", "TBD") + queries = cached_tot_resource("https://zenodo.org/records/15356599/files/" + i + "-2025-queries.jsonl", "TBD") registry.register(f"{NAME}/2025/{i}", TrecToT2025Dataset(doc_corpus, doc_offsets, queries, qrels)) From b21cf008f9672c15496d8805820a9bc7ec56e663 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maik=20Fr=C3=B6be?= Date: Thu, 8 May 2025 20:01:52 +0200 Subject: [PATCH 08/15] make integration of trec-tot-2025 more standard --- ir_datasets/datasets/trec_tot_2025.py | 41 +++++--------- ir_datasets/docs/trec-tot-2025.yaml | 42 +++++++++++++++ ir_datasets/etc/downloads.json | 53 +++++++++++++++++++ .../trec_tot_2025}/test_docs_iter.py | 0 .../trec_tot_2025}/test_docs_store.py | 0 .../trec_tot_2025}/test_qrel_iter.py | 0 .../trec_tot_2025}/test_queries_iter.py | 0 7 files changed, 109 insertions(+), 27 deletions(-) create mode 100644 ir_datasets/docs/trec-tot-2025.yaml rename test/{trec-tot-2025 => integration/trec_tot_2025}/test_docs_iter.py (100%) rename test/{trec-tot-2025 => integration/trec_tot_2025}/test_docs_store.py (100%) rename test/{trec-tot-2025 => integration/trec_tot_2025}/test_qrel_iter.py (100%) rename test/{trec-tot-2025 => integration/trec_tot_2025}/test_queries_iter.py (100%) diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py index ee07d35d..293026fd 100644 --- a/ir_datasets/datasets/trec_tot_2025.py +++ b/ir_datasets/datasets/trec_tot_2025.py @@ -1,9 +1,9 @@ from ir_datasets import registry -from ir_datasets.datasets.base import Dataset +from ir_datasets.datasets.base import 
Dataset, YamlDocumentation from ir_datasets.util.download import RequestsDownload from ir_datasets.formats.base import BaseDocs from ir_datasets.indices import Docstore -from ir_datasets.util import ZipExtractCache, home_path, Cache +from ir_datasets.util import ZipExtractCache, home_path, Cache, DownloadConfig from ir_datasets.formats import BaseDocs, TrecQrels, JsonlQueries from ir_datasets.indices import PickleLz4FullStore import os @@ -14,9 +14,6 @@ NAME = "trec-tot" -def cached_tot_resource(url, md5): - streamer = RequestsDownload(url) - return Cache(streamer, home_path() / "trec-tot-2025" / url.split("/")[-1]) class JsonlDocumentOffset(NamedTuple): doc_id: str @@ -104,7 +101,7 @@ def docs_lang(self): class TrecToT2025Dataset(Dataset): - def __init__(self, docs_jsonl_file, offset_jsonl_file, queries=None, qrels=None): + def __init__(self, docs_jsonl_file, offset_jsonl_file, queries=None, qrels=None, documentation=None): docs = JsonlDocumentsWithOffsets(docs_jsonl_file, offset_jsonl_file) if queries: @@ -112,34 +109,24 @@ def __init__(self, docs_jsonl_file, offset_jsonl_file, queries=None, qrels=None) if qrels: qrels = TrecQrels(qrels, {0: 'Not Relevant', 1: 'Relevant'}) - super().__init__(docs, queries, qrels) + super().__init__(docs, queries, qrels, documentation) def register_dataset(): if f"{NAME}/2025" in registry: return - doc_offsets = cached_tot_resource("https://zenodo.org/records/15356599/files/trec-tot-2025-offsets.jsonl.gz", "00678e3155d962bb244e034e6401b79b") - doc_corpus = cached_tot_resource("https://zenodo.org/records/15356599/files/trec-tot-2025-corpus.jsonl.gz", "a2c82398aa86df6a68c8706b9b462bf2") - registry.register(f"{NAME}/2025", TrecToT2025Dataset(doc_corpus, doc_offsets)) - for i in ["train", "dev1", "dev2", "dev3"]: - qrels = cached_tot_resource("https://zenodo.org/records/15356599/files/" + i + "-2025-qrel.txt", "TBD") - queries = cached_tot_resource("https://zenodo.org/records/15356599/files/" + i + "-2025-queries.jsonl", "TBD") - registry.register(f"{NAME}/2025/{i}", TrecToT2025Dataset(doc_corpus, doc_offsets, queries, qrels)) - + dlc = DownloadConfig.context("trec-tot-2025", home_path() / NAME / "2025") -if __name__ == '__main__': - register_dataset() - import ir_datasets - dataset = ir_datasets.load("trec-tot/2025") + documentation = YamlDocumentation(f'docs/{NAME}.yaml') + doc_offsets = dlc['trec-tot-2025-offsets.jsonl.gz'] + doc_corpus = dlc['trec-tot-2025-corpus.jsonl.gz'] + registry.register(f"{NAME}/2025", TrecToT2025Dataset(doc_corpus, doc_offsets, documentation=documentation("2025"))) + for i in ["train", "dev1", "dev2", "dev3"]: + qrels = dlc[i + "-2025-qrel.txt"] + queries = dlc[i + "-2025-queries.jsonl"] + registry.register(f"{NAME}/2025/{i}", TrecToT2025Dataset(doc_corpus, doc_offsets, queries, qrels, documentation(f"2025/{i}"))) - cnt = 0 - for doc in dataset.docs_iter(): - print(doc.doc_id) - cnt += 1 - if cnt > 10: - break - for doc in ["12", "39", "290", "303", "305", "307", "308", "309"]: - print(doc, "=>", dataset.docs_store().get(doc).doc_id) +register_dataset() diff --git a/ir_datasets/docs/trec-tot-2025.yaml b/ir_datasets/docs/trec-tot-2025.yaml new file mode 100644 index 00000000..c4d6af46 --- /dev/null +++ b/ir_datasets/docs/trec-tot-2025.yaml @@ -0,0 +1,42 @@ +_: + pretty_name: 'TREC Tip-of-the-Tongue' + desc: ' +

+Tip of the tongue: The phenomenon of failing to retrieve something from memory, combined with partial recall and the feeling that retrieval is imminent. More details are available on the official page for the TREC Tip-of-the-Tongue (ToT) Track. +

+' + +2025: + desc: ' +

+Corpus for the TREC 2025 tip-of-the-tongue search track. +

+' + +2025/train: + desc: ' +

+Train query set for the TREC 2025 tip-of-the-tongue search track. +

+' + +2025/dev1: + desc: ' +

+Dev-1 query set for the TREC 2025 tip-of-the-tongue search track (the original 2023 dev set). +

+' + +2025/dev2: + desc: ' +

+Dev-2 query set for the TREC 2025 tip-of-the-tongue search track (the original 2023 test set). +

+' + +2025/dev3: + desc: ' +

+Dev-3 query set for the TREC 2025 tip-of-the-tongue search track (the original 2024 test set). +

+' diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json index ce9313f3..4501fe1b 100644 --- a/ir_datasets/etc/downloads.json +++ b/ir_datasets/etc/downloads.json @@ -6186,6 +6186,59 @@ "cache_path": "trec-tot.zip" } }, + + "trec-tot-2025": { + "trec-tot-2025-offsets.jsonl.gz": { + "url": "https://zenodo.org/records/15356599/files/trec-tot-2025-offsets.jsonl.gz", + "expected_md5": "00678e3155d962bb244e034e6401b79b", + "cache_path": "trec-tot-2025-offsets.jsonl.gz" + }, + "trec-tot-2025-corpus.jsonl.gz": { + "url": "https://zenodo.org/records/15356599/files/trec-tot-2025-corpus.jsonl.gz", + "expected_md5": "a2c82398aa86df6a68c8706b9b462bf2", + "cache_path": "trec-tot-2025-corpus.jsonl.gz" + }, + "train-2025-qrel.txt": { + "url": "https://zenodo.org/records/15356599/files/train-2025-qrel.txt", + "expected_md5": "10a3c727fc5806ec4510f7a071b57cd7", + "cache_path": "train-2025-qrel.txt" + }, + "train-2025-queries.jsonl": { + "url": "https://zenodo.org/records/15356599/files/train-2025-queries.jsonl", + "expected_md5": "288b7707b4e897f7447aac2cc2f613be", + "cache_path": "train-2025-queries.jsonl" + }, + "dev1-2025-qrel.txt": { + "url": "https://zenodo.org/records/15356599/files/dev1-2025-qrel.txt", + "expected_md5": "0c913ce8b5b287c73a6dfac662971e82", + "cache_path": "dev1-2025-qrel.txt" + }, + "dev1-2025-queries.jsonl": { + "url": "https://zenodo.org/records/15356599/files/dev1-2025-queries.jsonl", + "expected_md5": "b87c2f51d058de844e258a69b02e70fc", + "cache_path": "dev1-2025-queries.jsonl" + }, + "dev2-2025-qrel.txt": { + "url": "https://zenodo.org/records/15356599/files/dev2-2025-qrel.txt", + "expected_md5": "4548eb41e639905384aa017c69129bfc", + "cache_path": "dev2-2025-qrel.txt" + }, + "dev2-2025-queries.jsonl": { + "url": "https://zenodo.org/records/15356599/files/dev2-2025-queries.jsonl", + "expected_md5": "b174a128a255e92d0d54b76465d596b5", + "cache_path": "dev2-2025-queries.jsonl" + }, + "dev3-2025-qrel.txt": { + "url": "https://zenodo.org/records/15356599/files/dev3-2025-qrel.txt", + "expected_md5": "48ab0d24a5946861546e54064238477f", + "cache_path": "dev3-2025-qrel.txt" + }, + "dev3-2025-queries.jsonl": { + "url": "https://zenodo.org/records/15356599/files/dev3-2025-queries.jsonl", + "expected_md5": "259c11645694a3c5230b66c7852d4d80", + "cache_path": "dev3-2025-queries.jsonl" + } + }, "tripclick": { "benchmark": { diff --git a/test/trec-tot-2025/test_docs_iter.py b/test/integration/trec_tot_2025/test_docs_iter.py similarity index 100% rename from test/trec-tot-2025/test_docs_iter.py rename to test/integration/trec_tot_2025/test_docs_iter.py diff --git a/test/trec-tot-2025/test_docs_store.py b/test/integration/trec_tot_2025/test_docs_store.py similarity index 100% rename from test/trec-tot-2025/test_docs_store.py rename to test/integration/trec_tot_2025/test_docs_store.py diff --git a/test/trec-tot-2025/test_qrel_iter.py b/test/integration/trec_tot_2025/test_qrel_iter.py similarity index 100% rename from test/trec-tot-2025/test_qrel_iter.py rename to test/integration/trec_tot_2025/test_qrel_iter.py diff --git a/test/trec-tot-2025/test_queries_iter.py b/test/integration/trec_tot_2025/test_queries_iter.py similarity index 100% rename from test/trec-tot-2025/test_queries_iter.py rename to test/integration/trec_tot_2025/test_queries_iter.py From 8cf0fc10550d0672d831c3835093815692640898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maik=20Fr=C3=B6be?= Date: Thu, 8 May 2025 20:12:50 +0200 Subject: [PATCH 09/15] mf --- ir_datasets/datasets/trec_tot_2025.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py index 293026fd..c8ee2e34 100644 --- a/ir_datasets/datasets/trec_tot_2025.py +++ b/ir_datasets/datasets/trec_tot_2025.py @@ -50,7 +50,7 @@ def offsets_iter(self): def docs_dict(self): return PickleLz4FullStore( - path=str(self.__offsets.path().absolute().resolve()) + '.pklz4', + path=str(self.__offsets.path()) + '.pklz4', init_iter_fn=self.offsets_iter, data_cls=JsonlDocumentOffset, lookup_field="doc_id", From 28eec83145f5c08a55c24e6e287789565a8569e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maik=20Fr=C3=B6be?= Date: Thu, 8 May 2025 20:16:47 +0200 Subject: [PATCH 10/15] mf --- ir_datasets/datasets/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py index 80a608e1..3fa77c20 100644 --- a/ir_datasets/datasets/__init__.py +++ b/ir_datasets/datasets/__init__.py @@ -58,4 +58,3 @@ from . import sara from . import trec_tot_2025 -trec_tot_2025.register_dataset() From a1f0796e112385ffbbdbe707a61e45549019acd8 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Fri, 9 May 2025 17:01:45 +0100 Subject: [PATCH 11/15] add metadata --- ir_datasets/etc/metadata.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json index cefa0f0b..b728b88b 100644 --- a/ir_datasets/etc/metadata.json +++ b/ir_datasets/etc/metadata.json @@ -702,6 +702,11 @@ "trec-tot/2023": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}}, "trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}}, + "trec-tot/2025": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, + "trec-tot/2025/dev1": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 142}, "qrels": {"count": 142, "fields": {"relevance": {"counts_by_value": {"1": 142}}}}}, + "trec-tot/2025/dev2": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 143}, "qrels": {"count": 143, "fields": {"relevance": {"counts_by_value": {"1": 143}}}}}, + "trec-tot/2025/dev3": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 536}, "qrels": {"count": 536, "fields": {"relevance": {"counts_by_value": {"1": 536}}}}}, + "trec-tot/2025/train": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 143}, "qrels": {"count": 143, "fields": {"relevance": {"counts_by_value": {"1": 143}}}}}, "tripclick": {"docs": {"count": 1523878, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "tripclick/logs": {"docs": {"count": 5196956, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "qlogs": {"count": 5317350}}, "tripclick/test": {"docs": {"_ref": "tripclick"}, "queries": {"count": 3525}, "scoreddocs": {"count": 3486402}}, From 88715051bdf2505f7f3848cfde8359bcb79d1f1b Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Fri, 9 May 2025 17:19:07 +0100 Subject: [PATCH 
12/15] metadata --- ir_datasets/etc/metadata.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json index b728b88b..03aca199 100644 --- a/ir_datasets/etc/metadata.json +++ b/ir_datasets/etc/metadata.json @@ -578,7 +578,7 @@ "nano-beir/quora": {"docs": {"count": 5046, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 70, "fields": {"relevance": {"counts_by_value": {"1": 70}}}}}, "nano-beir/scidocs": {"docs": {"count": 2210, "fields": {"doc_id": {"max_len": 40, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 244, "fields": {"relevance": {"counts_by_value": {"1": 244}}}}}, "nano-beir/scifact": {"docs": {"count": 2919, "fields": {"doc_id": {"max_len": 9, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 56, "fields": {"relevance": {"counts_by_value": {"1": 56}}}}}, - "nano-beir/webis-touche2020": {"docs": {"count": 5745 , "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}, "queries": {"count": 49}, "qrels": {"count": 932, "fields": {"relevance": {"counts_by_value": {"1": 932}}}}}, + "nano-beir/webis-touche2020": {"docs": {"count": 5745, "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}, "queries": {"count": 49}, "qrels": {"count": 932, "fields": {"relevance": {"counts_by_value": {"1": 932}}}}}, "natural-questions": {"docs": {"count": 28390850, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}}, "natural-questions/dev": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 7830}, "qrels": {"count": 7695, "fields": {"relevance": {"counts_by_value": {"1": 7695}}}}, "scoreddocs": {"count": 973480}}, "natural-questions/train": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 307373}, "qrels": {"count": 152148, "fields": {"relevance": {"counts_by_value": {"1": 152148}}}}, "scoreddocs": {"count": 40374730}}, @@ -702,6 +702,8 @@ "trec-tot/2023": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}}, "trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}}, + "trec-tot/2024": {"docs": {"count": 3185450, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, + "trec-tot/2024/test": {"docs": {"_ref": "trec-tot/2024"}, "queries": {"count": 600}}, "trec-tot/2025": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-tot/2025/dev1": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 142}, "qrels": {"count": 142, "fields": {"relevance": {"counts_by_value": {"1": 142}}}}}, "trec-tot/2025/dev2": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 143}, "qrels": {"count": 143, "fields": {"relevance": {"counts_by_value": {"1": 143}}}}}, From 49287037a46f698770920be7f4b40c72df5cada1 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Fri, 9 May 2025 17:23:05 +0100 Subject: [PATCH 13/15] Update trec_tot_2025.py --- ir_datasets/datasets/trec_tot_2025.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 
deletions(-) diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py index c8ee2e34..1b693bed 100644 --- a/ir_datasets/datasets/trec_tot_2025.py +++ b/ir_datasets/datasets/trec_tot_2025.py @@ -21,19 +21,19 @@ class JsonlDocumentOffset(NamedTuple): offset_end: int -class TrecToT2025Doc(): - def __init__(self, json_doc): - parsed_doc = json.loads(json_doc) - self.doc_id = parsed_doc["id"] - self.title = parsed_doc["title"] - self.url = parsed_doc["url"] - self.text = parsed_doc["text"] +class TrecToT2025Doc(NamedTuple): + doc_id: str + title: str + url: str + text: str + + @staticmethod + def _from_json(self, json_doc): + return TrecToT2025Doc(json_doc["id"], json_doc["title"], json_doc["url"], json_doc["text"]) def default_text(self): return self.title + " " + self.text - def _asdict(self): - return {"doc_id": self.doc_id, "text": self.default_text()} class JsonlWithOffsetsDocsStore(Docstore): def __init__(self, docs, offsets): @@ -71,7 +71,7 @@ def get_many_iter(self, doc_ids): class TrecToT2025DocsStore(JsonlWithOffsetsDocsStore): def get_many_iter(self, doc_ids): for i in super().get_many_iter(doc_ids): - yield TrecToT2025Doc(i) + yield TrecToT2025Doc._from_json(i) class JsonlDocumentsWithOffsets(BaseDocs): @@ -82,10 +82,10 @@ def __init__(self, docs, offsets): def docs_iter(self): with gzip.open(self.__docs.path()) as f: for l in f: - yield TrecToT2025Doc(l) + yield TrecToT2025Doc._from_json(json.loads(l)) def docs_cls(self): - return self._cls + return TrecToT2025Doc def docs_store(self, field='doc_id'): return TrecToT2025DocsStore(self.__docs, self.__offsets) From 944edf175b0f59abb6c3f806a78d821235db7f7f Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Fri, 9 May 2025 17:33:53 +0100 Subject: [PATCH 14/15] fix --- ir_datasets/datasets/trec_tot_2025.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py index 1b693bed..d0015b98 100644 --- a/ir_datasets/datasets/trec_tot_2025.py +++ b/ir_datasets/datasets/trec_tot_2025.py @@ -28,7 +28,7 @@ class TrecToT2025Doc(NamedTuple): text: str @staticmethod - def _from_json(self, json_doc): + def _from_json(json_doc): return TrecToT2025Doc(json_doc["id"], json_doc["title"], json_doc["url"], json_doc["text"]) def default_text(self): From d00d8af0b72d184df93ca00382f8b2705cec10c0 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Fri, 9 May 2025 19:49:51 +0100 Subject: [PATCH 15/15] Update trec_tot_2025.py --- ir_datasets/datasets/trec_tot_2025.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py index d0015b98..c85e1dda 100644 --- a/ir_datasets/datasets/trec_tot_2025.py +++ b/ir_datasets/datasets/trec_tot_2025.py @@ -71,7 +71,7 @@ def get_many_iter(self, doc_ids): class TrecToT2025DocsStore(JsonlWithOffsetsDocsStore): def get_many_iter(self, doc_ids): for i in super().get_many_iter(doc_ids): - yield TrecToT2025Doc._from_json(i) + yield TrecToT2025Doc._from_json(json.loads(i)) class JsonlDocumentsWithOffsets(BaseDocs):
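
A minimal usage sketch, assuming a build of ir_datasets that includes the trec_tot_2025 module registered by this series; it mirrors the lookups exercised by the integration tests above.

    import ir_datasets

    # Corpus-only dataset plus one of the four query/qrel splits registered above.
    corpus = ir_datasets.load("trec-tot/2025")
    train = ir_datasets.load("trec-tot/2025/train")

    # Random-access lookup through the offset-based docs store.
    doc = corpus.docs_store().get("12")
    print(doc.doc_id, doc.title, doc.url)

    # Queries and qrels iterate over the downloaded JSONL and TREC qrel files.
    for query in train.queries_iter():
        print(query.query_id, query.default_text())
        break

    for qrel in train.qrels_iter():
        print(qrel.query_id, qrel.doc_id, qrel.relevance)
        break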