diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py
index 41568aee..3fa77c20 100644
--- a/ir_datasets/datasets/__init__.py
+++ b/ir_datasets/datasets/__init__.py
@@ -56,3 +56,5 @@
 from . import hc4
 from . import neuclir # must be after hc4
 from . import sara
+from . import trec_tot_2025
+
diff --git a/ir_datasets/datasets/trec_tot_2025.py b/ir_datasets/datasets/trec_tot_2025.py
new file mode 100644
index 00000000..c85e1dda
--- /dev/null
+++ b/ir_datasets/datasets/trec_tot_2025.py
@@ -0,0 +1,156 @@
+from ir_datasets import registry
+from ir_datasets.datasets.base import Dataset, YamlDocumentation
+from ir_datasets.util.download import RequestsDownload
+from ir_datasets.formats.base import BaseDocs
+from ir_datasets.indices import Docstore
+from ir_datasets.util import ZipExtractCache, home_path, Cache, DownloadConfig
+from ir_datasets.formats import BaseDocs, TrecQrels, JsonlQueries
+from ir_datasets.indices import PickleLz4FullStore
+import os
+import gzip
+import json
+from tqdm import tqdm
+from typing import NamedTuple
+
+NAME = "trec-tot"
+
+
+class JsonlDocumentOffset(NamedTuple):
+    """Byte range of a single document inside the corpus file."""
+    doc_id: str
+    offset_start: int
+    offset_end: int
+
+
+class TrecToT2025Doc(NamedTuple):
+    """A corpus document consisting of id, title, url and text."""
+    doc_id: str
+    title: str
+    url: str
+    text: str
+
+    @staticmethod
+    def _from_json(json_doc):
+        """Build a document from a parsed JSON record with the keys id, title, url and text."""
+        return TrecToT2025Doc(json_doc["id"], json_doc["title"], json_doc["url"], json_doc["text"])
+
+    def default_text(self):
+        """Return the title and the text joined by a single space."""
+        return self.title + " " + self.text
+
+
+class JsonlWithOffsetsDocsStore(Docstore):
+    """Look up raw documents in the corpus file through a doc_id -> byte offset index."""
+
+    def __init__(self, docs, offsets):
+        self.__docs = docs
+        self.__offsets = offsets
+        # offset index, created lazily by docs_dict()
+        self._docs_dict = None
+        self._id_field = "doc_id"
+
+    def offsets_iter(self):
+        """Yield one JsonlDocumentOffset per line of the gzipped offsets file."""
+        with gzip.open(self.__offsets.path(), "rt", encoding="utf-8") as f:
+            for line in f:
+                entry = json.loads(line)
+                yield JsonlDocumentOffset(doc_id=entry["id"], offset_start=entry["offset_start"], offset_end=entry["offset_end"])
+
+    def docs_dict(self):
+        """Return the offset index, creating it on the first call and reusing it afterwards."""
+        if self._docs_dict is None:
+            self._docs_dict = PickleLz4FullStore(
+                path=str(self.__offsets.path()) + '.pklz4',
+                init_iter_fn=self.offsets_iter,
+                data_cls=JsonlDocumentOffset,
+                lookup_field="doc_id",
+                index_fields=("doc_id",)
+            )
+        return self._docs_dict
+
+    def get_many_iter(self, doc_ids):
+        """Yield the decompressed bytes of every requested document found in the offset index."""
+        offsets = self.docs_dict()
+
+        with open(self.__docs.path(), "rb") as f:
+            # Resolve all ids through the index in one go so that ids without an entry
+            # are skipped instead of aborting the whole lookup.
+            for offset in offsets.get_many_iter(doc_ids):
+                f.seek(offset.offset_start)
+                raw_content_bytes = f.read(offset.offset_end - offset.offset_start)
+                # each byte range is decompressed on its own
+                yield gzip.decompress(raw_content_bytes)
+
+
+class TrecToT2025DocsStore(JsonlWithOffsetsDocsStore):
+    """Docs store that parses the raw JSON bytes into TrecToT2025Doc tuples."""
+
+    def get_many_iter(self, doc_ids):
+        for raw_doc in super().get_many_iter(doc_ids):
+            yield TrecToT2025Doc._from_json(json.loads(raw_doc))
+
+
+class JsonlDocumentsWithOffsets(BaseDocs):
+    """Documents of the gzipped JSONL corpus together with its offsets file."""
+
+    def __init__(self, docs, offsets):
+        self.__docs = docs
+        self.__offsets = offsets
+
+    def docs_iter(self):
+        """Yield every document of the corpus in file order."""
+        with gzip.open(self.__docs.path()) as f:
+            for line in f:
+                yield TrecToT2025Doc._from_json(json.loads(line))
+
+    def docs_cls(self):
+        return TrecToT2025Doc
+
+    def docs_store(self, field='doc_id'):
+        # the offset index is keyed by doc_id only, `field` is not used
+        return TrecToT2025DocsStore(self.__docs, self.__offsets)
+
+    def docs_namespace(self):
+        return f"{NAME}/2025"
+
+    def docs_count(self):
+        return 6407814
+
+    def docs_lang(self):
+        return "en"
+
+
+class TrecToT2025Dataset(Dataset):
+    """Dataset combining the 2025 corpus with optional queries and qrels."""
+
+    def __init__(self, docs_jsonl_file, offset_jsonl_file, queries=None, qrels=None, documentation=None):
+        docs = JsonlDocumentsWithOffsets(docs_jsonl_file, offset_jsonl_file)
+
+        if queries:
+            queries = JsonlQueries(queries, lang='en', mapping={"text": "query", "query_id": "query_id"})
+        if qrels:
+            qrels = TrecQrels(qrels, {0: 'Not Relevant', 1: 'Relevant'})
+
+        super().__init__(docs, queries, qrels, documentation)
+
+
+def register_dataset():
+    """Register the 2025 corpus and its train/dev1/dev2/dev3 splits, unless already registered."""
+    if f"{NAME}/2025" in registry:
+        return
+
+    dlc = DownloadConfig.context("trec-tot-2025", home_path() / NAME / "2025")
+
+    # The descriptions of the 2025 datasets are kept in docs/trec-tot-2025.yaml,
+    # not in docs/{NAME}.yaml.
+    documentation = YamlDocumentation('docs/trec-tot-2025.yaml')
+    doc_offsets = dlc['trec-tot-2025-offsets.jsonl.gz']
+    doc_corpus = dlc['trec-tot-2025-corpus.jsonl.gz']
+    registry.register(f"{NAME}/2025", TrecToT2025Dataset(doc_corpus, doc_offsets, documentation=documentation("2025")))
+    for split in ["train", "dev1", "dev2", "dev3"]:
+        qrels = dlc[split + "-2025-qrel.txt"]
+        queries = dlc[split + "-2025-queries.jsonl"]
+        registry.register(f"{NAME}/2025/{split}", TrecToT2025Dataset(doc_corpus, doc_offsets, queries, qrels, documentation(f"2025/{split}")))
+
+
+register_dataset()
diff --git a/ir_datasets/docs/trec-tot-2025.yaml b/ir_datasets/docs/trec-tot-2025.yaml
new file mode 100644
index 00000000..c4d6af46
--- /dev/null
+++ b/ir_datasets/docs/trec-tot-2025.yaml
@@ -0,0 +1,42 @@
+_:
+  pretty_name: 'TREC Tip-of-the-Tongue'
+  desc: '
+

+Tip of the tongue: The phenomenon of failing to retrieve something from memory, combined with partial recall and the feeling that retrieval is imminent. More details are available on the official page for the TREC Tip-of-the-Tongue (ToT) Track. +

+' + +2025: + desc: ' +

+Corpus for the TREC 2025 tip-of-the-tongue search track. +

+' + +2025/train: + desc: ' +

+Training query set for the TREC 2025 tip-of-the-tongue search track. +

+' + +2025/dev1: + desc: ' +

+Dev-1 query set for the TREC 2025 tip-of-the-tongue search track (the original 2023 dev set). +

+' + +2025/dev2: + desc: ' +

+Dev-2 query set for the TREC 2025 tip-of-the-tongue search track (the original 2023 test set). +

+' + +2025/dev3: + desc: ' +

+Dev-3 query set for the TREC 2025 tip-of-the-tongue search track (the original 2024 test set). +

+' diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json index ce9313f3..4501fe1b 100644 --- a/ir_datasets/etc/downloads.json +++ b/ir_datasets/etc/downloads.json @@ -6186,6 +6186,59 @@ "cache_path": "trec-tot.zip" } }, + + "trec-tot-2025": { + "trec-tot-2025-offsets.jsonl.gz": { + "url": "https://zenodo.org/records/15356599/files/trec-tot-2025-offsets.jsonl.gz", + "expected_md5": "00678e3155d962bb244e034e6401b79b", + "cache_path": "trec-tot-2025-offsets.jsonl.gz" + }, + "trec-tot-2025-corpus.jsonl.gz": { + "url": "https://zenodo.org/records/15356599/files/trec-tot-2025-corpus.jsonl.gz", + "expected_md5": "a2c82398aa86df6a68c8706b9b462bf2", + "cache_path": "trec-tot-2025-corpus.jsonl.gz" + }, + "train-2025-qrel.txt": { + "url": "https://zenodo.org/records/15356599/files/train-2025-qrel.txt", + "expected_md5": "10a3c727fc5806ec4510f7a071b57cd7", + "cache_path": "train-2025-qrel.txt" + }, + "train-2025-queries.jsonl": { + "url": "https://zenodo.org/records/15356599/files/train-2025-queries.jsonl", + "expected_md5": "288b7707b4e897f7447aac2cc2f613be", + "cache_path": "train-2025-queries.jsonl" + }, + "dev1-2025-qrel.txt": { + "url": "https://zenodo.org/records/15356599/files/dev1-2025-qrel.txt", + "expected_md5": "0c913ce8b5b287c73a6dfac662971e82", + "cache_path": "dev1-2025-qrel.txt" + }, + "dev1-2025-queries.jsonl": { + "url": "https://zenodo.org/records/15356599/files/dev1-2025-queries.jsonl", + "expected_md5": "b87c2f51d058de844e258a69b02e70fc", + "cache_path": "dev1-2025-queries.jsonl" + }, + "dev2-2025-qrel.txt": { + "url": "https://zenodo.org/records/15356599/files/dev2-2025-qrel.txt", + "expected_md5": "4548eb41e639905384aa017c69129bfc", + "cache_path": "dev2-2025-qrel.txt" + }, + "dev2-2025-queries.jsonl": { + "url": "https://zenodo.org/records/15356599/files/dev2-2025-queries.jsonl", + "expected_md5": "b174a128a255e92d0d54b76465d596b5", + "cache_path": "dev2-2025-queries.jsonl" + }, + "dev3-2025-qrel.txt": { + "url": 
"https://zenodo.org/records/15356599/files/dev3-2025-qrel.txt", + "expected_md5": "48ab0d24a5946861546e54064238477f", + "cache_path": "dev3-2025-qrel.txt" + }, + "dev3-2025-queries.jsonl": { + "url": "https://zenodo.org/records/15356599/files/dev3-2025-queries.jsonl", + "expected_md5": "259c11645694a3c5230b66c7852d4d80", + "cache_path": "dev3-2025-queries.jsonl" + } + }, "tripclick": { "benchmark": { diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json index cefa0f0b..03aca199 100644 --- a/ir_datasets/etc/metadata.json +++ b/ir_datasets/etc/metadata.json @@ -578,7 +578,7 @@ "nano-beir/quora": {"docs": {"count": 5046, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 70, "fields": {"relevance": {"counts_by_value": {"1": 70}}}}}, "nano-beir/scidocs": {"docs": {"count": 2210, "fields": {"doc_id": {"max_len": 40, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 244, "fields": {"relevance": {"counts_by_value": {"1": 244}}}}}, "nano-beir/scifact": {"docs": {"count": 2919, "fields": {"doc_id": {"max_len": 9, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 56, "fields": {"relevance": {"counts_by_value": {"1": 56}}}}}, - "nano-beir/webis-touche2020": {"docs": {"count": 5745 , "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}, "queries": {"count": 49}, "qrels": {"count": 932, "fields": {"relevance": {"counts_by_value": {"1": 932}}}}}, + "nano-beir/webis-touche2020": {"docs": {"count": 5745, "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}, "queries": {"count": 49}, "qrels": {"count": 932, "fields": {"relevance": {"counts_by_value": {"1": 932}}}}}, "natural-questions": {"docs": {"count": 28390850, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}}, "natural-questions/dev": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 7830}, "qrels": {"count": 7695, "fields": {"relevance": {"counts_by_value": {"1": 7695}}}}, 
"scoreddocs": {"count": 973480}}, "natural-questions/train": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 307373}, "qrels": {"count": 152148, "fields": {"relevance": {"counts_by_value": {"1": 152148}}}}, "scoreddocs": {"count": 40374730}}, @@ -702,6 +702,13 @@ "trec-tot/2023": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}}, "trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}}, + "trec-tot/2024": {"docs": {"count": 3185450, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, + "trec-tot/2024/test": {"docs": {"_ref": "trec-tot/2024"}, "queries": {"count": 600}}, + "trec-tot/2025": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, + "trec-tot/2025/dev1": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 142}, "qrels": {"count": 142, "fields": {"relevance": {"counts_by_value": {"1": 142}}}}}, + "trec-tot/2025/dev2": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 143}, "qrels": {"count": 143, "fields": {"relevance": {"counts_by_value": {"1": 143}}}}}, + "trec-tot/2025/dev3": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 536}, "qrels": {"count": 536, "fields": {"relevance": {"counts_by_value": {"1": 536}}}}}, + "trec-tot/2025/train": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 143}, "qrels": {"count": 143, "fields": {"relevance": 
{"counts_by_value": {"1": 143}}}}},
   "tripclick": {"docs": {"count": 1523878, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
   "tripclick/logs": {"docs": {"count": 5196956, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "qlogs": {"count": 5317350}},
   "tripclick/test": {"docs": {"_ref": "tripclick"}, "queries": {"count": 3525}, "scoreddocs": {"count": 3486402}},
diff --git a/test/integration/trec_tot_2025/test_docs_iter.py b/test/integration/trec_tot_2025/test_docs_iter.py
new file mode 100644
index 00000000..f50c1528
--- /dev/null
+++ b/test/integration/trec_tot_2025/test_docs_iter.py
@@ -0,0 +1,44 @@
+import unittest
+
+
+def load_dataset():
+    """Load the trec-tot/2025 dataset; ir_datasets is imported on demand."""
+    import ir_datasets
+    return ir_datasets.load("trec-tot/2025")
+
+
+def load_doc_number(num):
+    """Return the document at zero-based position num of docs_iter (None if never reached)."""
+    for position, doc in enumerate(load_dataset().docs_iter()):
+        if position == num:
+            return doc
+    return None
+
+
+class TestDocsIter(unittest.TestCase):
+    """Checks documents obtained by iterating over the corpus."""
+
+    def test_dataset_can_be_loaded(self):
+        self.assertIsNotNone(load_dataset())
+
+    def test_first_doc(self):
+        doc = load_doc_number(0)
+
+        self.assertIsNotNone(doc)
+        self.assertEqual("12", doc.doc_id)
+        self.assertEqual("https://en.wikipedia.org/wiki/Anarchism", doc.url)
+        self.assertEqual("Anarchism", doc.title)
+        self.assertIn("a political philosophy and movement that is skeptical", doc.text)
+        self.assertIn("a political philosophy and movement that is skeptical", doc.default_text())
+        self.assertIn("Anarchism Anarchism is a political philosophy and movement that is skeptical", doc.default_text())
+
+    def test_third_doc(self):
+        # position 3 is zero-based, i.e. the fourth document yielded by docs_iter
+        doc = load_doc_number(3)
+
+        self.assertIsNotNone(doc)
+        self.assertEqual("303", doc.doc_id)
+        self.assertEqual("https://en.wikipedia.org/wiki/Alabama", doc.url)
+        self.assertEqual("Alabama", doc.title)
+        self.assertIn("Alabama's economy in the 21st century is based on automotive", doc.text)
+        self.assertIn("Alabama's economy in the 21st century is based on automotive", doc.default_text())
diff --git a/test/integration/trec_tot_2025/test_docs_store.py b/test/integration/trec_tot_2025/test_docs_store.py
new file mode 100644
index 00000000..276a43eb
--- /dev/null
+++ b/test/integration/trec_tot_2025/test_docs_store.py
@@ -0,0 +1,46 @@
+import unittest
+
+
+def load_docs_store():
+    """Return the docs store of the trec-tot/2025 dataset; ir_datasets is imported on demand."""
+    import ir_datasets
+    dataset = ir_datasets.load("trec-tot/2025")
+    return dataset.docs_store()
+
+
+class TestDocsStore(unittest.TestCase):
+    """Checks random access to documents by doc_id."""
+
+    def test_docs_store_can_be_loaded(self):
+        self.assertIsNotNone(load_docs_store())
+
+    def test_first_doc(self):
+        doc = load_docs_store().get("12")
+
+        self.assertIsNotNone(doc)
+        self.assertEqual("12", doc.doc_id)
+        self.assertEqual("https://en.wikipedia.org/wiki/Anarchism", doc.url)
+        self.assertEqual("Anarchism", doc.title)
+        self.assertIn("a political philosophy and movement that is skeptical", doc.text)
+        self.assertIn("a political philosophy and movement that is skeptical", doc.default_text())
+        self.assertIn("Anarchism Anarchism is a political philosophy and movement that is skeptical", doc.default_text())
+
+    def test_third_doc(self):
+        doc = load_docs_store().get("303")
+
+        self.assertIsNotNone(doc)
+        self.assertEqual("303", doc.doc_id)
+        self.assertEqual("https://en.wikipedia.org/wiki/Alabama", doc.url)
+        self.assertEqual("Alabama", doc.title)
+        self.assertIn("Alabama's economy in the 21st century is based on automotive", doc.text)
+        self.assertIn("Alabama's economy in the 21st century is based on automotive", doc.default_text())
+
+    def test_some_random_doc(self):
+        doc = load_docs_store().get("6596604")
+
+        self.assertIsNotNone(doc)
+        self.assertEqual("6596604", doc.doc_id)
+        self.assertEqual("https://en.wikipedia.org/wiki/Radio%20Reloj", doc.url)
+        self.assertEqual("Radio Reloj", doc.title)
+        self.assertIn("Radio Reloj (Spanish for Radio Clock) is an internationally broadcast Spanish-language radio station", doc.text)
+        self.assertIn("Radio Reloj (Spanish for Radio Clock) is an internationally broadcast Spanish-language radio station", doc.default_text())
diff --git a/test/integration/trec_tot_2025/test_qrel_iter.py b/test/integration/trec_tot_2025/test_qrel_iter.py
new file mode 100644
index 00000000..699b9127
--- /dev/null
+++ b/test/integration/trec_tot_2025/test_qrel_iter.py
@@ -0,0 +1,55 @@
+import unittest
+
+
+def load_dataset(dataset_id):
+    """Load the dataset with the given id; ir_datasets is imported on demand."""
+    import ir_datasets
+    return ir_datasets.load(dataset_id)
+
+
+def load_qrel_number(dataset_id, num):
+    """Return the qrel at zero-based position num of qrels_iter (None if never reached)."""
+    for position, qrel in enumerate(load_dataset(dataset_id).qrels_iter()):
+        if position == num:
+            return qrel
+    return None
+
+
+class TestQrelIter(unittest.TestCase):
+    """Checks the qrels of the train and dev splits."""
+
+    def test_train_dataset_can_be_loaded(self):
+        self.assertIsNotNone(load_dataset("trec-tot/2025/train"))
+
+    def test_dev1_dataset_can_be_loaded(self):
+        self.assertIsNotNone(load_dataset("trec-tot/2025/dev1"))
+
+    def test_dev2_dataset_can_be_loaded(self):
+        self.assertIsNotNone(load_dataset("trec-tot/2025/dev2"))
+
+    def test_dev3_dataset_can_be_loaded(self):
+        self.assertIsNotNone(load_dataset("trec-tot/2025/dev3"))
+
+    def test_train_qrel_iter(self):
+        qrel = load_qrel_number("trec-tot/2025/train", 12)
+        self.assertEqual("1014", qrel.query_id)
+        self.assertEqual("46264411", qrel.doc_id)
+        self.assertEqual(1, qrel.relevance)
+
+    def test_dev1_qrel_iter(self):
+        qrel = load_qrel_number("trec-tot/2025/dev1", 12)
+        self.assertEqual("898", qrel.query_id)
+        self.assertEqual("3761238", qrel.doc_id)
+        self.assertEqual(1, qrel.relevance)
+
+    def test_dev2_qrel_iter(self):
+        qrel = load_qrel_number("trec-tot/2025/dev2", 12)
+        self.assertEqual("632", qrel.query_id)
+        self.assertEqual("3261733", qrel.doc_id)
+        self.assertEqual(1, qrel.relevance)
+
+    def test_dev3_qrel_iter(self):
+        qrel = load_qrel_number("trec-tot/2025/dev3", 12)
+        self.assertEqual("2014", qrel.query_id)
+        self.assertEqual("446518", qrel.doc_id)
+        self.assertEqual(1, qrel.relevance)
diff --git a/test/integration/trec_tot_2025/test_queries_iter.py b/test/integration/trec_tot_2025/test_queries_iter.py
new file mode 100644
index 00000000..f30ab709
--- /dev/null
+++ b/test/integration/trec_tot_2025/test_queries_iter.py
@@ -0,0 +1,79 @@
+import unittest
+
+
+def load_dataset(dataset_id):
+    """Load the dataset with the given id; ir_datasets is imported on demand."""
+    import ir_datasets
+    return ir_datasets.load(dataset_id)
+
+
+def load_query_number(dataset_id, num):
+    """Return the query at zero-based position num of queries_iter (None if never reached)."""
+    for position, query in enumerate(load_dataset(dataset_id).queries_iter()):
+        if position == num:
+            return query
+    return None
+
+
+class TestQueriesIter(unittest.TestCase):
+    """Checks the queries of the train and dev splits."""
+
+    def test_train_dataset_can_be_loaded(self):
+        self.assertIsNotNone(load_dataset("trec-tot/2025/train"))
+
+    def test_dev1_dataset_can_be_loaded(self):
+        self.assertIsNotNone(load_dataset("trec-tot/2025/dev1"))
+
+    def test_dev2_dataset_can_be_loaded(self):
+        self.assertIsNotNone(load_dataset("trec-tot/2025/dev2"))
+
+    def test_dev3_dataset_can_be_loaded(self):
+        self.assertIsNotNone(load_dataset("trec-tot/2025/dev3"))
+
+    def test_query_from_train_dataset_can_be_loaded_01(self):
+        query = load_query_number("trec-tot/2025/train", 2)
+        self.assertIsNotNone(query)
+        self.assertEqual("950", query.query_id)
+        self.assertIn("two girls who run away", query.default_text())
+
+    def test_query_from_train_dataset_can_be_loaded_02(self):
+        query = load_query_number("trec-tot/2025/train", 25)
+        self.assertIsNotNone(query)
+        self.assertEqual("484", query.query_id)
+        self.assertIn("Main character is a famous person like a celebrity", query.default_text())
+
+    def test_query_from_dev1_dataset_can_be_loaded_01(self):
+        query = load_query_number("trec-tot/2025/dev1", 2)
+        self.assertIsNotNone(query)
+        self.assertEqual("473", query.query_id)
+        self.assertIn("possibly a ghost killing in an old house", query.default_text())
+
+    def test_query_from_dev1_dataset_can_be_loaded_02(self):
+        query = load_query_number("trec-tot/2025/dev1", 25)
+        self.assertIsNotNone(query)
+        self.assertEqual("153", query.query_id)
+        self.assertIn("Martial arts movie where the human is fighting aliens", query.default_text())
+
+    def test_query_from_dev2_dataset_can_be_loaded_01(self):
+        query = load_query_number("trec-tot/2025/dev2", 2)
+        self.assertIsNotNone(query)
+        self.assertEqual("477", query.query_id)
+        self.assertIn("Pretty sure it was a comedy", query.default_text())
+
+    def test_query_from_dev2_dataset_can_be_loaded_02(self):
+        query = load_query_number("trec-tot/2025/dev2", 25)
+        self.assertIsNotNone(query)
+        self.assertEqual("873", query.query_id)
+        self.assertIn("I remember there were 2 siblings involved in the movie", query.default_text())
+
+    def test_query_from_dev3_dataset_can_be_loaded_01(self):
+        query = load_query_number("trec-tot/2025/dev3", 2)
+        self.assertIsNotNone(query)
+        self.assertEqual("2003", query.query_id)
+        self.assertIn("I remember a scene where the bell tower guy and the soldier had to sneak into this hidden place", query.default_text())
+
+    def test_query_from_dev3_dataset_can_be_loaded_02(self):
+        query = load_query_number("trec-tot/2025/dev3", 25)
+        self.assertIsNotNone(query)
+        self.assertEqual("2028", query.query_id)
+        self.assertIn("The place had this weird energy source", query.default_text())