From 153b592a7b379138bad7e463ddae0654a006c45c Mon Sep 17 00:00:00 2001 From: Rodrigo Duarte <52805238+RodrigDuarte@users.noreply.github.com> Date: Sun, 21 Sep 2025 18:48:54 +0100 Subject: [PATCH 1/3] Added pt-image-ir-dataset --- ir_datasets/datasets/__init__.py | 6 +- ir_datasets/datasets/pt_image_ir_dataset.py | 120 ++++++++++++++++++++ ir_datasets/docs/pt-image-ir-dataset.yaml | 35 ++++++ ir_datasets/etc/downloads.json | 22 ++++ test/integration/pt_image_ir_dataset.py | 103 +++++++++++++++++ 5 files changed, 283 insertions(+), 3 deletions(-) create mode 100644 ir_datasets/datasets/pt_image_ir_dataset.py create mode 100644 ir_datasets/docs/pt-image-ir-dataset.yaml create mode 100644 test/integration/pt_image_ir_dataset.py diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py index 3fa77c20..92c314de 100644 --- a/ir_datasets/datasets/__init__.py +++ b/ir_datasets/datasets/__init__.py @@ -38,6 +38,7 @@ from . import natural_questions from . import nyt from . import pmc +from . import pt_image_ir_dataset from . import touche_image from . import touche # must be after argsme,clueweb12,touche_image from . import trec_arabic @@ -52,9 +53,8 @@ from . import wikiclir from . import wikir from . import trec_fair -from . import trec_cast # must be after wapo,car,msmarco_passage +from . import trec_cast # must be after wapo,car,msmarco_passage from . import hc4 -from . import neuclir # must be after hc4 +from . import neuclir # must be after hc4 from . import sara from . 
import trec_tot_2025 - diff --git a/ir_datasets/datasets/pt_image_ir_dataset.py b/ir_datasets/datasets/pt_image_ir_dataset.py new file mode 100644 index 00000000..b6fadc42 --- /dev/null +++ b/ir_datasets/datasets/pt_image_ir_dataset.py @@ -0,0 +1,120 @@ +import ir_datasets + +from ir_datasets.formats import TsvDocs +from ir_datasets.formats import TrecQrels +from ir_datasets.formats import TsvQueries +from ir_datasets.formats.tsv import _TsvBase +from ir_datasets.formats.base import BaseQueries +from ir_datasets.formats.base import GenericQuery + +from .base import Dataset +from .base import YamlDocumentation + +from ir_datasets.util import DownloadConfig + +from typing import NamedTuple + +NAME = "pt-image-ir-dataset" + + +class PtImageIrArticle(NamedTuple): + doc_id: str + url: str + title: str + text: str # content field + date: str + images: str + + +class PtImageIrImage(NamedTuple): + doc_id: str + text: str # url field + + +# Custom TsvQueries class that supports skipping the first line (header) +class TsvQueriesWithHeader(TsvQueries): + def __init__( + self, + queries_dlc, + query_cls=None, + namespace=None, + lang=None, + skip_first_line=False, + ): + if query_cls is None: + query_cls = GenericQuery + # Call the _TsvBase constructor directly with skip_first_line + _TsvBase.__init__( + self, queries_dlc, query_cls, "queries", skip_first_line=skip_first_line + ) + BaseQueries.__init__(self) + self._queries_namespace = namespace + self._queries_lang = lang + + +# What do the relevance levels in qrels mean? +QREL_DEFS = { + 1: "relevant - the image is relevant to the query", + 0: "not relevant - the image is not relevant to the query", +} + +# This message is shown to the user before downloads are started +DUA = ( + "This work is licensed under the Creative Commons Attribution 4.0 International License. " + "To view a copy of this license, visit https://creativecommons.org/licenses/by/4.0/. 
" + "By using this dataset, you agree to the terms and conditions of this license." +) + + +def _init(): + # The directory where this dataset's data files will be stored + base_path = ir_datasets.util.home_path() / NAME + + # Load an object that is used for providing the documentation + documentation = YamlDocumentation(f"docs/{NAME}.yaml") + + # A reference to the downloads file, under the key "pt-image-ir-dataset". (DLC stands for DownLoadable Content) + dlc = DownloadConfig.context(NAME, base_path, dua=DUA) + + # How to process the documents (articles). Since they are in a TSV format with 6 fields, we'll use TsvDocs with a custom doc class. + articles = TsvDocs( + dlc["articles"], + doc_cls=PtImageIrArticle, + namespace=NAME, + lang="pt", + count_hint=4678, + skip_first_line=True, # Skip header row + ) + + # How to process the images. TSV format with 2 fields, using a custom doc class. + images = TsvDocs( + dlc["images"], + doc_cls=PtImageIrImage, + namespace=f"{NAME}/images", + lang="pt", + count_hint=42333, + skip_first_line=True, # Skip header row + ) + + # How to process the queries. Using the custom class that can skip the header. 
+ queries = TsvQueriesWithHeader( + dlc["queries"], namespace=NAME, lang="pt", skip_first_line=True + ) + + # Qrels: The qrels file is in the TREC format, so we'll use TrecQrels to process them + qrels = TrecQrels(dlc["qrels"], QREL_DEFS) + + # Package the docs, queries, qrels, and documentation into a Dataset object + dataset = Dataset(articles, queries, qrels, documentation("_")) + + # Also create a dataset just for images + images_dataset = Dataset(images, queries, qrels, documentation("images")) + + # Register the dataset in ir_datasets + ir_datasets.registry.register(NAME, dataset) + ir_datasets.registry.register(f"{NAME}/images", images_dataset) + + return dataset, images_dataset + + +dataset, images_dataset = _init() diff --git a/ir_datasets/docs/pt-image-ir-dataset.yaml b/ir_datasets/docs/pt-image-ir-dataset.yaml new file mode 100644 index 00000000..8e5acab4 --- /dev/null +++ b/ir_datasets/docs/pt-image-ir-dataset.yaml @@ -0,0 +1,35 @@ +_: # matches documentation key above + pretty_name: 'PT Image IR Dataset' # a more human-readable way to present this dataset than the dataset-id + desc: ' +

+A Dataset for Image Information Retrieval in European Portuguese. The data is sourced from the +Portuguese Presidency website. It contains 4,678 articles, 42,333 images, and 80 queries related to the Portuguese Presidency. Over 5,000 images were annotated by three annotators. +

+

+The dataset includes: +

+ +

+The dataset was annotated manually by three annotators following specific annotation rules. The relevance judgments are in TREC format, where each line contains a query ID, a zero, an image ID, and a relevance score (0 or 1). +

+ +' + + +images: + pretty_name: 'PT Image IR Dataset (Images only)' + desc: ' +

+Image collection from the PT Image IR dataset. Contains 42,333 images from the Portuguese Presidency website. +

+

+This variant provides access to only the image documents from the full PT Image IR dataset, useful for image-focused retrieval tasks. +

+' \ No newline at end of file diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json index c65efff3..bcc632b3 100644 --- a/ir_datasets/etc/downloads.json +++ b/ir_datasets/etc/downloads.json @@ -6490,5 +6490,27 @@ "expected_md5": "49589ab65d1eaf78dbbadfc5ae56ef72", "cache_path": "qrels.txt" } + }, + "pt-image-ir-dataset": { + "articles": { + "url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/articles.tsv", + "expected_md5": "ebbce9e470f683918d526b44849ad97c", + "cache_path": "articles.tsv" + }, + "images": { + "url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/images.tsv", + "expected_md5": "eaefd26a5b2ba1e18c48715d3363d8a1", + "cache_path": "images.tsv" + }, + "queries": { + "url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/queries.tsv", + "expected_md5": "2e094149f0ba84e2eb8d5dedc574c3e2", + "cache_path": "queries.tsv" + }, + "qrels": { + "url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/qrels.txt", + "expected_md5": "53187432192e9989b913e5c3259322ca", + "cache_path": "qrels.txt" + } } } diff --git a/test/integration/pt_image_ir_dataset.py b/test/integration/pt_image_ir_dataset.py new file mode 100644 index 00000000..f5ba451f --- /dev/null +++ b/test/integration/pt_image_ir_dataset.py @@ -0,0 +1,103 @@ +import unittest +import ir_datasets + +from ir_datasets.formats import TrecQrel +from ir_datasets.formats import GenericQuery + +from test.integration.base import DatasetIntegrationTest + +from ir_datasets.datasets.pt_image_ir_dataset import PtImageIrImage + + +class TestPtImageIr(DatasetIntegrationTest): + def test_articles(self): + # Test that the dataset 'pt-image-ir-dataset' has 4743 articles (excluding header) + # Just test the count and basic structure + docs = list(ir_datasets.load("pt-image-ir-dataset").docs_iter()) + self.assertEqual(len(docs), 4743) + + # Test first document structure + first_doc = docs[0] + 
self.assertEqual(first_doc.doc_id, "art001") + self.assertEqual(first_doc.date, "2023-12-01") + self.assertTrue(first_doc.title.startswith("Comemorações")) + self.assertTrue(first_doc.text.startswith("O Presidente")) + self.assertTrue("img00001" in first_doc.images) + + def test_images(self): + # Test that the dataset 'pt-image-ir-dataset/images' has 42920 images (excluding header) + # Testing start (index 0), middle (index 21459), and end (index 42919) entries + self._test_docs( + "pt-image-ir-dataset/images", + count=42920, + items={ + 0: PtImageIrImage( + doc_id="img00001", + text="https://www.presidencia.pt/media/bspfpsfp/231201-prmrs-mfl-0461-4542.jpg", + ), + 21459: PtImageIrImage( + doc_id="img21460", + text="https://www.presidencia.pt/media/c5wbrtqc/191219-prmrs-ro-0017-8746.jpg", + ), + 42919: PtImageIrImage( + doc_id="img42920", + text="https://www.presidencia.pt/media/dw1kvy3f/170602-prmrs-ro-0002-1624.jpg", + ), + }, + ) + + def test_queries(self): + # Test that the dataset 'pt-image-ir-dataset' has 80 queries (excluding header) + # Testing start (index 0), middle (index 39), and end (index 79) entries + self._test_queries( + "pt-image-ir-dataset", + count=80, + items={ + 0: GenericQuery("q01", "Emoções de tristeza em rostos"), + 39: GenericQuery("q40", "Brexit"), + 79: GenericQuery("q80", "Algarve"), + }, + ) + + def test_qrels(self): + # Test that the dataset 'pt-image-ir-dataset' has 5201 qrels + # Testing start (index 0), middle (index 2600), and end (index 5200) entries + self._test_qrels( + "pt-image-ir-dataset", + count=5201, + items={ + 0: TrecQrel("q01", "img40494", 0, "0"), + 2600: TrecQrel("q40", "img22242", 0, "0"), + 5200: TrecQrel("q80", "img24820", 1, "0"), + }, + ) + + def test_images_qrels(self): + # Test qrels for the images-only dataset variant + # Should have the same qrels as the main dataset + self._test_qrels( + "pt-image-ir-dataset/images", + count=5201, + items={ + 0: TrecQrel("q01", "img40494", 0, "0"), + 2600: TrecQrel("q40", 
"img22242", 0, "0"), + 5200: TrecQrel("q80", "img24820", 1, "0"), + }, + ) + + def test_images_queries(self): + # Test queries for the images-only dataset variant + # Should have the same queries as the main dataset + self._test_queries( + "pt-image-ir-dataset/images", + count=80, + items={ + 0: GenericQuery("q01", "Emoções de tristeza em rostos"), + 39: GenericQuery("q40", "Brexit"), + 79: GenericQuery("q80", "Algarve"), + }, + ) + + +if __name__ == "__main__": + unittest.main() From 91de96e1150ab2d19ac6ae591126d14c6f5f69ec Mon Sep 17 00:00:00 2001 From: Rodrigo Duarte <52805238+RodrigDuarte@users.noreply.github.com> Date: Sun, 21 Sep 2025 18:48:54 +0100 Subject: [PATCH 2/3] Removed extra category --- ir_datasets/docs/pt-image-ir-dataset.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/ir_datasets/docs/pt-image-ir-dataset.yaml b/ir_datasets/docs/pt-image-ir-dataset.yaml index 8e5acab4..95bf49e3 100644 --- a/ir_datasets/docs/pt-image-ir-dataset.yaml +++ b/ir_datasets/docs/pt-image-ir-dataset.yaml @@ -21,15 +21,3 @@ The dataset was annotated manually by three annotators following specific annota
  • Dataset Repository
  • ' - - -images: - pretty_name: 'PT Image IR Dataset (Images only)' - desc: ' -

    -Image collection from the PT Image IR dataset. Contains 42,333 images from the Portuguese Presidency website. -

    -

    -This variant provides access to only the image documents from the full PT Image IR dataset, useful for image-focused retrieval tasks. -

    -' \ No newline at end of file From 68f323c4a3bcdf2231e20994e3229d8f0bfc765b Mon Sep 17 00:00:00 2001 From: Rodrigo Duarte <52805238+RodrigDuarte@users.noreply.github.com> Date: Sun, 21 Sep 2025 18:48:54 +0100 Subject: [PATCH 3/3] Brought back original formatting --- ir_datasets/datasets/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py index 92c314de..06c211ae 100644 --- a/ir_datasets/datasets/__init__.py +++ b/ir_datasets/datasets/__init__.py @@ -53,8 +53,9 @@ from . import wikiclir from . import wikir from . import trec_fair -from . import trec_cast # must be after wapo,car,msmarco_passage +from . import trec_cast # must be after wapo,car,msmarco_passage from . import hc4 -from . import neuclir # must be after hc4 +from . import neuclir # must be after hc4 from . import sara from . import trec_tot_2025 +