From 153b592a7b379138bad7e463ddae0654a006c45c Mon Sep 17 00:00:00 2001
From: Rodrigo Duarte <52805238+RodrigDuarte@users.noreply.github.com>
Date: Sun, 21 Sep 2025 18:48:54 +0100
Subject: [PATCH 1/3] Added pt-image-ir-dataset

---
 ir_datasets/datasets/__init__.py            |   6 +-
 ir_datasets/datasets/pt_image_ir_dataset.py | 120 ++++++++++++++++++++
 ir_datasets/docs/pt-image-ir-dataset.yaml   |  35 ++++++
 ir_datasets/etc/downloads.json              |  22 ++++
 test/integration/pt_image_ir_dataset.py     | 103 +++++++++++++++++
 5 files changed, 283 insertions(+), 3 deletions(-)
 create mode 100644 ir_datasets/datasets/pt_image_ir_dataset.py
 create mode 100644 ir_datasets/docs/pt-image-ir-dataset.yaml
 create mode 100644 test/integration/pt_image_ir_dataset.py

diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py
index 3fa77c20..92c314de 100644
--- a/ir_datasets/datasets/__init__.py
+++ b/ir_datasets/datasets/__init__.py
@@ -38,6 +38,7 @@
 from . import natural_questions
 from . import nyt
 from . import pmc
+from . import pt_image_ir_dataset
 from . import touche_image
 from . import touche  # must be after argsme,clueweb12,touche_image
 from . import trec_arabic
@@ -52,9 +53,8 @@
 from . import wikiclir
 from . import wikir
 from . import trec_fair
-from . import trec_cast # must be after wapo,car,msmarco_passage
+from . import trec_cast  # must be after wapo,car,msmarco_passage
 from . import hc4
-from . import neuclir # must be after hc4
+from . import neuclir  # must be after hc4
 from . import sara
 from . import trec_tot_2025
-
diff --git a/ir_datasets/datasets/pt_image_ir_dataset.py b/ir_datasets/datasets/pt_image_ir_dataset.py
new file mode 100644
index 00000000..b6fadc42
--- /dev/null
+++ b/ir_datasets/datasets/pt_image_ir_dataset.py
@@ -0,0 +1,120 @@
+import ir_datasets
+
+from ir_datasets.formats import TsvDocs
+from ir_datasets.formats import TrecQrels
+from ir_datasets.formats import TsvQueries
+from ir_datasets.formats.tsv import _TsvBase
+from ir_datasets.formats.base import BaseQueries
+from ir_datasets.formats.base import GenericQuery
+
+from .base import Dataset
+from .base import YamlDocumentation
+
+from ir_datasets.util import DownloadConfig
+
+from typing import NamedTuple
+
+NAME = "pt-image-ir-dataset"
+
+
+class PtImageIrArticle(NamedTuple):
+    doc_id: str  # article identifier (the integration test expects "art001" first)
+    url: str  # article URL
+    title: str  # article title
+    text: str  # article body (the "content" field of the TSV)
+    date: str  # kept as a plain string; the integration test expects e.g. "2023-12-01"
+    images: str  # associated images as one raw string (test checks it contains "img00001")
+
+
+class PtImageIrImage(NamedTuple):
+    doc_id: str  # image identifier, e.g. "img00001"
+    text: str  # image URL (the "url" field of the TSV), exposed as the text field
+
+
+# TsvQueries variant whose constructor accepts skip_first_line, so a header row can be skipped
+class TsvQueriesWithHeader(TsvQueries):
+    def __init__(
+        self,
+        queries_dlc,
+        query_cls=None,
+        namespace=None,
+        lang=None,
+        skip_first_line=False,
+    ):
+        if query_cls is None:
+            query_cls = GenericQuery
+        # TsvQueries.__init__ is not called; both bases are initialised directly so skip_first_line reaches _TsvBase
+        _TsvBase.__init__(
+            self, queries_dlc, query_cls, "queries", skip_first_line=skip_first_line
+        )
+        BaseQueries.__init__(self)
+        self._queries_namespace = namespace  # NOTE(review): presumably mirrors what TsvQueries.__init__ sets — confirm
+        self._queries_lang = lang
+
+
+# Human-readable meaning of each relevance level used in the qrels (binary judgments)
+QREL_DEFS = {
+    1: "relevant - the image is relevant to the query",
+    0: "not relevant - the image is not relevant to the query",
+}
+
+# Data usage notice, passed as dua= to DownloadConfig.context() in _init(); shown to the user before downloads are started
+DUA = (
+    "This work is licensed under the Creative Commons Attribution 4.0 International License. "
+    "To view a copy of this license, visit https://creativecommons.org/licenses/by/4.0/. "
+    "By using this dataset, you agree to the terms and conditions of this license."
+)
+
+
+def _init():
+    # The directory where this dataset's data files will be stored
+    base_path = ir_datasets.util.home_path() / NAME
+
+    # Load an object that is used for providing the documentation
+    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
+
+    # A reference to the downloads file, under the key NAME ("pt-image-ir-dataset"). (DLC stands for DownLoadable Content)
+    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
+
+    # How to process the documents (articles). Since they are in a TSV format with 6 fields, we'll use TsvDocs with custom doc class.
+    articles = TsvDocs(
+        dlc["articles"],
+        doc_cls=PtImageIrArticle,
+        namespace=NAME,
+        lang="pt",
+        count_hint=4678,  # NOTE(review): the integration test asserts 4743 articles — confirm which is current
+        skip_first_line=True,  # Skip header row
+    )
+
+    # How to process the images. TSV format with 2 fields, using custom doc class.
+    images = TsvDocs(
+        dlc["images"],
+        doc_cls=PtImageIrImage,
+        namespace=f"{NAME}/images",
+        lang="pt",
+        count_hint=42333,  # NOTE(review): the integration test asserts 42920 images — confirm which is current
+        skip_first_line=True,  # Skip header row
+    )
+
+    # How to process the queries. Using the custom class that can skip header.
+    queries = TsvQueriesWithHeader(
+        dlc["queries"], namespace=NAME, lang="pt", skip_first_line=True
+    )
+
+    # Qrels: The qrels file is in the TREC format, so we'll use TrecQrels to process them
+    qrels = TrecQrels(dlc["qrels"], QREL_DEFS)
+
+    # Package the article docs, queries, qrels, and documentation. NOTE(review): qrels judge image ids, not article ids — confirm intended
+    dataset = Dataset(articles, queries, qrels, documentation("_"))
+
+    # Also create a dataset just for images. NOTE(review): confirm the docs YAML defines the "images" key looked up here
+    images_dataset = Dataset(images, queries, qrels, documentation("images"))
+
+    # Register the dataset in ir_datasets
+    ir_datasets.registry.register(NAME, dataset)
+    ir_datasets.registry.register(f"{NAME}/images", images_dataset)
+
+    return dataset, images_dataset
+
+
+dataset, images_dataset = _init()
diff --git a/ir_datasets/docs/pt-image-ir-dataset.yaml b/ir_datasets/docs/pt-image-ir-dataset.yaml
new file mode 100644
index 00000000..8e5acab4
--- /dev/null
+++ b/ir_datasets/docs/pt-image-ir-dataset.yaml
@@ -0,0 +1,35 @@
+_: # top-level entry; matches the documentation("_") key used in pt_image_ir_dataset.py
+  pretty_name: 'PT Image IR Dataset' # a more human-readable way to present this dataset than the dataset-id
+  desc: '
+<p>
+A Dataset for Image Information Retrieval in European Portuguese. The data is sourced from the
+<a href="https://www.presidencia.pt/">Portuguese Presidency</a> website. It contains 4,678 articles, 42,333 images, and 80 queries related to the Portuguese Presidency. Over 5,000 images were annotated by three annotators.
+</p>
+<p>
+The dataset includes:
+</p>
+<ul>
+<li><strong>Articles:</strong> 4,678 articles with URL, title, content, date, and associated images</li>
+<li><strong>Images:</strong> 42,333 images with URLs</li>
+<li><strong>Queries:</strong> 80 queries in Portuguese created by the dataset authors</li>
+<li><strong>Relevance judgments:</strong> Over 5,000 image-query relevance annotations (binary: relevant/not relevant)</li>
+</ul>
+<p>
+The dataset was annotated manually by three annotators following specific annotation rules. The relevance judgments are in TREC format, where each line contains a query ID, a zero, an image ID, and a relevance score (0 or 1).
+</p>
+<ul>
+<li><a href="https://github.com/LIAAD/pt-image-ir-dataset">Dataset Repository</a></li>
+</ul>
+'
+
+
+images:
+  pretty_name: 'PT Image IR Dataset (Images only)'
+  desc: '
+<p>
+Image collection from the PT Image IR dataset. Contains 42,333 images from the Portuguese Presidency website.
+</p>
+<p>
+This variant provides access to only the image documents from the full PT Image IR dataset, useful for image-focused retrieval tasks.
+</p>
+'
\ No newline at end of file
diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json
index c65efff3..bcc632b3 100644
--- a/ir_datasets/etc/downloads.json
+++ b/ir_datasets/etc/downloads.json
@@ -6490,5 +6490,27 @@
       "expected_md5": "49589ab65d1eaf78dbbadfc5ae56ef72",
       "cache_path": "qrels.txt"
     }
+  },
+  "pt-image-ir-dataset": {
+    "articles": {
+      "url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/articles.tsv",
+      "expected_md5": "ebbce9e470f683918d526b44849ad97c",
+      "cache_path": "articles.tsv"
+    },
+    "images": {
+      "url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/images.tsv",
+      "expected_md5": "eaefd26a5b2ba1e18c48715d3363d8a1",
+      "cache_path": "images.tsv"
+    },
+    "queries": {
+      "url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/queries.tsv",
+      "expected_md5": "2e094149f0ba84e2eb8d5dedc574c3e2",
+      "cache_path": "queries.tsv"
+    },
+    "qrels": {
+      "url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/qrels.txt",
+      "expected_md5": "53187432192e9989b913e5c3259322ca",
+      "cache_path": "qrels.txt"
+    }
   }
   }
diff --git a/test/integration/pt_image_ir_dataset.py b/test/integration/pt_image_ir_dataset.py
new file mode 100644
index 00000000..f5ba451f
--- /dev/null
+++ b/test/integration/pt_image_ir_dataset.py
@@ -0,0 +1,103 @@
+import unittest
+import ir_datasets
+
+from ir_datasets.formats import TrecQrel
+from ir_datasets.formats import GenericQuery
+
+from test.integration.base import DatasetIntegrationTest
+
+from ir_datasets.datasets.pt_image_ir_dataset import PtImageIrImage
+
+
+class TestPtImageIr(DatasetIntegrationTest):
+    def test_articles(self):
+        # Test that the dataset 'pt-image-ir-dataset' has 4743 articles (excluding header)
+        # Only the count and the first record's fields are checked. NOTE(review): docs/count_hint say 4,678 — confirm
+        docs = list(ir_datasets.load("pt-image-ir-dataset").docs_iter())
+        self.assertEqual(len(docs), 4743)
+
+        # Test first document structure
+        first_doc = docs[0]
+        self.assertEqual(first_doc.doc_id, "art001")
+        self.assertEqual(first_doc.date, "2023-12-01")
+        self.assertTrue(first_doc.title.startswith("Comemorações"))
+        self.assertTrue(first_doc.text.startswith("O Presidente"))
+        self.assertTrue("img00001" in first_doc.images)
+
+    def test_images(self):
+        # Test that the dataset 'pt-image-ir-dataset/images' has 42920 images (excluding header)
+        # Testing start (index 0), middle (index 21459), and end (index 42919) entries. NOTE(review): docs/count_hint say 42,333 — confirm
+        self._test_docs(
+            "pt-image-ir-dataset/images",
+            count=42920,
+            items={
+                0: PtImageIrImage(
+                    doc_id="img00001",
+                    text="https://www.presidencia.pt/media/bspfpsfp/231201-prmrs-mfl-0461-4542.jpg",
+                ),
+                21459: PtImageIrImage(
+                    doc_id="img21460",
+                    text="https://www.presidencia.pt/media/c5wbrtqc/191219-prmrs-ro-0017-8746.jpg",
+                ),
+                42919: PtImageIrImage(
+                    doc_id="img42920",
+                    text="https://www.presidencia.pt/media/dw1kvy3f/170602-prmrs-ro-0002-1624.jpg",
+                ),
+            },
+        )
+
+    def test_queries(self):
+        # Test that the dataset 'pt-image-ir-dataset' has 80 queries (excluding header)
+        # Testing start (index 0), middle (index 39), and end (index 79) entries
+        self._test_queries(
+            "pt-image-ir-dataset",
+            count=80,
+            items={
+                0: GenericQuery("q01", "Emoções de tristeza em rostos"),
+                39: GenericQuery("q40", "Brexit"),
+                79: GenericQuery("q80", "Algarve"),
+            },
+        )
+
+    def test_qrels(self):
+        # Test that the dataset 'pt-image-ir-dataset' has 5201 qrels
+        # Testing start (index 0), middle (index 2600), and end (index 5200) entries
+        self._test_qrels(
+            "pt-image-ir-dataset",
+            count=5201,
+            items={
+                0: TrecQrel("q01", "img40494", 0, "0"),
+                2600: TrecQrel("q40", "img22242", 0, "0"),
+                5200: TrecQrel("q80", "img24820", 1, "0"),
+            },
+        )
+
+    def test_images_qrels(self):
+        # Test qrels for the images-only dataset variant
+        # Should have the same qrels as the main dataset
+        self._test_qrels(
+            "pt-image-ir-dataset/images",
+            count=5201,
+            items={
+                0: TrecQrel("q01", "img40494", 0, "0"),
+                2600: TrecQrel("q40", "img22242", 0, "0"),
+                5200: TrecQrel("q80", "img24820", 1, "0"),
+            },
+        )
+
+    def test_images_queries(self):
+        # Test queries for the images-only dataset variant
+        # Should have the same queries as the main dataset
+        self._test_queries(
+            "pt-image-ir-dataset/images",
+            count=80,
+            items={
+                0: GenericQuery("q01", "Emoções de tristeza em rostos"),
+                39: GenericQuery("q40", "Brexit"),
+                79: GenericQuery("q80", "Algarve"),
+            },
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 91de96e1150ab2d19ac6ae591126d14c6f5f69ec Mon Sep 17 00:00:00 2001
From: Rodrigo Duarte <52805238+RodrigDuarte@users.noreply.github.com>
Date: Sun, 21 Sep 2025 18:48:54 +0100
Subject: [PATCH 2/3] Removed extra category

---
 ir_datasets/docs/pt-image-ir-dataset.yaml | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/ir_datasets/docs/pt-image-ir-dataset.yaml b/ir_datasets/docs/pt-image-ir-dataset.yaml
index 8e5acab4..95bf49e3 100644
--- a/ir_datasets/docs/pt-image-ir-dataset.yaml
+++ b/ir_datasets/docs/pt-image-ir-dataset.yaml
@@ -21,15 +21,3 @@ The dataset was annotated manually by three annotators following specific annota
 <li><a href="https://github.com/LIAAD/pt-image-ir-dataset">Dataset Repository</a></li>
 </ul>
 '
-
-
-images:
-  pretty_name: 'PT Image IR Dataset (Images only)'
-  desc: '
-<p>
-Image collection from the PT Image IR dataset. Contains 42,333 images from the Portuguese Presidency website.
-</p>
-<p>
-This variant provides access to only the image documents from the full PT Image IR dataset, useful for image-focused retrieval tasks.
-</p>
-'
\ No newline at end of file

From 68f323c4a3bcdf2231e20994e3229d8f0bfc765b Mon Sep 17 00:00:00 2001
From: Rodrigo Duarte <52805238+RodrigDuarte@users.noreply.github.com>
Date: Sun, 21 Sep 2025 18:48:54 +0100
Subject: [PATCH 3/3] Brought back original formatting

---
 ir_datasets/datasets/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py
index 92c314de..06c211ae 100644
--- a/ir_datasets/datasets/__init__.py
+++ b/ir_datasets/datasets/__init__.py
@@ -53,8 +53,9 @@
 from . import wikiclir
 from . import wikir
 from . import trec_fair
-from . import trec_cast  # must be after wapo,car,msmarco_passage
+from . import trec_cast # must be after wapo,car,msmarco_passage
 from . import hc4
-from . import neuclir  # must be after hc4
+from . import neuclir # must be after hc4
 from . import sara
 from . import trec_tot_2025
+