From 5c36a11cd09da365b558e3249a0e4f94ab885ff6 Mon Sep 17 00:00:00 2001
From: Eugene Yang
Date: Tue, 28 Dec 2021 17:39:40 -0500
Subject: [PATCH 1/3] add page for Patapsco
---
generate.py | 4 +++
templates/patapsco.html | 59 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 63 insertions(+)
create mode 100644 templates/patapsco.html
diff --git a/generate.py b/generate.py
index d91d9e0b4..86cf61707 100644
--- a/generate.py
+++ b/generate.py
@@ -396,6 +396,7 @@ def generate_index(out_dir, version, top_level_map):
ir_datasets SIGIR resource paper
Using ir_datasets with…
PyTerrier ·
+Patapsco ·
ir-measures ·
trec_eval ·
Experimaestro
@@ -615,6 +616,9 @@ def hlb(c):
template = Template(filename=os.path.join("templates", "pyterrier.html"))
with page_template('pyterrier.html', out_dir, version, title='PyTerrier & ir_datasets', include_irds_title=False) as out:
out.write(template.render(hl=hl))
+ template = Template(filename=os.path.join("templates", "patapsco.html"))
+ with page_template('patapsco.html', out_dir, version, title='Patapsco & ir_datasets', include_irds_title=False) as out:
+ out.write(template.render(hl=hl))
template = Template(filename=os.path.join("templates", "ir-measures.html"))
with page_template('ir-measures.html', out_dir, version, title='ir_measures & ir_datasets', include_irds_title=False) as out:
out.write(template.render(hl=hl, hlb=hlb))
diff --git a/templates/patapsco.html b/templates/patapsco.html
new file mode 100644
index 000000000..aaaaf2442
--- /dev/null
+++ b/templates/patapsco.html
@@ -0,0 +1,59 @@
+
+Patapsco is a framework for running cross-language
+information retrieval (CLIR) experiments developed by the Human Language
+Technology Center of Excellence (HLTCOE) at Johns Hopkins University.
+
+
+
+To get started with Patapsco, see this guide.
+
+
+Basic Usage
+
+
+Patapsco specify the source of the collection via config files or config dictionary in Python.
+Please see this
+example config file for reference.
+
+
+
+For both documents, topics and scores sections, use irds
+as the format in the input to tell Pataspco to use ir_datasets and
+specify the dataset name at path. The lang value has to match the language
+information provided by ir_datasets.
+
+
+${hl('''
+documents:
+ input:
+ format: irds
+ lang: zho
+ path: clirmatrix/zh/bi139-base/en/dev
+ process:
+ inherit: text
+ output: true
+
+topics:
+ input:
+ format: irds
+ lang: eng
+ source: original
+ encoding: utf8
+ path: clirmatrix/zh/bi139-base/en/dev
+
+score:
+ input:
+ format: irds
+ path: clirmatrix/zh/bi139-base/en/dev
+''')}
+
+
+This YAML config file can also be specified as a Python dictionary. Please refer to the
+documentation of Pataspco for further information.
+
+
+Further Information
+
+
From ff0785f3e44b5cf250f524cda5b52cb723dd2d06 Mon Sep 17 00:00:00 2001
From: Eugene Yang
Date: Tue, 28 Dec 2021 17:41:37 -0500
Subject: [PATCH 2/3] fix typo
---
templates/patapsco.html | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/templates/patapsco.html b/templates/patapsco.html
index aaaaf2442..75deb7c4e 100644
--- a/templates/patapsco.html
+++ b/templates/patapsco.html
@@ -11,15 +11,15 @@
Basic Usage
-Patapsco specify the source of the collection via config files or config dictionary in Python.
+Patapsco specifies the source of the collection via config files or config dictionaries in Python.
Please see this
example config file for reference.
For both documents, topics and scores sections, use irds
-as the format in the input to tell Pataspco to use ir_datasets and
-specify the dataset name at path. The lang value has to match the language
+as the format in the input subsection to tell Pataspco to use ir_datasets
+and specify the dataset name at path. The lang value has to match the language
information provided by ir_datasets.
From 4aa98cbbe7500e13a73a90f1f6cea54fa87467f4 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney
Date: Wed, 29 Dec 2021 10:28:38 +0000
Subject: [PATCH 3/3] Update patapsco.html
---
templates/patapsco.html | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/templates/patapsco.html b/templates/patapsco.html
index 75deb7c4e..42d2b1a72 100644
--- a/templates/patapsco.html
+++ b/templates/patapsco.html
@@ -20,7 +20,9 @@ Basic Usage
For both documents, topics and scores sections, use irds
as the format in the input subsection to tell Pataspco to use ir_datasets
and specify the dataset name at path. The lang value has to match the language
-information provided by ir_datasets.
+information provided by ir_datasets through dataset.docs_lang() and
+dataset.queries_lang(). Note that Patapsco uses three-letter ISO 639-3 language codes,
+whereas ir_datasets provides two-letter ISO 639-1 language codes.
${hl('''
@@ -52,6 +54,19 @@ Basic Usage
documentation of Pataspco for further information.
+
+| Patapsco's... | Corresponds to... | Notes |
+| documents | docs | |
+| documents.input.path | dataset's ID | |
+| documents.input.lang | dataset.docs_lang() | Need to convert from ISO 639-1 to ISO 639-3 |
+| documents.process.inherit | the doc's field representing the text to use | |
+| topics | queries | |
+| topics.input.path | dataset's ID | |
+| topics.input.lang | dataset.queries_lang() | Need to convert from ISO 639-1 to ISO 639-3 |
+| score | qrels | |
+| score.input.path | dataset's ID | |
+
+
Further Information