From 8073b8171b50157603e3bb527f561b9e29c848e7 Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Fri, 25 Jul 2025 13:36:46 +0100 Subject: [PATCH 1/3] move car and warc to be optional dependencies --- .github/workflows/test.yml | 2 +- ir_datasets/lazy_libs.py | 15 ++++++++++++--- pyproject.toml | 14 ++++++++++++++ requirements.txt | 3 --- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 27f715a2..ec68ae3e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,7 +25,7 @@ jobs: - name: Install Dependencies run: | pip install --upgrade -r requirements.txt -r requirements-test.txt - pip install -e . + pip install -e '.[all]' - name: Unit Test if: matrix.os == 'ubuntu-latest' || matrix.os == 'macOs-latest' diff --git a/ir_datasets/lazy_libs.py b/ir_datasets/lazy_libs.py index 196940e9..3a2b636f 100644 --- a/ir_datasets/lazy_libs.py +++ b/ir_datasets/lazy_libs.py @@ -53,19 +53,28 @@ def json(): def trec_car(): if 'trec_car' not in _cache: - import trec_car.read_data + try: + import trec_car.read_data + except ImportError as ie: + raise ImportError("This dataset requires trec-car-tools. Run 'pip install ir_datasets[car]' to install dependencies for this dataset") from ie _cache['trec_car'] = trec_car return _cache['trec_car'] def warc(): if 'warc' not in _cache: - import warc + try: + import warc + except ImportError as ie: + raise ImportError("This dataset requires warc. Run 'pip install ir_datasets[warc]' to install dependencies for this dataset") from ie _cache['warc'] = warc return _cache['warc'] def warc_clueweb09(): if 'warc_clueweb09' not in _cache: - import warc3_wet_clueweb09 + try: + import warc3_wet_clueweb09 + except ImportError as ie: + raise ImportError("This dataset requires warc. Run 'pip install ir_datasets[warc]' to install dependencies for this dataset") from ie _cache['warc_clueweb09'] = warc3_wet_clueweb09 return _cache['warc_clueweb09'] diff --git a/pyproject.toml b/pyproject.toml index 21b0f5c2..9fc96858 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,20 @@ exclude = ["test"] version = {attr = "ir_datasets.__version__"} dependencies = {file = ["requirements.txt"]} +[project.optional-dependencies] +car = [ + "trec-car-tools>=2.5.4", +] +warc = [ + "warc3-wet>=0.2.3", + "warc3-wet-clueweb09>=0.2.5" +] +all = [ + "trec-car-tools>=2.5.4", + "warc3-wet>=0.2.3", + "warc3-wet-clueweb09>=0.2.5" +] + [project.urls] "Homepage" = "https://ir-datasets.com/" "Documentation" = "https://project.readthedocs.io/" diff --git a/requirements.txt b/requirements.txt index c566ad79..3fbce679 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,10 +5,7 @@ numpy>=1.18.1 pyyaml>=5.3.1 requests>=2.22.0 tqdm>=4.38.0 -trec-car-tools>=2.5.4 lz4>=3.1.10 -warc3-wet>=0.2.3 -warc3-wet-clueweb09>=0.2.5 zlib-state>=0.1.3 ijson>=3.1.3 unlzw3>=0.2.1 From 02053766223e5df4b1651b35d5f9e277f6c94c9a Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 16 Oct 2025 08:52:32 +0100 Subject: [PATCH 2/3] moving the rest of the uncommon dependencies to optional --- ir_datasets/lazy_libs.py | 31 +++++++++++++++++++++++-------- pyproject.toml | 30 +++++++++++++++++++++++++++++- requirements.txt | 6 ------ 3 files changed, 52 insertions(+), 15 deletions(-) diff --git a/ir_datasets/lazy_libs.py b/ir_datasets/lazy_libs.py index 3a2b636f..19425ff7 100644 --- a/ir_datasets/lazy_libs.py +++ b/ir_datasets/lazy_libs.py @@ -25,14 +25,20 @@ def requests(): def bs4(): if 'bs4' not in _cache: - import bs4 + try: + import bs4 + except ImportError as ie: + raise ImportError("This dataset requires beautifulsoup4. Run 'pip install ir_datasets[beautifulsoup4]' to install dependencies for this dataset") from ie _cache['bs4'] = bs4 return _cache['bs4'] def inscriptis(): if 'inscriptis' not in _cache: - import inscriptis + try: + import inscriptis + except ImportError as ie: + raise ImportError("This dataset requires inscriptis. Run 'pip install ir_datasets[inscriptis]' to install dependencies for this dataset") from ie _cache['inscriptis'] = inscriptis return _cache['inscriptis'] @@ -92,7 +98,10 @@ def lz4_frame(): def zlib_state(): if 'zlib_state' not in _cache: - import zlib_state + try: + import zlib_state + except ImportError as ie: + raise ImportError("This dataset requires zlib-state. Run 'pip install ir_datasets[zlib-state]' to install dependencies for this dataset") from ie _cache['zlib_state'] = zlib_state return _cache['zlib_state'] @@ -110,7 +119,10 @@ def lxml_html(): def ijson(): if 'ijson' not in _cache: - import ijson + try: + import ijson + except ImportError as ie: + raise ImportError("This dataset requires ijson. Run 'pip install ir_datasets[ijson]' to install dependencies for this dataset") from ie _cache['ijson'] = ijson return _cache['ijson'] @@ -119,13 +131,16 @@ def pyautocorpus(): try: import pyautocorpus except ImportError as ie: - raise ImportError("This dataset requires pyautocorpus. Run 'pip install pyautocorpus'") from ie + raise ImportError("This dataset requires pyautocorpus. Run 'pip install ir_datasets[pyautocorpus]' to install dependencies for this dataset") from ie _cache['pyautocorpus'] = pyautocorpus return _cache['pyautocorpus'] def unlzw3(): if 'unlzw3' not in _cache: - import unlzw3 + try: + import unlzw3 + except ImportError as ex: + raise ImportError("This dataset requires unlzw3. Run 'pip install ir_datasets[unlzw3]' to install dependencies for this dataset") from ex _cache['unlzw3'] = unlzw3 return _cache['unlzw3'] @@ -133,7 +148,7 @@ def pyarrow_parquet(): if 'pyarrow_parquet' not in _cache: try: import pyarrow.parquet - except ImportError as ex: - raise ImportError("This dataset requires pyarrow. Run 'pip install pyarrow>=16.1.0'") from ex + except ImportError as ie: + raise ImportError("This dataset requires pyarrow. Run 'pip install ir_datasets[pyarrow]' to install dependencies for this dataset") from ie _cache['pyarrow_parquet'] = pyarrow.parquet return _cache['pyarrow_parquet'] diff --git a/pyproject.toml b/pyproject.toml index 9fc96858..c3abc9b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,10 +40,38 @@ warc = [ "warc3-wet>=0.2.3", "warc3-wet-clueweb09>=0.2.5" ] +pyautocorpus = [ + "pyautocorpus>=0.1.12" +] +pyarrow = [ + "pyarrow>=16.1.0" +] +unlzw3 = [ + "unlzw3>=0.2.1" +] +beautifulsoup4 = [ + "beautifulsoup4>=4.4.1" +] +inscriptis = [ + "inscriptis>=2.2.0" +] +zlib-state = [ + "zlib-state>=0.1.3" +] +ijson = [ + "ijson>=3.1.3" +] all = [ "trec-car-tools>=2.5.4", "warc3-wet>=0.2.3", - "warc3-wet-clueweb09>=0.2.5" + "warc3-wet-clueweb09>=0.2.5", + "pyarrow>=16.1.0", + "pyautocorpus>=0.1.12", + "unlzw3>=0.2.1", + "beautifulsoup4>=4.4.1", + "inscriptis>=2.2.0", + "zlib-state>=0.1.3", + "ijson>=3.1.3" ] [project.urls] diff --git a/requirements.txt b/requirements.txt index 3fbce679..69f8ae7b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,6 @@ -beautifulsoup4>=4.4.1 -inscriptis>=2.2.0 lxml>=4.5.2 numpy>=1.18.1 pyyaml>=5.3.1 requests>=2.22.0 tqdm>=4.38.0 lz4>=3.1.10 -zlib-state>=0.1.3 -ijson>=3.1.3 -unlzw3>=0.2.1 -pyarrow>=16.1.0 From 261435517c4ef416aedf49229873349083ee29c3 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 16 Oct 2025 09:14:04 +0100 Subject: [PATCH 3/3] cap lxml (for now, at least) since it breaks in 6.0.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 69f8ae7b..4d5f713d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -lxml>=4.5.2 +lxml>=4.5.2,<6.0.0 numpy>=1.18.1 pyyaml>=5.3.1 requests>=2.22.0