diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 27f715a2..ec68ae3e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
     - name: Install Dependencies
       run: |
         pip install --upgrade -r requirements.txt -r requirements-test.txt
-        pip install -e .
+        pip install -e '.[all]'
 
     - name: Unit Test
       if: matrix.os == 'ubuntu-latest' || matrix.os == 'macOs-latest'
diff --git a/ir_datasets/lazy_libs.py b/ir_datasets/lazy_libs.py
index 196940e9..19425ff7 100644
--- a/ir_datasets/lazy_libs.py
+++ b/ir_datasets/lazy_libs.py
@@ -25,14 +25,20 @@ def requests():
 
 def bs4():
     if 'bs4' not in _cache:
-        import bs4
+        try:
+            import bs4
+        except ImportError as ie:
+            raise ImportError("This dataset requires beautifulsoup4. Run 'pip install ir_datasets[beautifulsoup4]' to install dependencies for this dataset") from ie
         _cache['bs4'] = bs4
     return _cache['bs4']
 
 
 def inscriptis():
     if 'inscriptis' not in _cache:
-        import inscriptis
+        try:
+            import inscriptis
+        except ImportError as ie:
+            raise ImportError("This dataset requires inscriptis. Run 'pip install ir_datasets[inscriptis]' to install dependencies for this dataset") from ie
         _cache['inscriptis'] = inscriptis
     return _cache['inscriptis']
 
@@ -53,19 +59,28 @@ def json():
 def trec_car():
     if 'trec_car' not in _cache:
-        import trec_car.read_data
+        try:
+            import trec_car.read_data
+        except ImportError as ie:
+            raise ImportError("This dataset requires trec-car-tools. Run 'pip install ir_datasets[car]' to install dependencies for this dataset") from ie
         _cache['trec_car'] = trec_car
     return _cache['trec_car']
 
 
 def warc():
     if 'warc' not in _cache:
-        import warc
+        try:
+            import warc
+        except ImportError as ie:
+            raise ImportError("This dataset requires warc. Run 'pip install ir_datasets[warc]' to install dependencies for this dataset") from ie
         _cache['warc'] = warc
     return _cache['warc']
 
 
 def warc_clueweb09():
     if 'warc_clueweb09' not in _cache:
-        import warc3_wet_clueweb09
+        try:
+            import warc3_wet_clueweb09
+        except ImportError as ie:
+            raise ImportError("This dataset requires warc. Run 'pip install ir_datasets[warc]' to install dependencies for this dataset") from ie
         _cache['warc_clueweb09'] = warc3_wet_clueweb09
     return _cache['warc_clueweb09']
@@ -83,7 +98,10 @@ def lz4_frame():
 
 def zlib_state():
     if 'zlib_state' not in _cache:
-        import zlib_state
+        try:
+            import zlib_state
+        except ImportError as ie:
+            raise ImportError("This dataset requires zlib-state. Run 'pip install ir_datasets[zlib-state]' to install dependencies for this dataset") from ie
         _cache['zlib_state'] = zlib_state
     return _cache['zlib_state']
 
@@ -101,7 +119,10 @@ def lxml_html():
 
 def ijson():
     if 'ijson' not in _cache:
-        import ijson
+        try:
+            import ijson
+        except ImportError as ie:
+            raise ImportError("This dataset requires ijson. Run 'pip install ir_datasets[ijson]' to install dependencies for this dataset") from ie
         _cache['ijson'] = ijson
     return _cache['ijson']
 
@@ -110,13 +131,16 @@ def pyautocorpus():
         try:
             import pyautocorpus
         except ImportError as ie:
-            raise ImportError("This dataset requires pyautocorpus. Run 'pip install pyautocorpus'") from ie
+            raise ImportError("This dataset requires pyautocorpus. Run 'pip install ir_datasets[pyautocorpus]' to install dependencies for this dataset") from ie
         _cache['pyautocorpus'] = pyautocorpus
     return _cache['pyautocorpus']
 
 
 def unlzw3():
     if 'unlzw3' not in _cache:
-        import unlzw3
+        try:
+            import unlzw3
+        except ImportError as ie:
+            raise ImportError("This dataset requires unlzw3. Run 'pip install ir_datasets[unlzw3]' to install dependencies for this dataset") from ie
         _cache['unlzw3'] = unlzw3
     return _cache['unlzw3']
@@ -124,7 +148,7 @@ def pyarrow_parquet():
     if 'pyarrow_parquet' not in _cache:
         try:
             import pyarrow.parquet
-        except ImportError as ex:
-            raise ImportError("This dataset requires pyarrow. Run 'pip install pyarrow>=16.1.0'") from ex
+        except ImportError as ie:
+            raise ImportError("This dataset requires pyarrow. Run 'pip install ir_datasets[pyarrow]' to install dependencies for this dataset") from ie
         _cache['pyarrow_parquet'] = pyarrow.parquet
     return _cache['pyarrow_parquet']
diff --git a/pyproject.toml b/pyproject.toml
index 21b0f5c2..c3abc9b9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,48 @@ exclude = ["test"]
 version = {attr = "ir_datasets.__version__"}
 dependencies = {file = ["requirements.txt"]}
 
+[project.optional-dependencies]
+car = [
+    "trec-car-tools>=2.5.4",
+]
+warc = [
+    "warc3-wet>=0.2.3",
+    "warc3-wet-clueweb09>=0.2.5"
+]
+pyautocorpus = [
+    "pyautocorpus>=0.1.12"
+]
+pyarrow = [
+    "pyarrow>=16.1.0"
+]
+unlzw3 = [
+    "unlzw3>=0.2.1"
+]
+beautifulsoup4 = [
+    "beautifulsoup4>=4.4.1"
+]
+inscriptis = [
+    "inscriptis>=2.2.0"
+]
+zlib-state = [
+    "zlib-state>=0.1.3"
+]
+ijson = [
+    "ijson>=3.1.3"
+]
+all = [
+    "trec-car-tools>=2.5.4",
+    "warc3-wet>=0.2.3",
+    "warc3-wet-clueweb09>=0.2.5",
+    "pyarrow>=16.1.0",
+    "pyautocorpus>=0.1.12",
+    "unlzw3>=0.2.1",
+    "beautifulsoup4>=4.4.1",
+    "inscriptis>=2.2.0",
+    "zlib-state>=0.1.3",
+    "ijson>=3.1.3"
+]
+
 [project.urls]
 "Homepage" = "https://ir-datasets.com/"
 "Documentation" = "https://project.readthedocs.io/"
diff --git a/requirements.txt b/requirements.txt
index c566ad79..4d5f713d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,6 @@
-beautifulsoup4>=4.4.1
-inscriptis>=2.2.0
-lxml>=4.5.2
+lxml>=4.5.2,<6.0.0
 numpy>=1.18.1
 pyyaml>=5.3.1
 requests>=2.22.0
 tqdm>=4.38.0
-trec-car-tools>=2.5.4
 lz4>=3.1.10
-warc3-wet>=0.2.3
-warc3-wet-clueweb09>=0.2.5
-zlib-state>=0.1.3
-ijson>=3.1.3
-unlzw3>=0.2.1
-pyarrow>=16.1.0
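
Note (not part of the diff): every loader in lazy_libs.py now follows the same shape: import on first use, cache the module in _cache, and translate a missing optional dependency into an ImportError that names the matching extra from [project.optional-dependencies]. Below is a minimal sketch of that shared shape written once as a factory. The PR itself spells each loader out by hand; _lazy is a hypothetical helper shown only to illustrate the pattern, not code from the repository.

    import importlib

    _cache = {}

    def _lazy(module_name, extra):
        # Return a zero-argument loader: import module_name on first call,
        # cache the module, and map ImportError to the pip extra that
        # provides the dependency. (Illustrative sketch, not repo code.)
        def loader():
            if module_name not in _cache:
                try:
                    _cache[module_name] = importlib.import_module(module_name)
                except ImportError as ie:
                    raise ImportError(
                        f"This dataset requires {module_name}. Run 'pip install "
                        f"ir_datasets[{extra}]' to install dependencies for this dataset"
                    ) from ie
            return _cache[module_name]
        return loader

    # Example bindings mirroring two of the loaders in the diff:
    bs4 = _lazy('bs4', 'beautifulsoup4')
    ijson = _lazy('ijson', 'ijson')

With the extras in place, a plain `pip install ir_datasets` stays slim, something like `pip install 'ir_datasets[warc]'` pulls in only that dataset family's dependencies, and the CI change above installs everything via `pip install -e '.[all]'` (quoted so shells such as zsh do not expand the brackets).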