Skip to content

Commit aebd707

Browse files
Merge pull request #296 from cmacdonald/optional_dependencies
move car and warc to be optional dependencies
2 parents: 66e95b4 + 1cffcd4 — commit: aebd707

File tree

4 files changed: 79 additions (+79) and 22 deletions (-22)

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ jobs:
2525
- name: Install Dependencies
2626
run: |
2727
pip install --upgrade -r requirements.txt -r requirements-test.txt
28-
pip install -e .
28+
pip install -e '.[all]'
2929
3030
- name: Unit Test
3131
if: matrix.os == 'ubuntu-latest' || matrix.os == 'macOs-latest'

ir_datasets/lazy_libs.py

Lines changed: 35 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -25,14 +25,20 @@ def requests():
2525

2626
def bs4():
2727
if 'bs4' not in _cache:
28-
import bs4
28+
try:
29+
import bs4
30+
except ImportError as ie:
31+
raise ImportError("This dataset requires beautifulsoup4. Run 'pip install ir_datasets[beautifulsoup4]' to install dependencies for this dataset") from ie
2932
_cache['bs4'] = bs4
3033
return _cache['bs4']
3134

3235

3336
def inscriptis():
3437
if 'inscriptis' not in _cache:
35-
import inscriptis
38+
try:
39+
import inscriptis
40+
except ImportError as ie:
41+
raise ImportError("This dataset requires inscriptis. Run 'pip install ir_datasets[inscriptis]' to install dependencies for this dataset") from ie
3642
_cache['inscriptis'] = inscriptis
3743
return _cache['inscriptis']
3844

@@ -53,19 +59,28 @@ def json():
5359

5460
def trec_car():
5561
if 'trec_car' not in _cache:
56-
import trec_car.read_data
62+
try:
63+
import trec_car.read_data
64+
except ImportError as ie:
65+
raise ImportError("This dataset requires trec-car-tools. Run 'pip install ir_datasets[car]' to install dependencies for this dataset") from ie
5766
_cache['trec_car'] = trec_car
5867
return _cache['trec_car']
5968

6069
def warc():
6170
if 'warc' not in _cache:
62-
import warc
71+
try:
72+
import warc
73+
except ImportError as ie:
74+
raise ImportError("This dataset requires warc. Run 'pip install ir_datasets[warc]' to install dependencies for this dataset") from ie
6375
_cache['warc'] = warc
6476
return _cache['warc']
6577

6678
def warc_clueweb09():
6779
if 'warc_clueweb09' not in _cache:
68-
import warc3_wet_clueweb09
80+
try:
81+
import warc3_wet_clueweb09
82+
except ImportError as ie:
83+
raise ImportError("This dataset requires warc. Run 'pip install ir_datasets[warc]' to install dependencies for this dataset") from ie
6984
_cache['warc_clueweb09'] = warc3_wet_clueweb09
7085
return _cache['warc_clueweb09']
7186

@@ -83,7 +98,10 @@ def lz4_frame():
8398

8499
def zlib_state():
85100
if 'zlib_state' not in _cache:
86-
import zlib_state
101+
try:
102+
import zlib_state
103+
except ImportError as ie:
104+
raise ImportError("This dataset requires zlib-state. Run 'pip install ir_datasets[zlib-state]' to install dependencies for this dataset") from ie
87105
_cache['zlib_state'] = zlib_state
88106
return _cache['zlib_state']
89107

@@ -101,7 +119,10 @@ def lxml_html():
101119

102120
def ijson():
103121
if 'ijson' not in _cache:
104-
import ijson
122+
try:
123+
import ijson
124+
except ImportError as ie:
125+
raise ImportError("This dataset requires ijson. Run 'pip install ir_datasets[ijson]' to install dependencies for this dataset") from ie
105126
_cache['ijson'] = ijson
106127
return _cache['ijson']
107128

@@ -110,21 +131,24 @@ def pyautocorpus():
110131
try:
111132
import pyautocorpus
112133
except ImportError as ie:
113-
raise ImportError("This dataset requires pyautocorpus. Run 'pip install pyautocorpus'") from ie
134+
raise ImportError("This dataset requires pyautocorpus. Run 'pip install ir_datasets[pyautocorpus]' to install dependencies for this dataset") from ie
114135
_cache['pyautocorpus'] = pyautocorpus
115136
return _cache['pyautocorpus']
116137

117138
def unlzw3():
118139
if 'unlzw3' not in _cache:
119-
import unlzw3
140+
try:
141+
import unlzw3
142+
except ImportError as ex:
143+
raise ImportError("This dataset requires unlzw3. Run 'pip install ir_datasets[unlzw3]' to install dependencies for this dataset") from ex
120144
_cache['unlzw3'] = unlzw3
121145
return _cache['unlzw3']
122146

123147
def pyarrow_parquet():
124148
if 'pyarrow_parquet' not in _cache:
125149
try:
126150
import pyarrow.parquet
127-
except ImportError as ex:
128-
raise ImportError("This dataset requires pyarrow. Run 'pip install pyarrow>=16.1.0'") from ex
151+
except ImportError as ie:
152+
raise ImportError("This dataset requires pyarrow. Run 'pip install ir_datasets[pyarrow]' to install dependencies for this dataset") from ie
129153
_cache['pyarrow_parquet'] = pyarrow.parquet
130154
return _cache['pyarrow_parquet']

pyproject.toml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,48 @@ exclude = ["test"]
3232
version = {attr = "ir_datasets.__version__"}
3333
dependencies = {file = ["requirements.txt"]}
3434

35+
[project.optional-dependencies]
36+
car = [
37+
"trec-car-tools>=2.5.4",
38+
]
39+
warc = [
40+
"warc3-wet>=0.2.3",
41+
"warc3-wet-clueweb09>=0.2.5"
42+
]
43+
pyautocorpus = [
44+
"pyautocorpus>=0.1.12"
45+
]
46+
pyarrow = [
47+
"pyarrow>=16.1.0"
48+
]
49+
unlzw3 = [
50+
"unlzw3>=0.2.1"
51+
]
52+
beautifulsoup4 = [
53+
"beautifulsoup4>=4.4.1"
54+
]
55+
inscriptis = [
56+
"inscriptis>=2.2.0"
57+
]
58+
zlib-state = [
59+
"zlib-state>=0.1.3"
60+
]
61+
ijson = [
62+
"ijson>=3.1.3"
63+
]
64+
all = [
65+
"trec-car-tools>=2.5.4",
66+
"warc3-wet>=0.2.3",
67+
"warc3-wet-clueweb09>=0.2.5",
68+
"pyarrow>=16.1.0",
69+
"pyautocorpus>=0.1.12",
70+
"unlzw3>=0.2.1",
71+
"beautifulsoup4>=4.4.1",
72+
"inscriptis>=2.2.0",
73+
"zlib-state>=0.1.3",
74+
"ijson>=3.1.3"
75+
]
76+
3577
[project.urls]
3678
"Homepage" = "https://ir-datasets.com/"
3779
"Documentation" = "https://project.readthedocs.io/"

requirements.txt

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,6 @@
1-
beautifulsoup4>=4.4.1
2-
inscriptis>=2.2.0
3-
lxml>=4.5.2
1+
lxml>=4.5.2,<6.0.0
42
numpy>=1.18.1
53
pyyaml>=5.3.1
64
requests>=2.22.0
75
tqdm>=4.38.0
8-
trec-car-tools>=2.5.4
96
lz4>=3.1.10
10-
warc3-wet>=0.2.3
11-
warc3-wet-clueweb09>=0.2.5
12-
zlib-state>=0.1.3
13-
ijson>=3.1.3
14-
unlzw3>=0.2.1
15-
pyarrow>=16.1.0

0 commit comments

Comments (0)