Skip to content

Commit 40cd9bf

Browse files
committed
python/ get_dataset_path() can download remote datasets.
pyproject.toml: platformdirs is a new dependency. tests/test_data/ALG_1_dat.xz: Deleted. Fetch from remote. src/moocore/_datasets.py: New file.
1 parent fa2566a commit 40cd9bf

9 files changed

Lines changed: 257 additions & 46 deletions

File tree

python/doc/source/reference/functions.io.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ Read data
1010

1111
read_datasets
1212
ReadDatasetsError
13+
get_dataset
1314
get_dataset_path
1415

1516
Transform data

python/doc/source/whatsnew/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ Version 0.1.5
1313
- Extended example :ref:`sphx_glr_auto_examples_plot_metrics.py`.
1414
- ``vorobT()`` and ``vorobDev()`` were renamed to :func:`~moocore.vorob_t` and
1515
:func:`~moocore.vorob_dev` to follow Python convention.
16-
16+
- :func:`~moocore.get_dataset_path` and :func:`~moocore.get_dataset` can download large datasets from a remote repository.
1717

1818
Version 0.1.4 (30/10/2024)
1919
--------------------------

python/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ classifiers = [
3636
dependencies = [
3737
"cffi>=1.15.1",
3838
"numpy>=1.23",
39+
"platformdirs",
3940
]
4041

4142
urls.Documentation = "https://multi-objective.github.io/moocore/python/"

python/src/moocore/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
epsilon_mult,
1010
filter_dominated,
1111
filter_dominated_within_sets,
12-
get_dataset,
13-
get_dataset_path,
1412
hypervolume,
1513
Hypervolume,
1614
RelativeHypervolume,
@@ -30,6 +28,11 @@
3028
total_whv_rect,
3129
)
3230

31+
from ._datasets import (
32+
get_dataset,
33+
get_dataset_path,
34+
)
35+
3336
from importlib.metadata import version as _metadata_version
3437

3538
__version__ = _metadata_version("moocore")

python/src/moocore/_datasets.py

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
from __future__ import annotations
2+
3+
import hashlib
4+
import numpy as np
5+
import os
6+
import shutil
7+
import warnings
8+
import time
9+
from pathlib import Path
10+
from tempfile import NamedTemporaryFile
11+
from urllib.error import URLError
12+
from urllib.request import urlretrieve
13+
14+
from importlib.resources import files
15+
from importlib.metadata import version as _metadata_version
16+
from platformdirs import user_cache_path
17+
from ._moocore import read_datasets
18+
19+
# Cache directories are versioned per moocore release so that datasets whose
# expected checksum changes between releases do not collide.
_moocore_version = _metadata_version("moocore")

# Base URL of the remote repository that hosts the large datasets which are
# not shipped inside the installed package.
_BASE_URL = (
    "https://github.com/multi-objective/testsuite/raw/refs/heads/main/data/"
)

# FIXME: This could be a namedtuple so that each dataset can have its own URL.
# Maps remote dataset filename -> expected SHA256 hex digest, used to validate
# downloads and detect stale/corrupted cache entries.
_DATASETS_CHECKSUMS = {
    "ALG_1_dat.xz": "a51165fe69b356c45e5bb052c747e7dcb97432975b5e7a632fc94f0b59620046",
}
29+
30+
31+
# FIXME: Python >=3.11 has hashlib.file_digest()
32+
def _file_checksum(path) -> str:
33+
"""Calculate the sha256 hash of the file at path."""
34+
sha256hash = hashlib.sha256()
35+
chunk_size = 8192
36+
with open(path, "rb") as f:
37+
while True:
38+
buffer = f.read(chunk_size)
39+
if not buffer:
40+
break
41+
sha256hash.update(buffer)
42+
return sha256hash.hexdigest()
43+
44+
45+
def _fetch_dataset(
    name: str,
    checksum: str,
    force: bool = False,
    n_retries: int = 3,
    delay: int = 1,
) -> Path:
    """Fetch a dataset from the remote repository if not already present in the local cache folder.

    If the file already exists locally and the SHA256 checksums match what is
    expected by this version of ``moocore``, the path to the local file is
    returned without re-downloading.

    Parameters
    ----------
    name :
        Name of the dataset.

    checksum :
        Expected checksum.

    force :
        If ``True``, always download the dataset. By default, the dataset is only downloaded
        if it doesn't exist locally or the checksum does not match.

    n_retries :
        Number of retries when download errors are encountered.

    delay :
        Number of seconds between retries.

    Returns
    -------
    file_path :
        Full path of the downloaded file.

    """
    url = _BASE_URL + name

    # Per-user cache directory, versioned by moocore release (see
    # _moocore_version) so that checksum changes across releases never clash.
    # ensure_exists=True creates the directory tree if needed.
    local_folder = user_cache_path(
        appname="moocore",
        appauthor=None,
        version=_moocore_version,
        opinion=True,
        ensure_exists=True,
    )

    file_path = local_folder / name
    if not force and file_path.exists():
        local_checksum = _file_checksum(file_path)
        if local_checksum == checksum:
            # Cache hit: valid local copy, no network access needed.
            return file_path
        else:
            warnings.warn(
                f"The SHA256 checksum of existing local file {file_path.name} "
                f"({local_checksum}) differs from expected ({checksum}): "
                f"re-downloading from {url} ."
            )
    # We create a temporary file dedicated to this particular download to avoid
    # conflicts with parallel downloads. If the download is successful, the
    # temporary file is atomically renamed to the final file path (with
    # `shutil.move`). We therefore pass `delete=False` to `NamedTemporaryFile`.
    # Otherwise, garbage collecting temp_file would raise an error when
    # attempting to delete a file that was already renamed. If the download
    # fails or the result does not match the expected SHA256 digest, the
    # temporary file is removed manually in the except block.
    temp_file = NamedTemporaryFile(
        prefix=name + ".part_", dir=local_folder, delete=False
    )
    # Note that Python 3.12's `delete_on_close=True` is ignored as we set
    # `delete=False` explicitly. So after this line the empty temporary file still
    # exists on disk to make sure that it's uniquely reserved for this specific call of
    # `_fetch_remote` and therefore it protects against any corruption by parallel
    # calls.
    temp_file.close()
    try:
        temp_file_path = Path(temp_file.name)
        # Retry loop: each failed attempt consumes one retry and sleeps
        # `delay` seconds before trying again.
        while True:
            try:
                urlretrieve(url, temp_file_path)
                break
            except (URLError, TimeoutError):
                if n_retries == 0:
                    # If no more retries are left, re-raise the caught exception.
                    raise
                warnings.warn(f"Retry downloading from url: {url}")
                n_retries -= 1
                time.sleep(delay)

        # Validate the download before it replaces any existing cached file.
        local_checksum = _file_checksum(temp_file_path)
        if local_checksum != checksum:
            message = (
                f"The SHA256 checksum of remote file {name} ({local_checksum}) "
                + f"differs from expected ({checksum}). "
                + "The remote file may not be valid for this version of moocore. "
            )
            if force:
                # With force=True a mismatched download is kept (warn only).
                warnings.warn(
                    message + "Overwriting local file because 'force=True'."
                )
            else:
                raise OSError(
                    message + "Use 'force=True' to download the file."
                )

    except (Exception, KeyboardInterrupt):
        # Clean up the partial/invalid temporary file on any failure,
        # including Ctrl-C, then propagate the error.
        os.unlink(temp_file.name)
        raise

    # The following renaming is atomic whenever temp_file_path and
    # file_path are on the same filesystem. This should be the case most of
    # the time, but we still use shutil.move instead of os.rename in case
    # they are not.
    shutil.move(temp_file_path, file_path)

    return file_path
161+
162+
163+
def get_dataset_path(
    filename: str, /, *, force: bool = False, n_retries: int = 3, delay: int = 1
) -> Path:
    """Return path to a dataset provided with ``moocore``.

    Small datasets are installed together with the ``moocore`` package.
    Large datasets are downloaded from a remote repository and stored in the
    local cache folder. If the file already exists locally (and the SHA256
    checksums match when provided), the path to the local file is returned
    without re-downloading.

    Parameters
    ----------
    filename :
        Name of the dataset.

    force :
        If ``True``, always download remote datasets, even if present or their
        checksum does not match what is expected for this version of ``moocore``.

    n_retries :
        Number of retries when download errors are encountered.

    delay :
        Number of seconds between retries when downloading.

    Returns
    -------
    Full path to the dataset.

    Raises
    ------
    ValueError
        If ``filename`` is neither installed with the package nor a known
        remote dataset.

    """
    # Datasets shipped inside the package take precedence over remote ones.
    local_path = files("moocore.data") / filename
    if local_path.exists():
        return local_path

    checksum = _DATASETS_CHECKSUMS.get(filename)
    if checksum is None:
        local_names = "\n".join(
            [f.name for f in files("moocore.data").iterdir() if f.is_file()]
        )
        remote_names = "\n".join(_DATASETS_CHECKSUMS.keys())
        # FIX: interpolate the requested filename in the error message
        # (previously a literal placeholder was printed instead).
        raise ValueError(
            f"Unknown dataset '{filename}', local datasets are:\n"
            f"{local_names}\n\nand remote datasets are:\n{remote_names}"
        )

    return _fetch_dataset(
        filename,
        checksum=checksum,
        force=force,
        n_retries=n_retries,
        delay=delay,
    )
216+
217+
218+
def get_dataset(filename: str, /, **kwargs) -> np.ndarray:
    """Return dataset provided by ``moocore`` as a NumPy array.

    Parameters
    ----------
    filename :
        Name of the dataset.

    kwargs :
        Additional arguments are passed to :func:`get_dataset_path`.

    Returns
    -------
    An array containing a representation of the data in the file.
    The first :math:`n-1` columns contain the numerical data for each of the objectives.
    The last column contains an identifier for which set the data is relevant to.

    See Also
    --------
    read_datasets : Function used to read the dataset.

    """
    # FIX: forward **kwargs (force, n_retries, delay) to get_dataset_path as
    # documented; previously they were accepted but silently ignored.
    return read_datasets(get_dataset_path(filename, **kwargs))

python/src/moocore/_moocore.py

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
import lzma
1515
import shutil
1616
import tempfile
17-
from importlib.resources import files
1817

1918
import numpy as np
2019

@@ -2089,44 +2088,6 @@ def whv_hype(
20892088
return hv
20902089

20912090

2092-
def get_dataset_path(filename: str, /) -> str:
2093-
"""Return path to dataset within the package.
2094-
2095-
Parameters
2096-
----------
2097-
filename :
2098-
Name of the dataset.
2099-
2100-
Returns
2101-
-------
2102-
Full path to the dataset.
2103-
2104-
"""
2105-
return files("moocore.data").joinpath(filename)
2106-
2107-
2108-
def get_dataset(filename: str, /) -> np.ndarray:
2109-
"""Return dataset within the package as a NumPy array .
2110-
2111-
Parameters
2112-
----------
2113-
filename :
2114-
Name of the dataset.
2115-
2116-
Returns
2117-
-------
2118-
An array containing a representation of the data in the file.
2119-
The first :math:`n-1` columns contain the numerical data for each of the objectives.
2120-
The last column contains an identifier for which set the data is relevant to.
2121-
2122-
See Also
2123-
--------
2124-
read_datasets : Function used to read the dataset.
2125-
2126-
"""
2127-
return read_datasets(get_dataset_path(filename))
2128-
2129-
21302091
def apply_within_sets(
21312092
x: ArrayLike, sets: ArrayLike, func: Callable[..., Any], **kwargs
21322093
) -> np.ndarray:

python/tests/conftest.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# ruff: noqa: D100, D101, D102, D103
22
import pytest
3-
import os
43
import moocore
54

65

@@ -9,9 +8,9 @@ def test_datapath(request):
98
"""Return the directory of the currently running test script."""
109

1110
def _(file_path: str):
12-
filename = moocore.get_dataset_path(file_path)
13-
if os.path.isfile(filename):
11+
filename = request.path.parent / "test_data" / file_path
12+
if filename.is_file():
1413
return filename
15-
return request.path.parent.joinpath("test_data/" + file_path)
14+
return moocore.get_dataset_path(file_path)
1615

1716
return _
-184 KB
Binary file not shown.

python/tests/test_moocore.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,12 @@ def test_eaf(test_datapath):
335335
)
336336

337337

338+
def test_get_dataset_path():
    """An unknown dataset name must raise ValueError."""
    # Idiom: assert the expected exception type directly in pytest.raises
    # instead of catching Exception and checking expt.type afterwards.
    with pytest.raises(ValueError):
        moocore.get_dataset_path("notavailable")
342+
343+
338344
# def test_eafdiff(test_datapath):
339345
# diff1 = np.loadtxt(test_datapath("100_diff_points_1.txt"))
340346
# diff2 = np.loadtxt(test_datapath("100_diff_points_2.txt"))

0 commit comments

Comments
 (0)