diff --git a/dandi/consts.py b/dandi/consts.py index 996f331f1..d689c57bc 100644 --- a/dandi/consts.py +++ b/dandi/consts.py @@ -170,7 +170,7 @@ def urls(self) -> Iterator[str]: #: HTTP response status codes that should always be retried (until we run out #: of retries) -RETRY_STATUSES = (500, 502, 503, 504) +RETRY_STATUSES = (429, 500, 502, 503, 504) VIDEO_FILE_EXTENSIONS = [".mp4", ".avi", ".wmv", ".mov", ".flv", ".mkv"] VIDEO_FILE_MODULES = ["processing", "acquisition"] diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index e351b32e3..87f86e164 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -43,6 +43,7 @@ chunked, ensure_datetime, get_instance, + get_retry_after, is_interactive, is_page2_url, joinurl, @@ -236,6 +237,13 @@ def request( ) if data is not None and hasattr(data, "seek"): data.seek(0) + if retry_after := get_retry_after(result): + lgr.debug( + "Sleeping for %d seconds as instructed in response " + "(in addition to tenacity imposed)", + retry_after, + ) + sleep(retry_after) result.raise_for_status() except Exception as e: if isinstance(e, requests.HTTPError): diff --git a/dandi/dandiarchive.py b/dandi/dandiarchive.py index afffe4450..75d27607d 100644 --- a/dandi/dandiarchive.py +++ b/dandi/dandiarchive.py @@ -51,7 +51,7 @@ ) from .dandiapi import BaseRemoteAsset, DandiAPIClient, RemoteDandiset from .exceptions import FailedToConnectError, NotFoundError, UnknownURLError -from .utils import get_instance +from .utils import get_instance, get_retry_after lgr = get_logger() @@ -893,14 +893,26 @@ def follow_redirect(url: str) -> str: while True: r = requests.head(url, allow_redirects=True) if r.status_code in RETRY_STATUSES and i < 4: - delay = 0.1 * 10**i - lgr.warning( - "HEAD request to %s returned %d; sleeping for %f seconds and then retrying...", - url, - r.status_code, - delay, - ) - sleep(delay) + retry_after = get_retry_after(r) + if retry_after is not None: + delay = retry_after + else: + delay = 0.1 * 10**i + if delay: + lgr.warning( + 
"HEAD request to %s returned %d; " + "sleeping for %f seconds and then retrying...", + url, + r.status_code, + delay, + ) + sleep(delay) + else: + lgr.warning( + "HEAD request to %s returned %d; retrying...", + url, + r.status_code, + ) i += 1 continue elif r.status_code == 404: diff --git a/dandi/download.py b/dandi/download.py index 93f878275..536878389 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -49,6 +49,7 @@ ensure_datetime, exclude_from_zarr, flattened, + get_retry_after, is_same_time, path_is_subpath, pluralize, @@ -796,7 +797,7 @@ def _download_file( # Catching RequestException lets us retry on timeout & connection # errors (among others) in addition to HTTP status errors. except requests.RequestException as exc: - attempts_allowed_or_not = _check_if_more_attempts_allowed( + attempts_allowed_or_not = _check_attempts_and_sleep( path=path, exc=exc, attempt=attempt, @@ -1080,15 +1081,29 @@ def downloads_gen(): yield {"status": "done"} -def _check_if_more_attempts_allowed( +def _check_attempts_and_sleep( path: Path, exc: requests.RequestException, attempt: int, attempts_allowed: int, - downloaded_in_attempt: int, + downloaded_in_attempt: int = 0, ) -> int | None: - """Check if we should retry the download, return potentially adjusted 'attempts_allowed'""" - sleep_amount = random.random() * 5 * attempt + """ + Check if we should retry the download, sleep if still allowed, + and return potentially adjusted 'attempts_allowed' + + :param path: Destination of the download + :param exc: Exception raised during the last download attempt + :param attempt: The index of the last download attempt + :param attempts_allowed: The number of download attempts currently allowed + :param downloaded_in_attempt: The number of bytes downloaded in the last attempt + + :returns: The number of download attempts allowed, potentially adjusted, if download + should be retried. None if download should not be retried. 
+ Note: If download should be retried, this function sleeps before returning. + otherwise, it returns immediately. + """ + sleep_amount: float | None = None if os.environ.get("DANDI_DOWNLOAD_AGGRESSIVE_RETRY"): # in such a case if we downloaded a little more -- # consider it a successful attempt @@ -1104,12 +1119,11 @@ def _check_if_more_attempts_allowed( ) attempts_allowed += 1 if attempt >= attempts_allowed: - lgr.debug("%s - download failed after %d attempts: %s", path, attempt, exc) + lgr.debug("%s - download failed after %d attempts: %s", path, attempt + 1, exc) return None - # TODO: actually we should probably retry only on selected codes, - elif exc.response is not None: + if exc.response is not None: if exc.response.status_code not in ( - 400, # Bad Request, but happened with gider: + 400, # Bad Request, but happened with girder: # https://github.com/dandi/dandi-cli/issues/87 *RETRY_STATUSES, ): @@ -1120,46 +1134,17 @@ def _check_if_more_attempts_allowed( exc, ) return None - elif retry_after := exc.response.headers.get("Retry-After"): - # playing safe - if not str(retry_after).isdigit(): - # our code is wrong, do not crash but issue warning so - # we might get report/fix it up - lgr.warning( - "%s - download failed due to response %d with non-integer" - " Retry-After=%r: %s", - path, - exc.response.status_code, - retry_after, - exc, - ) - return None - sleep_amount = int(retry_after) - lgr.debug( - "%s - download failed due to response %d with " - "Retry-After=%d: %s, will sleep and retry", - path, - exc.response.status_code, - sleep_amount, - exc, - ) - else: - lgr.debug( - "%s - download failed on attempt #%d: %s, will sleep a bit and retry", - path, - attempt, - exc, - ) - # if is_access_denied(exc) or attempt >= 2: - # raise - # sleep a little and retry - else: - lgr.debug( - "%s - download failed on attempt #%d: %s, will sleep a bit and retry", - path, - attempt, - exc, - ) + sleep_amount = get_retry_after(exc.response) + if sleep_amount is None: + # 
Retry-After was not set, so we come up with a random duration to sleep + sleep_amount = random.random() * 5 * attempt + lgr.debug( + "%s - download failed on attempt #%d: %s, will sleep %f and retry", + path, + attempt, + exc, + sleep_amount, + ) time.sleep(sleep_amount) return attempts_allowed diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index fb9626d36..de03e9872 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -2,6 +2,8 @@ from collections.abc import Callable from contextlib import nullcontext +from email.utils import parsedate_to_datetime +from functools import partial from glob import glob import json import logging @@ -12,10 +14,13 @@ import re from shutil import rmtree import time +from unittest import mock import numpy as np import pytest from pytest_mock import MockerFixture +import requests +from requests.exceptions import HTTPError import responses import zarr @@ -32,6 +37,7 @@ PathType, ProgressCombiner, PYOUTHelper, + _check_attempts_and_sleep, download, ) from ..exceptions import NotFoundError @@ -1169,3 +1175,120 @@ def test_DownloadDirectory_exc( assert dl.dirpath.exists() assert dl.fp is None assert dl.writefile.read_bytes() == b"456" + + +def test__check_attempts_and_sleep() -> None: + f = partial(_check_attempts_and_sleep, Path("some/path")) + + response403 = requests.Response() + response403.status_code = 403 # no retry + + response500 = requests.Response() + response500.status_code = 500 + + # we do retry if cause is unknown (no response) + with mock.patch("time.sleep") as mock_sleep: + assert f(HTTPError(), attempt=1, attempts_allowed=2) == 2 + mock_sleep.assert_called_once() + assert mock_sleep.call_args.args[0] >= 0 + + # or if some 500 + with mock.patch("time.sleep") as mock_sleep: + assert f(HTTPError(response=response500), attempt=1, attempts_allowed=2) == 2 + mock_sleep.assert_called_once() + assert mock_sleep.call_args.args[0] >= 0 + + # do not bother if already at limit + with
mock.patch("time.sleep") as mock_sleep: + assert f(HTTPError(), attempt=2, attempts_allowed=2) is None + mock_sleep.assert_not_called() + + # do not bother if 403 + with mock.patch("time.sleep") as mock_sleep: + assert f(HTTPError(response=response403), attempt=1, attempts_allowed=2) is None + mock_sleep.assert_not_called() + + # And in case of "Aggressive setting" when DANDI_DOWNLOAD_AGGRESSIVE_RETRY + # env var is set to 1, we retry if there was extra content downloaded + # patch env var DANDI_DOWNLOAD_AGGRESSIVE_RETRY + with mock.patch.dict(os.environ, {"DANDI_DOWNLOAD_AGGRESSIVE_RETRY": "1"}): + with mock.patch("time.sleep") as mock_sleep: + assert ( + f(HTTPError(), attempt=2, attempts_allowed=2, downloaded_in_attempt=0) + is None + ) + mock_sleep.assert_not_called() + + assert ( + f(HTTPError(), attempt=2, attempts_allowed=2, downloaded_in_attempt=1) + == 3 + ) + mock_sleep.assert_called_once() + assert mock_sleep.call_args.args[0] >= 0 + + +@pytest.mark.parametrize("status_code", [429, 503]) +def test__check_attempts_and_sleep_retries(status_code: int) -> None: + f = partial(_check_attempts_and_sleep, Path("some/path")) + + response = requests.Response() + response.status_code = status_code + + response.headers["Retry-After"] = "10" + with mock.patch("time.sleep") as mock_sleep: + assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2 + mock_sleep.assert_called_once() + assert mock_sleep.call_args.args[0] == 10 + + response.headers["Retry-After"] = "Wed, 21 Oct 2015 07:28:00 GMT" + with mock.patch("time.sleep") as mock_sleep, mock.patch( + "dandi.utils.datetime" + ) as mock_datetime: + # shifted by 2 minutes + mock_datetime.datetime.now.return_value = parsedate_to_datetime( + "Wed, 21 Oct 2015 07:26:00 GMT" + ) + assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2 + mock_sleep.assert_called_once() + assert mock_sleep.call_args.args[0] == 120 + + # we would still sleep some time if Retry-After is not decypherable + 
response.headers["Retry-After"] = "indecipherable" + with mock.patch("time.sleep") as mock_sleep: + assert ( + _check_attempts_and_sleep( + Path("some/path"), + HTTPError(response=response), + attempt=1, + attempts_allowed=2, + ) + == 2 + ) + mock_sleep.assert_called_once() + assert mock_sleep.call_args.args[0] > 0 + + # shifted by 1 year! (too long) + response.headers["Retry-After"] = "Wed, 21 Oct 2016 07:28:00 GMT" + with mock.patch("time.sleep") as mock_sleep, mock.patch( + "dandi.utils.datetime" + ) as mock_datetime: + mock_datetime.datetime.now.return_value = parsedate_to_datetime( + "Wed, 21 Oct 2015 07:28:00 GMT" + ) + assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2 + mock_sleep.assert_called_once() + # and we do sleep some time + assert mock_sleep.call_args.args[0] > 0 + + # in the past second (too quick) + response.headers["Retry-After"] = "Wed, 21 Oct 2015 07:27:59 GMT" + with mock.patch("time.sleep") as mock_sleep, mock.patch( + "dandi.utils.datetime" + ) as mock_datetime: + mock_datetime.datetime.now.return_value = parsedate_to_datetime( + "Wed, 21 Oct 2015 07:28:00 GMT" + ) + assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2 + mock_sleep.assert_called_once() + # and we do not sleep really + assert not mock_sleep.call_args.args[0] diff --git a/dandi/utils.py b/dandi/utils.py index b7f9ed5b5..a5c3fbc16 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -3,6 +3,7 @@ from bisect import bisect from collections.abc import Iterable, Iterator import datetime +from email.utils import parsedate_to_datetime from functools import lru_cache from importlib.metadata import version as importlib_version import inspect @@ -890,3 +891,65 @@ def joinurl(base: str, path: str) -> str: return path else: return base.rstrip("/") + "/" + path.lstrip("/") + + +def get_retry_after(response: requests.Response) -> Optional[int]: + """If provided and parsed ok, returns duration in seconds to sleep before retry. 
+ + If not provided in the response header `Retry-After`, would + return None. + If parsing fails, or provided date/sleep does not make sense + since either too far in the past (over 2 seconds) or in the future + (over a week), would return None. + """ + retry_after = response.headers.get("Retry-After") + if retry_after is None: + return None + sleep_amount: int | None + current_date = datetime.datetime.now(datetime.timezone.utc) + try: + sleep_amount = int(retry_after) + except ValueError: + # else if it is a datestamp like "Wed, 21 Oct 2015 07:28:00 GMT" + # we could parse it and calculate how long to sleep + try: + retry_after_date = parsedate_to_datetime(retry_after) + except (ValueError, TypeError) as exc_ve: + # our code or response is wrong, do not crash but issue warning + # and continue with "if_unparsable" sleep logic + sleep_amount = None + lgr.warning( + "response %d has incorrect date in Retry-After=%s: %s. Returning %r", + response.status_code, + retry_after, + exc_ve, + sleep_amount, + ) + else: + difference = retry_after_date - current_date + sleep_amount = int(difference.total_seconds()) + + if sleep_amount: + if -2 < sleep_amount < 0: + # allow for up to a few seconds delay in us receiving/parsing etc + # but otherwise assume abnormality and just return if_unparsable + sleep_amount = 0 + elif sleep_amount < 0: + sleep_amount = None + lgr.warning( + "date in Retry-After=%s is in the past (current is %r). " + "Returning %r", + retry_after, + current_date, + sleep_amount, + ) + elif sleep_amount > 7 * 24 * 60 * 60: # week + sleep_amount = None + lgr.warning( + "date in Retry-After=%s is over a week in the future (current is %r). " + "Returning %r", + retry_after, + current_date, + sleep_amount, + ) + return sleep_amount