diff --git a/CHANGELOG.md b/CHANGELOG.md index aa56a7a4..a50092b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,8 @@ * Added `check_image_series_starting_frame_without_external_file` to verify that `starting_frame` is not set when `external_file` is not used in an `ImageSeries`. [#235](https://github.com/NeurodataWithoutBorders/nwbinspector/issues/235) * Added `check_sweeptable_deprecated` to detect usage of the deprecated `SweepTable` in NWB files with schema version >= 2.4.0, which should use `IntracellularRecordingsTable` instead. [#657](https://github.com/NeurodataWithoutBorders/nwbinspector/issues/657) * Added `check_time_series_data_is_not_empty` to detect empty `.data` fields in `TimeSeries` containers, which often indicate incomplete data entry or conversion errors. Skips `ImageSeries` with `external_file` set, where empty data is intentional. [#668](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/668) +* Added `check_publication_list_format` to detect comma-separated DOIs/URLs in `related_publications` entries that should be separate list entries. [#419](https://github.com/NeurodataWithoutBorders/nwbinspector/issues/419) +* Added `check_publication_doi_resolves` to verify that DOI URLs in `related_publications` actually resolve to valid publications. [#419](https://github.com/NeurodataWithoutBorders/nwbinspector/issues/419) ### Improvements * Added documentation to API and CLI docs on how to use the dandi config option. [#624](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/624) diff --git a/docs/best_practices/nwbfile_metadata.rst b/docs/best_practices/nwbfile_metadata.rst index 6cbbeedb..b67893e0 100644 --- a/docs/best_practices/nwbfile_metadata.rst +++ b/docs/best_practices/nwbfile_metadata.rst @@ -181,7 +181,16 @@ of the form ``'doi: ###'`` or as an external link of the form ``'http://dx.doi.o This allows metadata collection programs, such as those on the :dandi-archive:`DANDI archive <>` to easily form direct hyperlinks to the publications. -Check function: :py:meth:`~nwbinspector.checks._nwbfile_metadata.check_doi_publications` +Each publication should be a separate entry in the list. Do not combine multiple DOIs or URLs into a single +comma-separated string. For example, use ``["https://doi.org/10.1234/abc", "https://doi.org/10.5678/def"]`` instead of +``["https://doi.org/10.1234/abc,https://doi.org/10.5678/def"]``. + +DOIs should be valid and resolvable. The inspector can verify that DOI URLs actually resolve to valid publications +by making network requests to the DOI resolver service. + +Check functions: :py:meth:`~nwbinspector.checks._nwbfile_metadata.check_doi_publications`, +:py:meth:`~nwbinspector.checks._nwbfile_metadata.check_publication_list_format`, and +:py:meth:`~nwbinspector.checks._nwbfile_metadata.check_publication_doi_resolves` diff --git a/src/nwbinspector/checks/__init__.py b/src/nwbinspector/checks/__init__.py index 1ae15f17..86b5bcbd 100644 --- a/src/nwbinspector/checks/__init__.py +++ b/src/nwbinspector/checks/__init__.py @@ -52,6 +52,8 @@ check_institution, check_keywords, check_processing_module_name, + check_publication_doi_resolves, + check_publication_list_format, check_session_id_no_slashes, check_session_start_time_future_date, check_session_start_time_old_date, @@ -146,6 +148,8 @@ "check_session_start_time_future_date", "check_processing_module_name", "check_session_start_time_old_date", + "check_publication_list_format", + "check_publication_doi_resolves", "check_optogenetic_stimulus_site_has_optogenetic_series", "check_excitation_lambda_in_nm", "check_plane_segmentation_image_mask_shape_against_ref_images", diff --git a/src/nwbinspector/checks/_nwbfile_metadata.py b/src/nwbinspector/checks/_nwbfile_metadata.py index c0ea102e..3744e70a 100644 --- a/src/nwbinspector/checks/_nwbfile_metadata.py +++ b/src/nwbinspector/checks/_nwbfile_metadata.py @@ -4,6 +4,8 @@ from datetime import datetime from pathlib import Path from typing import Iterable, Optional +from urllib.error import HTTPError, URLError +from urllib.request import Request, urlopen from hdmf_zarr import NWBZarrIO from isodate import Duration, parse_duration @@ -165,6 +167,119 @@ def check_doi_publications(nwbfile: NWBFile) -> Optional[Iterable[InspectorMessa return None +@register_check(importance=Importance.BEST_PRACTICE_SUGGESTION, neurodata_type=NWBFile) +def check_publication_list_format(nwbfile: NWBFile) -> Optional[Iterable[InspectorMessage]]: + """ + Check if related_publications entries contain comma-separated values that should be separate list entries. + + Best Practice: :ref:`best_practice_doi_publications` + """ + if not nwbfile.related_publications: + return None + for publication in nwbfile.related_publications: + publication = publication.decode() if isinstance(publication, bytes) else publication + # Check for comma-separated DOIs or URLs within a single entry + # Look for patterns like "doi:xxx,doi:yyy" or "https://doi.org/xxx,https://doi.org/yyy" + if "," in publication: + # Check if the comma appears to separate multiple DOIs/URLs + parts = [p.strip() for p in publication.split(",")] + doi_indicators = ["doi:", "doi.org/", "dx.doi.org/"] + doi_like_parts = [part for part in parts if any(indicator in part.lower() for indicator in doi_indicators)] + if len(doi_like_parts) > 1: + yield InspectorMessage( + message=( + f"Metadata /general/related_publications contains a comma-separated list '{publication}'. " + "Each publication should be a separate entry in the list, not combined in a single string." + ) + ) + + return None + + +def _convert_doi_to_url(doi_string: str) -> Optional[str]: + """ + Convert a DOI string to a resolvable URL. + + Handles formats like: + - "doi:10.1234/abc" -> "https://doi.org/10.1234/abc" + - "https://doi.org/10.1234/abc" -> "https://doi.org/10.1234/abc" + - "http://dx.doi.org/10.1234/abc" -> "http://dx.doi.org/10.1234/abc" + + Returns None if the string is not a recognizable DOI format. + """ + doi_string = doi_string.strip() + + if doi_string.startswith("https://doi.org/") or doi_string.startswith("http://dx.doi.org/"): + return doi_string + elif doi_string.startswith("doi:"): + # Extract the DOI identifier after "doi:" + doi_id = doi_string[4:].strip() + return f"https://doi.org/{doi_id}" + + return None + + +def _check_url_resolves(url: str, timeout: int = 10) -> tuple[bool, Optional[str]]: + """ + Check if a URL resolves by making a HEAD request. + + Returns a tuple of (success, error_message). + """ + try: + request = Request(url, method="HEAD") + request.add_header("User-Agent", "NWBInspector/1.0") + with urlopen(request, timeout=timeout) as response: + # 2xx and 3xx status codes are considered successful + if response.status < 400: + return True, None + return False, f"HTTP {response.status}" + except HTTPError as e: + return False, f"HTTP {e.code}" + except URLError as e: + return False, str(e.reason) + except TimeoutError: + return False, "Request timed out" + except Exception as e: + return False, str(e) + + +@register_check(importance=Importance.BEST_PRACTICE_SUGGESTION, neurodata_type=NWBFile) +def check_publication_doi_resolves(nwbfile: NWBFile) -> Optional[Iterable[InspectorMessage]]: + """ + Check if DOI URLs in related_publications actually resolve. + + This check makes network requests to verify that DOI URLs are valid and accessible. + + Best Practice: :ref:`best_practice_doi_publications` + """ + if not nwbfile.related_publications: + return None + + valid_starts = ["doi:", "http://dx.doi.org/", "https://doi.org/"] + + for publication in nwbfile.related_publications: + publication = publication.decode() if isinstance(publication, bytes) else publication + + # Only check entries that look like DOIs + if not any(publication.startswith(valid_start) for valid_start in valid_starts): + continue + + url = _convert_doi_to_url(publication) + if url is None: + continue + + resolves, error = _check_url_resolves(url) + if not resolves: + yield InspectorMessage( + message=( + f"Metadata /general/related_publications DOI '{publication}' does not resolve. " + f"Error: {error}. Please verify the DOI is correct." + ) + ) + + return None + + @register_check(importance=Importance.CRITICAL, neurodata_type=Subject) def check_subject_age(subject: Subject) -> Optional[InspectorMessage]: """Check if the Subject age is in ISO 8601 or our extension of it for ranges.""" diff --git a/tests/unit_tests/test_nwbfile_metadata.py b/tests/unit_tests/test_nwbfile_metadata.py index 7200abe1..8c4c2e06 100644 --- a/tests/unit_tests/test_nwbfile_metadata.py +++ b/tests/unit_tests/test_nwbfile_metadata.py @@ -1,5 +1,6 @@ import tempfile from datetime import datetime, timezone +from unittest.mock import patch from uuid import uuid4 from hdmf_zarr import NWBZarrIO @@ -16,6 +17,8 @@ check_institution, check_keywords, check_processing_module_name, + check_publication_doi_resolves, + check_publication_list_format, check_session_id_no_slashes, check_session_start_time_future_date, check_session_start_time_old_date, @@ -29,7 +32,11 @@ check_subject_species_form, check_subject_weight, ) -from nwbinspector.checks._nwbfile_metadata import PROCESSING_MODULE_CONFIG +from nwbinspector.checks._nwbfile_metadata import ( + PROCESSING_MODULE_CONFIG, + _check_url_resolves, + _convert_doi_to_url, +) from nwbinspector.testing import make_minimal_nwbfile minimal_nwbfile = make_minimal_nwbfile() @@ -269,6 +276,239 @@ def test_check_doi_publications_multiple_fail(): ] +def test_check_publication_list_format_pass(): + """Test that properly formatted publications pass the check.""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + related_publications=["https://doi.org/10.1234/abc", "https://doi.org/10.5678/def"], + ) + assert check_publication_list_format(nwbfile) is None + + +def test_check_publication_list_format_pass_single_comma_in_title(): + """Test that a single publication with a comma in the title passes (not multiple DOIs).""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + related_publications=["Some publication title, with comma"], + ) + assert check_publication_list_format(nwbfile) is None + + +def test_check_publication_list_format_pass_no_publications(): + """Test that no related_publications passes the check.""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + ) + assert check_publication_list_format(nwbfile) is None + + +def test_check_publication_list_format_fail_comma_separated_doi_urls(): + """Test detection of comma-separated DOI URLs in a single entry.""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + related_publications=["https://doi.org/10.1234/abc,https://doi.org/10.5678/def"], + ) + assert check_publication_list_format(nwbfile) == [ + InspectorMessage( + message=( + "Metadata /general/related_publications contains a comma-separated list " + "'https://doi.org/10.1234/abc,https://doi.org/10.5678/def'. " + "Each publication should be a separate entry in the list, not combined in a single string." + ), + importance=Importance.BEST_PRACTICE_SUGGESTION, + check_function_name="check_publication_list_format", + object_type="NWBFile", + object_name="root", + location="/", + ) + ] + + +def test_check_publication_list_format_fail_comma_separated_doi_prefix(): + """Test detection of comma-separated DOI prefixes in a single entry.""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + related_publications=["doi:10.1234/abc, doi:10.5678/def"], + ) + assert check_publication_list_format(nwbfile) == [ + InspectorMessage( + message=( + "Metadata /general/related_publications contains a comma-separated list " + "'doi:10.1234/abc, doi:10.5678/def'. " + "Each publication should be a separate entry in the list, not combined in a single string." + ), + importance=Importance.BEST_PRACTICE_SUGGESTION, + check_function_name="check_publication_list_format", + object_type="NWBFile", + object_name="root", + location="/", + ) + ] + + +def test_check_publication_list_format_bytestring_fail(): + """Test that bytestrings are properly decoded and checked.""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + related_publications=[b"https://doi.org/10.1234/abc,https://doi.org/10.5678/def"], + ) + assert check_publication_list_format(nwbfile) == [ + InspectorMessage( + message=( + "Metadata /general/related_publications contains a comma-separated list " + "'https://doi.org/10.1234/abc,https://doi.org/10.5678/def'. " + "Each publication should be a separate entry in the list, not combined in a single string." + ), + importance=Importance.BEST_PRACTICE_SUGGESTION, + check_function_name="check_publication_list_format", + object_type="NWBFile", + object_name="root", + location="/", + ) + ] + + +# Unit tests for _convert_doi_to_url helper +def test_convert_doi_to_url_https(): + assert _convert_doi_to_url("https://doi.org/10.1234/abc") == "https://doi.org/10.1234/abc" + + +def test_convert_doi_to_url_dx(): + assert _convert_doi_to_url("http://dx.doi.org/10.1234/abc") == "http://dx.doi.org/10.1234/abc" + + +def test_convert_doi_to_url_doi_prefix(): + assert _convert_doi_to_url("doi:10.1234/abc") == "https://doi.org/10.1234/abc" + + +def test_convert_doi_to_url_not_a_doi(): + assert _convert_doi_to_url("not a doi") is None + + +# Unit tests for _check_url_resolves helper (mocked) +def test_check_url_resolves_success(): + with patch("nwbinspector.checks._nwbfile_metadata.urlopen") as mock_urlopen: + mock_urlopen.return_value.__enter__ = lambda s: type("Response", (), {"status": 200})() + mock_urlopen.return_value.__exit__ = lambda s, *a: None + success, error = _check_url_resolves("https://doi.org/10.1234/abc") + assert success is True + assert error is None + + +def test_check_url_resolves_http_error(): + from urllib.error import HTTPError + + with patch("nwbinspector.checks._nwbfile_metadata.urlopen") as mock_urlopen: + mock_urlopen.side_effect = HTTPError(url=None, code=404, msg="Not Found", hdrs=None, fp=None) + success, error = _check_url_resolves("https://doi.org/10.1234/abc") + assert success is False + assert "404" in error + + +def test_check_url_resolves_url_error(): + from urllib.error import URLError + + with patch("nwbinspector.checks._nwbfile_metadata.urlopen") as mock_urlopen: + mock_urlopen.side_effect = URLError(reason="Name or service not known") + success, error = _check_url_resolves("https://doi.org/10.1234/abc") + assert success is False + assert "Name or service not known" in error + + +def test_check_url_resolves_timeout(): + with patch("nwbinspector.checks._nwbfile_metadata.urlopen") as mock_urlopen: + mock_urlopen.side_effect = TimeoutError() + success, error = _check_url_resolves("https://doi.org/10.1234/abc") + assert success is False + assert "timed out" in error + + +# Mock-based tests for check_publication_doi_resolves (no network) +def test_check_publication_doi_resolves_pass_mocked(): + """Test that a valid DOI resolves successfully (mocked).""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + related_publications=["https://doi.org/10.1000/182"], + ) + with patch("nwbinspector.checks._nwbfile_metadata._check_url_resolves", return_value=(True, None)): + assert check_publication_doi_resolves(nwbfile) is None + + +def test_check_publication_doi_resolves_pass_no_publications(): + """Test that no related_publications passes the check.""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + ) + assert check_publication_doi_resolves(nwbfile) is None + + +def test_check_publication_doi_resolves_fail_mocked(): + """Test that an invalid DOI fails to resolve (mocked).""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + related_publications=["https://doi.org/10.1234/this-doi-does-not-exist-abc123xyz"], + ) + with patch("nwbinspector.checks._nwbfile_metadata._check_url_resolves", return_value=(False, "HTTP 404")): + results = list(check_publication_doi_resolves(nwbfile)) + assert len(results) == 1 + assert "does not resolve" in results[0].message + assert results[0].importance == Importance.BEST_PRACTICE_SUGGESTION + assert results[0].check_function_name == "check_publication_doi_resolves" + + +def test_check_publication_doi_resolves_pass_non_doi_skipped(): + """Test that non-DOI entries are skipped (not checked for resolution).""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + related_publications=["Some random text that is not a DOI"], + ) + assert check_publication_doi_resolves(nwbfile) is None + + +def test_check_publication_doi_resolves_doi_prefix_format_mocked(): + """Test that DOIs with 'doi:' prefix are resolved correctly (mocked).""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + related_publications=["doi:10.1000/182"], + ) + with patch("nwbinspector.checks._nwbfile_metadata._check_url_resolves", return_value=(True, None)): + assert check_publication_doi_resolves(nwbfile) is None + + +def test_check_publication_doi_resolves_bytestring_mocked(): + """Test that bytestrings are properly decoded.""" + nwbfile = NWBFile( + session_description="", + identifier=str(uuid4()), + session_start_time=datetime.now().astimezone(), + related_publications=[b"https://doi.org/10.1000/182"], + ) + with patch("nwbinspector.checks._nwbfile_metadata._check_url_resolves", return_value=(True, None)): + assert check_publication_doi_resolves(nwbfile) is None + + def test_check_subject_sex(): nwbfile = NWBFile(session_description="", identifier=str(uuid4()), session_start_time=datetime.now().astimezone()) nwbfile.subject = Subject(subject_id="001")