From 8ea22552b07b5177ea3c5006fc340d340a96b601 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Sun, 9 Feb 2025 16:18:19 -0800 Subject: [PATCH 1/9] feat: Add API management command for correcting corrupt metadata --- .../management/commands/correct_metadata.py | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 dandiapi/api/management/commands/correct_metadata.py diff --git a/dandiapi/api/management/commands/correct_metadata.py b/dandiapi/api/management/commands/correct_metadata.py new file mode 100644 index 000000000..269828290 --- /dev/null +++ b/dandiapi/api/management/commands/correct_metadata.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Callable + +from django.db import transaction +import djclick as click + +from dandiapi.api.models import Version +from dandiapi.api.tasks import write_manifest_files + + +@click.command( + help='Correct corrupted metadata. If `--dandiset` and `--dandiset-version` are provided, ' + 'apply the correction to the specified Dandiset version. Otherwise, apply the correction to ' + 'all Dandiset versions.' +) +@click.option( + '--dandiset', + 'dandiset', + help='The Dandiset. (This must be provided if `--dandiset-version` is provided.)', +) +@click.option( + '--dandiset-version', + 'dandiset_version', + help='The version of the Dandiset. (This must be provided if `--dandiset` is provided.)', +) +def correct_metadata(*, dandiset: str | None, dandiset_version: str | None): + # Func to use to correct the metadata (update as needed) + correct_func: Callable[[dict], dict | None] = correct_affiliation_corruption + + # Ensure both options are provided together (or both omitted) + if (dandiset is None) != (dandiset_version is None): + raise click.UsageError( + 'Both `--dandiset` and `--dandiset-version` must be provided together.' + ) + + vers = ( + Version.objects.all() + if dandiset is None + else (Version.objects.get(dandiset=dandiset, version=dandiset_version),) + ) + + # List of changes to be applied to the database represented as tuples of the form + # version to be changed and the new metadata + changes: list[tuple[Version, dict]] = [] + for ver in vers: + try: + meta_corrected = correct_func(ver.metadata) + except Exception as e: + click.echo( + f'{ver.dandiset.identifier}/{ver.version}: Failed to correct metadata with ' + f'`{correct_func.__name__}()`' + ) + click.echo(e) + raise click.Abort from e + + if meta_corrected is not None: + changes.append((ver, meta_corrected)) + + if changes: + # Apply the changes to the database atomically + with transaction.atomic(): + for ver, meta_corrected in changes: + ver.metadata = meta_corrected + ver.save() # TODO: should I use `save()` in this case? It has some custom logic. + + for ver, _ in changes: + write_manifest_files.delay(ver.id) # TODO: Please check if this is needed + + click.echo( + f'{ver.dandiset.identifier}/{ver.version}: Metadata corrected with ' + f'`{correct_func.__name__}()`' + ) + else: + click.echo( + f'No changes. No metadata of the Dandiset version(s) need to be corrected per ' + f'`{correct_func.__name__}()`' + ) + + +def correct_affiliation_corruption(meta: dict) -> dict | None: + """ + Correct corruptions in JSON objects with the `"schemaKey"` of `"Affiliation"`. + + :param meta: The Dandiset metadata instance potentially containing the objects to be corrected. + :return: If there is correction to be made, return the corrected metadata; otherwise, return + `None`. + + Note: This function corrects the corruptions described in + https://github.com/dandi/dandi-schema/issues/276 + """ From 25e4cfae4dd0156c7683f8293e958ffaf3473e6d Mon Sep 17 00:00:00 2001 From: Isaac To Date: Sun, 9 Feb 2025 22:23:25 -0800 Subject: [PATCH 2/9] feat: add solution to correct `Affiliation` corruption Provide solution to correct the corruption of `Affiliation` JSON objects documented in https://github.com/dandi/dandi-schema/issues/276 --- .../management/commands/correct_metadata.py | 42 +++- .../api/tests/test_management/__init__.py | 0 .../test_management/test_commands/__init__.py | 0 .../test_commands/test_correct_metadata.py | 224 ++++++++++++++++++ 4 files changed, 265 insertions(+), 1 deletion(-) create mode 100644 dandiapi/api/tests/test_management/__init__.py create mode 100644 dandiapi/api/tests/test_management/test_commands/__init__.py create mode 100644 dandiapi/api/tests/test_management/test_commands/test_correct_metadata.py diff --git a/dandiapi/api/management/commands/correct_metadata.py b/dandiapi/api/management/commands/correct_metadata.py index 269828290..8104b7b01 100644 --- a/dandiapi/api/management/commands/correct_metadata.py +++ b/dandiapi/api/management/commands/correct_metadata.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from copy import deepcopy +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from collections.abc import Callable @@ -92,3 +93,42 @@ def correct_affiliation_corruption(meta: dict) -> dict | None: Note: This function corrects the corruptions described in https://github.com/dandi/dandi-schema/issues/276 """ + unwanted_fields = ['contactPoint', 'includeInCitation', 'roleName'] + + meta_corrected = deepcopy(meta) + affiliation_objs = find_objs(meta_corrected, 'Affiliation') + + corrected = False + for obj in affiliation_objs: + for field in unwanted_fields: + if field in obj: + del obj[field] + corrected = True + + return meta_corrected if corrected else None + + +def find_objs(instance: Any, schema_key: str) -> list[dict]: + """ + Find JSON objects with a specified `"schemaKey"` field within a data instance. + + :param instance: The data instance to find JSON objects from + :param schema_key: The `"schemaKey"` field value + :return: The list of JSON objects with the specified `"schemaKey"` in the data instance + """ + + def find_objs_(data: Any) -> None: + if isinstance(data, dict): + if 'schemaKey' in data and data['schemaKey'] == schema_key: + objs.append(data) + for value in data.values(): + find_objs_(value) + elif isinstance(data, list): + for item in data: + find_objs_(item) + else: + return + + objs: list[dict] = [] + find_objs_(instance) + return objs diff --git a/dandiapi/api/tests/test_management/__init__.py b/dandiapi/api/tests/test_management/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dandiapi/api/tests/test_management/test_commands/__init__.py b/dandiapi/api/tests/test_management/test_commands/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dandiapi/api/tests/test_management/test_commands/test_correct_metadata.py b/dandiapi/api/tests/test_management/test_commands/test_correct_metadata.py new file mode 100644 index 000000000..d4915d60c --- /dev/null +++ b/dandiapi/api/tests/test_management/test_commands/test_correct_metadata.py @@ -0,0 +1,224 @@ +from __future__ import annotations + +from copy import deepcopy +from typing import Any + +import pytest + +from dandiapi.api.management.commands.correct_metadata import ( + correct_affiliation_corruption, + find_objs, +) + + +@pytest.mark.parametrize( + ('instance', 'schema_key', 'expected'), + [ + # Single matching object. + pytest.param( + {'schemaKey': 'Test', 'data': 123}, + 'Test', + [{'schemaKey': 'Test', 'data': 123}], + id='single-match', + ), + # No match. + pytest.param( + {'schemaKey': 'NotMatch', 'data': 123}, + 'Test', + [], + id='no-match', + ), + # Empty dictionary should return an empty list. + pytest.param( + {}, + 'Test', + [], + id='empty-dict', + ), + # Empty list should return an empty list. + pytest.param( + [], + 'Test', + [], + id='empty-list', + ), + # Nested dictionary: the matching object is nested within another dictionary. + pytest.param( + {'level1': {'schemaKey': 'Test', 'info': 'nested'}}, + 'Test', + [{'schemaKey': 'Test', 'info': 'nested'}], + id='nested-dict', + ), + # List of dictionaries: only those with matching schema key are returned. + pytest.param( + [ + {'schemaKey': 'Test', 'data': 1}, + {'schemaKey': 'Test', 'data': 2}, + {'schemaKey': 'NotTest', 'data': 3}, + ], + 'Test', + [ + {'schemaKey': 'Test', 'data': 1}, + {'schemaKey': 'Test', 'data': 2}, + ], + id='list-of-dicts', + ), + # Mixed structure: nested dictionaries and lists. + pytest.param( + { + 'a': {'schemaKey': 'Test', 'value': 1}, + 'b': [ + {'schemaKey': 'NotTest', 'value': 2}, + {'schemaKey': 'Test', 'value': 3}, + ], + 'c': 'irrelevant', + 'd': [{'e': {'schemaKey': 'Test', 'value': 4}}], + }, + 'Test', + [ + {'schemaKey': 'Test', 'value': 1}, + {'schemaKey': 'Test', 'value': 3}, + {'schemaKey': 'Test', 'value': 4}, + ], + id='mixed-structure', + ), + # Non-collection type: integer. + pytest.param( + 42, + 'Test', + [], + id='non-collection-int', + ), + # Non-collection type: string. + pytest.param( + 'some string', + 'Test', + [], + id='non-collection-string', + ), + # Non-collection type: float. + pytest.param( + 3.14, + 'Test', + [], + id='non-collection-float', + ), + # Non-collection type: None. + pytest.param( + None, + 'Test', + [], + id='non-collection-None', + ), + # Nested child: an object with the schema key contains a nested child that also + # has the schema key. + pytest.param( + {'schemaKey': 'Test', 'child': {'schemaKey': 'Test', 'data': 'child'}}, + 'Test', + [ + {'schemaKey': 'Test', 'child': {'schemaKey': 'Test', 'data': 'child'}}, + {'schemaKey': 'Test', 'data': 'child'}, + ], + id='nested-child', + ), + # List in field: + # The object with the given schema key has a field whose value is a list + # containing objects, some of which also have the given schema key. + pytest.param( + { + 'schemaKey': 'Test', + 'items': [ + {'schemaKey': 'Test', 'data': 'item1'}, + {'schemaKey': 'Other', 'data': 'item2'}, + {'schemaKey': 'Test', 'data': 'item3'}, + ], + }, + 'Test', + [ + # The outer object is returned first... + { + 'schemaKey': 'Test', + 'items': [ + {'schemaKey': 'Test', 'data': 'item1'}, + {'schemaKey': 'Other', 'data': 'item2'}, + {'schemaKey': 'Test', 'data': 'item3'}, + ], + }, + # ...followed by the matching objects within the list. + {'schemaKey': 'Test', 'data': 'item1'}, + {'schemaKey': 'Test', 'data': 'item3'}, + ], + id='list-in-field', + ), + ], +) +def test_find_objs_parametrized(instance: Any, schema_key: str, expected: list[dict]) -> None: + result = find_objs(instance, schema_key) + assert result == expected + + +@pytest.mark.parametrize( + ('input_meta', 'expected_output'), + [ + # No Affiliation object: nothing to change. + ( + {'key': 'value'}, + None, + ), + # Affiliation exists but has no unwanted fields: returns None. + ( + {'affiliation': {'schemaKey': 'Affiliation', 'name': 'Alice'}}, + None, + ), + # Single unwanted field ("contactPoint") should be removed. + ( + {'affiliation': {'schemaKey': 'Affiliation', 'name': 'Alice', 'contactPoint': 'info'}}, + {'affiliation': {'schemaKey': 'Affiliation', 'name': 'Alice'}}, + ), + # Multiple unwanted fields should all be removed. + ( + { + 'affiliation': { + 'schemaKey': 'Affiliation', + 'name': 'Test', + 'contactPoint': 'a', + 'includeInCitation': 'b', + 'roleName': 'c', + } + }, + {'affiliation': {'schemaKey': 'Affiliation', 'name': 'Test'}}, + ), + # Nested Affiliation objects should be corrected. + ( + { + 'users': [ + {'profile': {'schemaKey': 'Affiliation', 'name': 'Bob', 'roleName': 'Member'}}, + {'profile': {'schemaKey': 'Affiliation', 'name': 'Charlie'}}, + ], + 'data': {'schemaKey': 'NotAffiliation', 'contactPoint': 'should not be touched'}, + }, + { + 'users': [ + {'profile': {'schemaKey': 'Affiliation', 'name': 'Bob'}}, + {'profile': {'schemaKey': 'Affiliation', 'name': 'Charlie'}}, + ], + 'data': {'schemaKey': 'NotAffiliation', 'contactPoint': 'should not be touched'}, + }, + ), + ], +) +def test_correct_affiliation_corruption(input_meta, expected_output): + """ + Test `correct_affiliation_corruption()`. + + Ensure that it returns the correct modified metadata (if any corrections are needed) + while not mutating the original input. + """ + # Make a deep copy of the input to ensure immutability. + original_meta = deepcopy(input_meta) + result = correct_affiliation_corruption(input_meta) + + assert result == expected_output + + # Verify that the original metadata has not been mutated. + assert input_meta == original_meta, 'The input metadata should remain unchanged.' From eba4c3be72d8c4d692b453dbadd4ecb460c263fa Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Fri, 28 Feb 2025 17:21:14 -0500 Subject: [PATCH 3/9] Move test_correct_metadata.py into tests/ --- .../{test_management/test_commands => }/test_correct_metadata.py | 0 dandiapi/api/tests/test_management/__init__.py | 0 dandiapi/api/tests/test_management/test_commands/__init__.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename dandiapi/api/tests/{test_management/test_commands => }/test_correct_metadata.py (100%) delete mode 100644 dandiapi/api/tests/test_management/__init__.py delete mode 100644 dandiapi/api/tests/test_management/test_commands/__init__.py diff --git a/dandiapi/api/tests/test_management/test_commands/test_correct_metadata.py b/dandiapi/api/tests/test_correct_metadata.py similarity index 100% rename from dandiapi/api/tests/test_management/test_commands/test_correct_metadata.py rename to dandiapi/api/tests/test_correct_metadata.py diff --git a/dandiapi/api/tests/test_management/__init__.py b/dandiapi/api/tests/test_management/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/dandiapi/api/tests/test_management/test_commands/__init__.py b/dandiapi/api/tests/test_management/test_commands/__init__.py deleted file mode 100644 index e69de29bb..000000000 From a21f6d00ce101489170c20c25047c2a8763ae965 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 28 Feb 2025 15:07:12 -0800 Subject: [PATCH 4/9] feat: change call interface of `correct_metadata` cmd This make the default behavior to require user to specify a particular dandiset version to apply the correct to. Only when the `--all` flag is provided, should the command apply the correction to all dandiset versions --- .../management/commands/correct_metadata.py | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/dandiapi/api/management/commands/correct_metadata.py b/dandiapi/api/management/commands/correct_metadata.py index 8104b7b01..9f4f68140 100644 --- a/dandiapi/api/management/commands/correct_metadata.py +++ b/dandiapi/api/management/commands/correct_metadata.py @@ -14,33 +14,37 @@ @click.command( - help='Correct corrupted metadata. If `--dandiset` and `--dandiset-version` are provided, ' - 'apply the correction to the specified Dandiset version. Otherwise, apply the correction to ' - 'all Dandiset versions.' + help='Correct corrupted metadata. If `--all` is provided, apply the correction to ' + 'all Dandiset versions. Otherwise, provide the Dandiset and Dandiset version to ' + 'apply the correction to.' ) +@click.argument('dandiset', required=False) +@click.argument('dandiset_version', required=False) @click.option( - '--dandiset', - 'dandiset', - help='The Dandiset. (This must be provided if `--dandiset-version` is provided.)', + '--all', + 'apply_to_all', + is_flag=True, + default=False, + help='Apply the correction to all Dandiset versions ' + '(cannot be combined with dandiset arguments).', ) -@click.option( - '--dandiset-version', - 'dandiset_version', - help='The version of the Dandiset. (This must be provided if `--dandiset` is provided.)', -) -def correct_metadata(*, dandiset: str | None, dandiset_version: str | None): - # Func to use to correct the metadata (update as needed) - correct_func: Callable[[dict], dict | None] = correct_affiliation_corruption - - # Ensure both options are provided together (or both omitted) - if (dandiset is None) != (dandiset_version is None): +def correct_metadata(*, dandiset: str | None, dandiset_version: str | None, apply_to_all: bool): + if apply_to_all: + if dandiset is not None or dandiset_version is not None: + raise click.UsageError( + 'Cannot specify `--all` together with `dandiset` or `dandiset_version` arguments.' + ) + elif dandiset is None or dandiset_version is None: raise click.UsageError( - 'Both `--dandiset` and `--dandiset-version` must be provided together.' + 'Either `--all` or two arguments (dandiset, dandiset_version) must be provided.' ) + # Func to use to correct the metadata (update as needed) + correct_func: Callable[[dict], dict | None] = correct_affiliation_corruption + vers = ( Version.objects.all() - if dandiset is None + if apply_to_all else (Version.objects.get(dandiset=dandiset, version=dandiset_version),) ) From 256a6dc893e136c82d1c6fe829206efb7a3cf4bc Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Mon, 3 Mar 2025 12:01:21 -0500 Subject: [PATCH 5/9] Add --check flag --- dandiapi/api/management/commands/correct_metadata.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dandiapi/api/management/commands/correct_metadata.py b/dandiapi/api/management/commands/correct_metadata.py index 9f4f68140..0e44128bc 100644 --- a/dandiapi/api/management/commands/correct_metadata.py +++ b/dandiapi/api/management/commands/correct_metadata.py @@ -28,7 +28,14 @@ help='Apply the correction to all Dandiset versions ' '(cannot be combined with dandiset arguments).', ) -def correct_metadata(*, dandiset: str | None, dandiset_version: str | None, apply_to_all: bool): +@click.option( + '--check', + is_flag=True, + help="Don't perform any changes, just check for corrupted metadata.", +) +def correct_metadata( + *, dandiset: str | None, dandiset_version: str | None, apply_to_all: bool, check: bool +): if apply_to_all: if dandiset is not None or dandiset_version is not None: raise click.UsageError( From a89592196098f3534058ff12a9056b0fb6e6ef75 Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Mon, 3 Mar 2025 12:41:49 -0500 Subject: [PATCH 6/9] Simplify correct_metadata and helper functions - Don't allow correct function configuration - Remove verbose error handling logic - Write manifest files synchronously - Only support correction of the `Affiliation` schema key - Remove `find_objs` tests against generalized schema key - Use transactions to couple logic of metadata save and manifest file generation --- .../management/commands/correct_metadata.py | 134 +++++++-------- dandiapi/api/tests/test_correct_metadata.py | 152 +----------------- 2 files changed, 55 insertions(+), 231 deletions(-) diff --git a/dandiapi/api/management/commands/correct_metadata.py b/dandiapi/api/management/commands/correct_metadata.py index 0e44128bc..f437075be 100644 --- a/dandiapi/api/management/commands/correct_metadata.py +++ b/dandiapi/api/management/commands/correct_metadata.py @@ -1,16 +1,15 @@ from __future__ import annotations from copy import deepcopy -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from collections.abc import Callable +import sys +import typing +from typing import Any from django.db import transaction import djclick as click +from dandiapi.api.manifests import write_dandiset_jsonld, write_dandiset_yaml from dandiapi.api.models import Version -from dandiapi.api.tasks import write_manifest_files @click.command( @@ -33,7 +32,7 @@ is_flag=True, help="Don't perform any changes, just check for corrupted metadata.", ) -def correct_metadata( +def correct_metadata( # noqa: C901 *, dandiset: str | None, dandiset_version: str | None, apply_to_all: bool, check: bool ): if apply_to_all: @@ -46,51 +45,46 @@ def correct_metadata( 'Either `--all` or two arguments (dandiset, dandiset_version) must be provided.' ) - # Func to use to correct the metadata (update as needed) - correct_func: Callable[[dict], dict | None] = correct_affiliation_corruption + # Get version queryset + vers = Version.objects.all() + if not apply_to_all: + dandiset = typing.cast(str, dandiset) + vers = vers.filter(dandiset=int(dandiset), version=dandiset_version) - vers = ( - Version.objects.all() - if apply_to_all - else (Version.objects.get(dandiset=dandiset, version=dandiset_version),) - ) + if not vers.exists(): + click.echo('No matching versions found') + return - # List of changes to be applied to the database represented as tuples of the form - # version to be changed and the new metadata - changes: list[tuple[Version, dict]] = [] + # For each version, find and fix metadata corruptions, along with saving out manifest files for ver in vers: - try: - meta_corrected = correct_func(ver.metadata) - except Exception as e: - click.echo( - f'{ver.dandiset.identifier}/{ver.version}: Failed to correct metadata with ' - f'`{correct_func.__name__}()`' - ) - click.echo(e) - raise click.Abort from e + new_meta = correct_affiliation_corruption(ver.metadata) + if new_meta is None: + continue - if meta_corrected is not None: - changes.append((ver, meta_corrected)) + click.echo(f'Found corruption in {ver}') + if check: + continue - if changes: - # Apply the changes to the database atomically + # Save each version in a separate transaction to avoid de-sync with dandiset yaml/jsonld with transaction.atomic(): - for ver, meta_corrected in changes: - ver.metadata = meta_corrected - ver.save() # TODO: should I use `save()` in this case? It has some custom logic. + write_dandiset_yaml(ver) + write_dandiset_jsonld(ver) + click.echo(f'\tWrote dandiset yaml and json for version {ver}') - for ver, _ in changes: - write_manifest_files.delay(ver.id) # TODO: Please check if this is needed + ver.metadata = new_meta + ver.save() - click.echo( - f'{ver.dandiset.identifier}/{ver.version}: Metadata corrected with ' - f'`{correct_func.__name__}()`' - ) - else: + # Remaining check is not needed since no data was modified + if check: + return + + # If we find any un-fixed instances, raise exception + remaining = [ver for ver in vers if correct_affiliation_corruption(ver.metadata) is not None] + if remaining: click.echo( - f'No changes. No metadata of the Dandiset version(s) need to be corrected per ' - f'`{correct_func.__name__}()`' + click.style(f'\nFound remaining corrupted versions: {remaining}', fg='red', bold=True) ) + sys.exit(1) def correct_affiliation_corruption(meta: dict) -> dict | None: @@ -104,42 +98,22 @@ def correct_affiliation_corruption(meta: dict) -> dict | None: Note: This function corrects the corruptions described in https://github.com/dandi/dandi-schema/issues/276 """ - unwanted_fields = ['contactPoint', 'includeInCitation', 'roleName'] - - meta_corrected = deepcopy(meta) - affiliation_objs = find_objs(meta_corrected, 'Affiliation') - - corrected = False - for obj in affiliation_objs: - for field in unwanted_fields: - if field in obj: - del obj[field] - corrected = True - - return meta_corrected if corrected else None - - -def find_objs(instance: Any, schema_key: str) -> list[dict]: - """ - Find JSON objects with a specified `"schemaKey"` field within a data instance. - - :param instance: The data instance to find JSON objects from - :param schema_key: The `"schemaKey"` field value - :return: The list of JSON objects with the specified `"schemaKey"` in the data instance - """ - - def find_objs_(data: Any) -> None: - if isinstance(data, dict): - if 'schemaKey' in data and data['schemaKey'] == schema_key: - objs.append(data) - for value in data.values(): - find_objs_(value) - elif isinstance(data, list): - for item in data: - find_objs_(item) - else: - return - - objs: list[dict] = [] - find_objs_(instance) - return objs + new_meta = deepcopy(meta) + correct_objs(new_meta) + + return new_meta if new_meta != meta else None + + +def correct_objs(data: Any) -> None: + if isinstance(data, dict): + if 'schemaKey' in data and data['schemaKey'] == 'Affiliation': + data.pop('contactPoint', None) + data.pop('includeInCitation', None) + data.pop('roleName', None) + for value in data.values(): + correct_objs(value) + elif isinstance(data, list): + for item in data: + correct_objs(item) + else: + return diff --git a/dandiapi/api/tests/test_correct_metadata.py b/dandiapi/api/tests/test_correct_metadata.py index d4915d60c..6d9b734df 100644 --- a/dandiapi/api/tests/test_correct_metadata.py +++ b/dandiapi/api/tests/test_correct_metadata.py @@ -1,160 +1,10 @@ from __future__ import annotations from copy import deepcopy -from typing import Any import pytest -from dandiapi.api.management.commands.correct_metadata import ( - correct_affiliation_corruption, - find_objs, -) - - -@pytest.mark.parametrize( - ('instance', 'schema_key', 'expected'), - [ - # Single matching object. - pytest.param( - {'schemaKey': 'Test', 'data': 123}, - 'Test', - [{'schemaKey': 'Test', 'data': 123}], - id='single-match', - ), - # No match. - pytest.param( - {'schemaKey': 'NotMatch', 'data': 123}, - 'Test', - [], - id='no-match', - ), - # Empty dictionary should return an empty list. - pytest.param( - {}, - 'Test', - [], - id='empty-dict', - ), - # Empty list should return an empty list. - pytest.param( - [], - 'Test', - [], - id='empty-list', - ), - # Nested dictionary: the matching object is nested within another dictionary. - pytest.param( - {'level1': {'schemaKey': 'Test', 'info': 'nested'}}, - 'Test', - [{'schemaKey': 'Test', 'info': 'nested'}], - id='nested-dict', - ), - # List of dictionaries: only those with matching schema key are returned. - pytest.param( - [ - {'schemaKey': 'Test', 'data': 1}, - {'schemaKey': 'Test', 'data': 2}, - {'schemaKey': 'NotTest', 'data': 3}, - ], - 'Test', - [ - {'schemaKey': 'Test', 'data': 1}, - {'schemaKey': 'Test', 'data': 2}, - ], - id='list-of-dicts', - ), - # Mixed structure: nested dictionaries and lists. - pytest.param( - { - 'a': {'schemaKey': 'Test', 'value': 1}, - 'b': [ - {'schemaKey': 'NotTest', 'value': 2}, - {'schemaKey': 'Test', 'value': 3}, - ], - 'c': 'irrelevant', - 'd': [{'e': {'schemaKey': 'Test', 'value': 4}}], - }, - 'Test', - [ - {'schemaKey': 'Test', 'value': 1}, - {'schemaKey': 'Test', 'value': 3}, - {'schemaKey': 'Test', 'value': 4}, - ], - id='mixed-structure', - ), - # Non-collection type: integer. - pytest.param( - 42, - 'Test', - [], - id='non-collection-int', - ), - # Non-collection type: string. - pytest.param( - 'some string', - 'Test', - [], - id='non-collection-string', - ), - # Non-collection type: float. - pytest.param( - 3.14, - 'Test', - [], - id='non-collection-float', - ), - # Non-collection type: None. - pytest.param( - None, - 'Test', - [], - id='non-collection-None', - ), - # Nested child: an object with the schema key contains a nested child that also - # has the schema key. - pytest.param( - {'schemaKey': 'Test', 'child': {'schemaKey': 'Test', 'data': 'child'}}, - 'Test', - [ - {'schemaKey': 'Test', 'child': {'schemaKey': 'Test', 'data': 'child'}}, - {'schemaKey': 'Test', 'data': 'child'}, - ], - id='nested-child', - ), - # List in field: - # The object with the given schema key has a field whose value is a list - # containing objects, some of which also have the given schema key. - pytest.param( - { - 'schemaKey': 'Test', - 'items': [ - {'schemaKey': 'Test', 'data': 'item1'}, - {'schemaKey': 'Other', 'data': 'item2'}, - {'schemaKey': 'Test', 'data': 'item3'}, - ], - }, - 'Test', - [ - # The outer object is returned first... - { - 'schemaKey': 'Test', - 'items': [ - {'schemaKey': 'Test', 'data': 'item1'}, - {'schemaKey': 'Other', 'data': 'item2'}, - {'schemaKey': 'Test', 'data': 'item3'}, - ], - }, - # ...followed by the matching objects within the list. - {'schemaKey': 'Test', 'data': 'item1'}, - {'schemaKey': 'Test', 'data': 'item3'}, - ], - id='list-in-field', - ), - ], -) -def test_find_objs_parametrized(instance: Any, schema_key: str, expected: list[dict]) -> None: - result = find_objs(instance, schema_key) - assert result == expected +from dandiapi.api.management.commands.correct_metadata import correct_affiliation_corruption @pytest.mark.parametrize( From b02e2ff9661732527745aa5c436c9777b58e25ab Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Wed, 12 Mar 2025 18:45:24 -0400 Subject: [PATCH 7/9] Only apply correction to draft versions --- .../management/commands/correct_metadata.py | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/dandiapi/api/management/commands/correct_metadata.py b/dandiapi/api/management/commands/correct_metadata.py index f437075be..e24c1afde 100644 --- a/dandiapi/api/management/commands/correct_metadata.py +++ b/dandiapi/api/management/commands/correct_metadata.py @@ -14,18 +14,17 @@ @click.command( help='Correct corrupted metadata. If `--all` is provided, apply the correction to ' - 'all Dandiset versions. Otherwise, provide the Dandiset and Dandiset version to ' + 'all Dandiset versions. Otherwise, provide the Dandiset to ' 'apply the correction to.' ) @click.argument('dandiset', required=False) -@click.argument('dandiset_version', required=False) @click.option( '--all', 'apply_to_all', is_flag=True, default=False, help='Apply the correction to all Dandiset versions ' - '(cannot be combined with dandiset arguments).', + '(cannot be combined with dandiset argument).', ) @click.option( '--check', @@ -33,23 +32,19 @@ help="Don't perform any changes, just check for corrupted metadata.", ) def correct_metadata( # noqa: C901 - *, dandiset: str | None, dandiset_version: str | None, apply_to_all: bool, check: bool + *, dandiset: str | None, apply_to_all: bool, check: bool ): if apply_to_all: - if dandiset is not None or dandiset_version is not None: - raise click.UsageError( - 'Cannot specify `--all` together with `dandiset` or `dandiset_version` arguments.' - ) - elif dandiset is None or dandiset_version is None: - raise click.UsageError( - 'Either `--all` or two arguments (dandiset, dandiset_version) must be provided.' - ) + if dandiset is not None: + raise click.UsageError('Cannot specify `--all` together with `dandiset` argument.') + elif dandiset is None: + raise click.UsageError('Either `--all` or `dandiset` argument must be provided.') # Get version queryset vers = Version.objects.all() if not apply_to_all: dandiset = typing.cast(str, dandiset) - vers = vers.filter(dandiset=int(dandiset), version=dandiset_version) + vers = vers.filter(dandiset=int(dandiset), version='draft') if not vers.exists(): click.echo('No matching versions found') From c2c14f53be7093b557d11047cc36aad1a0c2b93a Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Thu, 27 Mar 2025 17:13:25 -0400 Subject: [PATCH 8/9] Add audit event to correct_metadata command --- dandiapi/api/management/commands/correct_metadata.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dandiapi/api/management/commands/correct_metadata.py b/dandiapi/api/management/commands/correct_metadata.py index e24c1afde..8d0a8de6e 100644 --- a/dandiapi/api/management/commands/correct_metadata.py +++ b/dandiapi/api/management/commands/correct_metadata.py @@ -10,6 +10,7 @@ from dandiapi.api.manifests import write_dandiset_jsonld, write_dandiset_yaml from dandiapi.api.models import Version +from dandiapi.api.services import audit @click.command( @@ -69,6 +70,14 @@ def correct_metadata( # noqa: C901 ver.metadata = new_meta ver.save() + audit.update_metadata( + dandiset=ver.dandiset, + metadata=new_meta, + user=None, + admin=True, + description='Apply metadata correction from https://github.com/dandi/dandi-schema/issues/276', + ) + # Remaining check is not needed since no data was modified if check: return From 3ee22a7ef036690c521bba225fd6fe8637f7bfea Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Fri, 28 Mar 2025 11:19:01 -0400 Subject: [PATCH 9/9] Use queryset iterators to reduce memory usage Co-Authored-By: Mike VanDenburgh --- dandiapi/api/management/commands/correct_metadata.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dandiapi/api/management/commands/correct_metadata.py b/dandiapi/api/management/commands/correct_metadata.py index 8d0a8de6e..6a03ec974 100644 --- a/dandiapi/api/management/commands/correct_metadata.py +++ b/dandiapi/api/management/commands/correct_metadata.py @@ -52,7 +52,7 @@ def correct_metadata( # noqa: C901 return # For each version, find and fix metadata corruptions, along with saving out manifest files - for ver in vers: + for ver in vers.iterator(): new_meta = correct_affiliation_corruption(ver.metadata) if new_meta is None: continue @@ -83,7 +83,9 @@ def correct_metadata( # noqa: C901 return # If we find any un-fixed instances, raise exception - remaining = [ver for ver in vers if correct_affiliation_corruption(ver.metadata) is not None] + remaining = [ + ver for ver in vers.iterator() if correct_affiliation_corruption(ver.metadata) is not None + ] if remaining: click.echo( click.style(f'\nFound remaining corrupted versions: {remaining}', fg='red', bold=True)