From 4c4418fd65d1276a43137778967327917caabf9b Mon Sep 17 00:00:00 2001 From: Jan van Mansum Date: Wed, 11 Mar 2026 11:25:16 +0100 Subject: [PATCH] Apply #12167 --- .../12122-archiving in sequence.md | 3 + doc/release-notes/12122-archiving updates.md | 8 + ...-Bag-updates.md => 12144-un-holey-bags.md} | 35 +- .../source/admin/big-data-administration.rst | 1 + .../source/installation/config.rst | 211 +++++- .../edu/harvard/iq/dataverse/DatasetPage.java | 171 +++-- .../iq/dataverse/DatasetServiceBean.java | 42 ++ .../harvard/iq/dataverse/DatasetVersion.java | 27 +- .../dataverse/DatasetVersionServiceBean.java | 23 + .../iq/dataverse/DvObjectServiceBean.java | 1 + .../iq/dataverse/EjbDataverseEngine.java | 25 + .../dataverse/FileMetadataVersionsHelper.java | 14 +- .../edu/harvard/iq/dataverse/api/Admin.java | 3 +- .../harvard/iq/dataverse/api/Datasets.java | 63 +- .../iq/dataverse/dataset/DatasetUtil.java | 17 + .../impl/AbstractSubmitToArchiveCommand.java | 212 ++++-- .../impl/DRSSubmitToArchiveCommand.java | 149 ++++- .../impl/DuraCloudSubmitToArchiveCommand.java | 346 +++++----- .../FinalizeDatasetPublicationCommand.java | 50 +- .../GoogleCloudSubmitToArchiveCommand.java | 311 ++++++--- .../impl/LocalSubmitToArchiveCommand.java | 131 ++-- .../command/impl/PublishDatasetCommand.java | 27 +- .../impl/S3SubmitToArchiveCommand.java | 249 ++++--- .../harvest/server/OAIRecordServiceBean.java | 5 +- .../iq/dataverse/search/IndexServiceBean.java | 11 +- .../iq/dataverse/settings/FeatureFlags.java | 13 + .../iq/dataverse/settings/JvmSettings.java | 5 + .../settings/SettingsServiceBean.java | 19 +- .../iq/dataverse/util/ArchiverUtil.java | 11 + .../iq/dataverse/util/bagit/BagGenerator.java | 613 +++++++++++------- .../iq/dataverse/util/bagit/OREMap.java | 15 +- .../workflow/WorkflowServiceBean.java | 61 +- .../ArchivalSubmissionWorkflowStep.java | 50 +- src/main/java/propertyFiles/Bundle.properties | 5 +- src/main/webapp/dataset-versions.xhtml | 13 +- 
src/main/webapp/resources/css/structure.css | 3 + .../edu/harvard/iq/dataverse/api/BagIT.java | 2 +- .../edu/harvard/iq/dataverse/api/SwordIT.java | 3 +- .../util/bagit/BagGeneratorInfoFileTest.java | 3 +- .../bagit/BagGeneratorMultilineWrapTest.java | 99 ++- 40 files changed, 2154 insertions(+), 896 deletions(-) create mode 100644 doc/release-notes/12122-archiving in sequence.md create mode 100644 doc/release-notes/12122-archiving updates.md rename doc/release-notes/{12063-ORE-and-Bag-updates.md => 12144-un-holey-bags.md} (63%) diff --git a/doc/release-notes/12122-archiving in sequence.md b/doc/release-notes/12122-archiving in sequence.md new file mode 100644 index 00000000000..6f4373a1e31 --- /dev/null +++ b/doc/release-notes/12122-archiving in sequence.md @@ -0,0 +1,3 @@ +This release introduces an additional setting related to archival bag creation, ArchiveOnlyIfEarlierVersionsAreArchived (default false). +If it is true, dataset versions must be archived in order. That is, all prior versions of a dataset must be archived before the latest version can be archived. +This is intended to support use cases where deduplication of files between dataset versions will be done (e.g. by a third-party service running at the archival copy location) and is a step towards supporting the Oxford Common File Layout (OCFL) as an archival format. 
diff --git a/doc/release-notes/12122-archiving updates.md b/doc/release-notes/12122-archiving updates.md new file mode 100644 index 00000000000..2dd4eb6909f --- /dev/null +++ b/doc/release-notes/12122-archiving updates.md @@ -0,0 +1,8 @@ +## Notifications + +This release includes multiple updates to the process of creating archival bags including +- performance/scaling improvements for large datasets (multiple changes) +- bug fixes for when superusers see the "Submit" button to launch archiving from the dataset page version table +- new functionality to optionally suppress an archiving workflow when using the Update Current Version functionality and mark the current archive as out of date +- new functionality to support recreating an archival bag when Update Current Version has been used, which is available for archivers that can delete existing files +- \ No newline at end of file diff --git a/doc/release-notes/12063-ORE-and-Bag-updates.md b/doc/release-notes/12144-un-holey-bags.md similarity index 63% rename from doc/release-notes/12063-ORE-and-Bag-updates.md rename to doc/release-notes/12144-un-holey-bags.md index b2926f40c96..3cd895fd45c 100644 --- a/doc/release-notes/12063-ORE-and-Bag-updates.md +++ b/doc/release-notes/12144-un-holey-bags.md @@ -1,14 +1,21 @@ -This release contains multiple updates to the OAI-ORE metadata export and archival Bag output: - -OAI-ORE -- now uses URI for checksum algorithms -- a bug causing failures with deaccessioned versions when the deaccession note ("Deaccession Reason" in the UI) was null (which has been allowed via the API). 
-- the "https://schema.org/additionalType" is updated to "Dataverse OREMap Format v1.0.2" to indicate that the out has changed - -Archival Bag -- for dataset versions with no files, the (empty) manifest-.txt file created will now use the default algorithm defined by the "FileFixityChecksumAlgorithm" setting rather than always defaulting to "md5" -- a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed -- values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). -- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) -- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's arhival bag generation -- improvements to file retrieval w.r.t. retries on errors or throttling \ No newline at end of file +This release contains multiple updates to the OAI-ORE metadata export and archival Bag output: + +OAI-ORE +- now uses URI for checksum algorithms +- a bug causing failures with deaccessioned versions when the deaccession note ("Deaccession Reason" in the UI) was null (which has been allowed via the API). 
+- the "https://schema.org/additionalType" is updated to "Dataverse OREMap Format v1.0.2" to indicate that the output has changed + +Archival Bag +- for dataset versions with no files, the (empty) manifest-.txt file created will now use the default algorithm defined by the "FileFixityChecksumAlgorithm" setting rather than always defaulting to "md5" +- a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed +- values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). +- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) +- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's archival bag generation +- improvements to file retrieval w.r.t. retries on errors or throttling +- retrieval of files for inclusion in the bag is no longer counted as a download by Dataverse +- the size of data files and total dataset size that will be included in an archival bag can now be limited. Admins can choose whether files above these limits are transferred along with the zipped bag (creating a complete archival copy) or are just referenced (using the concept of a "holey" bag and just listing the oversized files and the Dataverse urls from which they can be retrieved). 
In the holey bag case, an active service on the archiving platform must retrieve the oversized files (using appropriate credentials as needed) to make a complete copy + +### New JVM Options (MicroProfile Config Settings) +dataverse.bagit.zip.holey +dataverse.bagit.zip.max-data-size +dataverse.bagit.zip.max-file-size \ No newline at end of file diff --git a/doc/sphinx-guides/source/admin/big-data-administration.rst b/doc/sphinx-guides/source/admin/big-data-administration.rst index b3c7e79c382..d1b3a6dc3b5 100644 --- a/doc/sphinx-guides/source/admin/big-data-administration.rst +++ b/doc/sphinx-guides/source/admin/big-data-administration.rst @@ -302,6 +302,7 @@ There are a broad range of options (that are not turned on by default) for impro - :ref:`:DisableSolrFacetsWithoutJsession` - disables facets for users who have disabled cookies (e.g. for bots) - :ref:`:DisableUncheckedTypesFacet` - only disables the facet showing the number of collections, datasets, files matching the query (this facet is potentially less useful than others) - :ref:`:StoreIngestedTabularFilesWithVarHeaders` - by default, Dataverse stores ingested files without headers and dynamically adds them back at download time. 
Once this setting is enabled, Dataverse will leave the headers in place (for newly ingested files), reducing the cost of downloads +- :ref:`dataverse.bagit.zip.max-file-size`, :ref:`dataverse.bagit.zip.max-data-size`, and :ref:`dataverse.bagit.zip.holey` - options to control the size and temporary storage requirements when generating archival Bags - see :ref:`BagIt Export` Scaling Infrastructure diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 10314aff195..ed9f1ff2fcd 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2249,10 +2249,25 @@ These archival Bags include all of the files and metadata in a given dataset ver The Dataverse Software offers an internal archive workflow which may be configured as a PostPublication workflow via an admin API call to manually submit previously published Datasets and prior versions to a configured archive such as Chronopolis. The workflow creates a `JSON-LD `_ serialized `OAI-ORE `_ map file, which is also available as a metadata export format in the Dataverse Software web interface. +The size of the zipped archival Bag can be limited, and files that don't fit within that limit can either be transferred separately (placed so that they are correctly positioned according to the BagIt specification when the zipped bag is unzipped in place) or just referenced for later download (using the BagIt concept of a 'holey' bag with a list of files in a ``fetch.txt`` file). This behavior can now be configured for all archivers. These settings allow for managing large datasets by excluding files over a certain size or total data size, which can be useful for archivers with size limitations or to reduce transfer times. See the :ref:`dataverse.bagit.zip.max-file-size`, :ref:`dataverse.bagit.zip.max-data-size`, and :ref:`dataverse.bagit.zip.holey` JVM options for more details. 
+ At present, archiving classes include the DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, GoogleCloudSubmitToArchive, and S3SubmitToArchiveCommand , which all extend the AbstractSubmitToArchiveCommand and use the configurable mechanisms discussed below. (A DRSSubmitToArchiveCommand, which works with Harvard's DRS also exists and, while specific to DRS, is a useful example of how Archivers can support single-version-only semantics and support archiving only from specified collections (with collection specific parameters)). All current options support the :ref:`Archival Status API` calls and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers). +Archival Bags are created per dataset version. By default, if a version is republished (via the superuser-only 'Update Current Version' publication option in the UI/API), a new archival bag is not created for the version. +If the archiver used is capable of deleting existing bags (Google, S3, and File Archivers) superusers can trigger a manual update of the archival bag, and, if the :ref:`dataverse.feature.archive-on-version-update` flag is set to true, this will be done automatically when 'Update Current Version' is used. + +Two settings that can be used with all current Archivers are: + +- \:BagGeneratorThreads - the number of threads to use when adding data files to the zipped bag. The default is 2. Values of 4 or more may increase performance on larger machines but may cause problems if file access is throttled +- \:ArchiveOnlyIfEarlierVersionsAreArchived - when true, requires dataset versions to be archived in order by confirming that all prior versions have been successfully archived before allowing a new version to be archived. Default is false + +These must be included in the \:ArchiverSettings for the Archiver to work + +Archival Bags are created per dataset version. 
By default, if a version is republished (via the superuser-only 'Update Current Version' publication option in the UI/API), a new archival bag is not created for the version. +If the archiver used is capable of deleting existing bags (Google, S3, and File Archivers) superusers can trigger a manual update of the archival bag, and, if the :ref:`dataverse.bagit.archive-on-version-update` flag is set to true, this will be done automatically when 'Update Current Version' is used. + .. _Duracloud Configuration: Duracloud Configuration @@ -3052,6 +3067,13 @@ Once you have a password from your provider, you should create a password alias You should delete the old JVM option and the wrapped password alias, then recreate as described above. +.. _dataverse.feature.archive-on-version-update: + +dataverse.feature.archive-on-version-update ++++++++++++++++++++++++++++++++++++++++++++ + +Indicates whether archival bag creation should be triggered (if configured) when a version is updated and was already successfully archived, +i.e via the Update-Current-Version publication option. Setting the flag true only works if the archiver being used supports deleting existing archival bags. .. _dataverse.pid.handlenet.key.path: @@ -3705,12 +3727,20 @@ The email for your institution that you'd like to appear in bag-info.txt. See :r Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_BAGIT_SOURCEORG_EMAIL``. +.. _dataverse.bagit.archive-on-version-update: + +dataverse.bagit.archive-on-version-update ++++++++++++++++++++++++++++++++++++++++++ + +Indicates whether archival bag creation should be triggered (if configured) when a version is updated and was already successfully archived, +i.e via the Update-Current-Version publication option. Setting the flag true only works if the archiver being used supports deleting existing archival bags. + .. 
_dataverse.files.globus-monitoring-server: dataverse.files.globus-monitoring-server ++++++++++++++++++++++++++++++++++++++++ -This setting is required in conjunction with the ``globus-use-experimental-async-framework`` feature flag (see :ref:`feature-flags`). Setting it to true designates the Dataverse instance to serve as the dedicated polling server. It is needed so that the new framework can be used in a multi-node installation. +This setting is required in conjunction with the ``globus-use-experimental-async-framework`` feature flag (see :ref:`feature-flags`). Setting it to true designates the Dataverse instance to serve as the dedicated polling server. It is needed so that the new framework can be used in a multi-node installation. .. _dataverse.csl.common-styles: @@ -3858,6 +3888,21 @@ This can instead be restricted to only superusers who can publish the dataset us Example: ``dataverse.coar-notify.relationship-announcement.notify-superusers-only=true`` +.. _dataverse.bagit.zip.holey: + +``dataverse.bagit.zip.holey`` + A boolean that, if true, will cause the BagIt archiver to create a "holey" bag. In a holey bag, files that are not included in the bag are listed in the ``fetch.txt`` file with a URL from which they can be downloaded. This is used in conjunction with ``dataverse.bagit.zip.max-file-size`` and/or ``dataverse.bagit.zip.max-data-size``. Default: false. + +.. _dataverse.bagit.zip.max-data-size: + +``dataverse.bagit.zip.max-data-size`` + The maximum total (uncompressed) size of data files (in bytes) to include in a BagIt zip archive. If the total size of the dataset files exceeds this limit, files will be excluded from the zipped bag (starting from the largest) until the total size is under the limit. Excluded files will be handled as defined by ``dataverse.bagit.zip.holey`` - just listed if that setting is true or being transferred separately and placed next to the zipped bag. When not set, there is no limit. + +.. 
_dataverse.bagit.zip.max-file-size: + +``dataverse.bagit.zip.max-file-size`` + The maximum (uncompressed) size of a single file (in bytes) to include in a BagIt zip archive. Any file larger than this will be excluded. Excluded files will be handled as defined by ``dataverse.bagit.zip.holey`` - just listed if that setting is true or being transferred separately and placed next to the zipped bag. When not set, there is no limit. + .. _feature-flags: Feature Flags @@ -3897,28 +3942,28 @@ please find all known feature flags below. Any of these flags can be activated u - Allows the use of an OAuth user account (GitHub, Google, or ORCID) when an identity match is found during API bearer authentication. This feature enables automatic association of an incoming IdP identity with an existing OAuth user account, bypassing the need for additional user registration steps. This feature only works when the feature flag ``api-bearer-auth`` is also enabled. **Caution: Enabling this flag could result in impersonation risks if (and only if) used with a misconfigured IdP.** - ``Off`` * - avoid-expensive-solr-join - - Changes the way Solr queries are constructed for public content (published Collections, Datasets and Files). It removes a very expensive Solr join on all such documents, improving overall performance, especially for large instances under heavy load. Before this feature flag is enabled, the corresponding indexing feature (see next feature flag) must be turned on and a full reindex performed (otherwise public objects are not going to be shown in search results). See :doc:`/admin/solr-search-index`. + - Changes the way Solr queries are constructed for public content (published Collections, Datasets and Files). It removes a very expensive Solr join on all such documents, improving overall performance, especially for large instances under heavy load. 
Before this feature flag is enabled, the corresponding indexing feature (see next feature flag) must be turned on and a full reindex performed (otherwise public objects are not going to be shown in search results). See :doc:`/admin/solr-search-index`. - ``Off`` * - add-publicobject-solr-field - - Adds an extra boolean field `PublicObject_b:true` for public content (published Collections, Datasets and Files). Once reindexed with these fields, we can rely on it to remove a very expensive Solr join on all such documents in Solr queries, significantly improving overall performance (by enabling the feature flag above, `avoid-expensive-solr-join`). These two flags are separate so that an instance can reindex their holdings before enabling the optimization in searches, thus avoiding having their public objects temporarily disappear from search results while the reindexing is in progress. + - Adds an extra boolean field `PublicObject_b:true` for public content (published Collections, Datasets and Files). Once reindexed with these fields, we can rely on it to remove a very expensive Solr join on all such documents in Solr queries, significantly improving overall performance (by enabling the feature flag above, `avoid-expensive-solr-join`). These two flags are separate so that an instance can reindex their holdings before enabling the optimization in searches, thus avoiding having their public objects temporarily disappear from search results while the reindexing is in progress. - ``Off`` * - reduce-solr-deletes - - Avoids deleting and recreating solr documents for dataset files when reindexing. + - Avoids deleting and recreating solr documents for dataset files when reindexing. - ``Off`` * - disable-return-to-author-reason - - Removes the reason field in the `Publish/Return To Author` dialog that was added as a required field in v6.2 and makes the reason an optional parameter in the :ref:`return-a-dataset` API call. 
+ - Removes the reason field in the `Publish/Return To Author` dialog that was added as a required field in v6.2 and makes the reason an optional parameter in the :ref:`return-a-dataset` API call. - ``Off`` * - disable-dataset-thumbnail-autoselect - Turns off automatic selection of a dataset thumbnail from image files in that dataset. When set to ``On``, a user can still manually pick a thumbnail image or upload a dedicated thumbnail image. - ``Off`` * - globus-use-experimental-async-framework - - Activates a new experimental implementation of Globus polling of ongoing remote data transfers that does not rely on the instance staying up continuously for the duration of the transfers and saves the state information about Globus upload requests in the database. Added in v6.4; extended in v6.6 to cover download transfers, in addition to uploads. Affects :ref:`:GlobusPollingInterval`. Note that the JVM option :ref:`dataverse.files.globus-monitoring-server` described above must also be enabled on one (and only one, in a multi-node installation) Dataverse instance. + - Activates a new experimental implementation of Globus polling of ongoing remote data transfers that does not rely on the instance staying up continuously for the duration of the transfers and saves the state information about Globus upload requests in the database. Added in v6.4; extended in v6.6 to cover download transfers, in addition to uploads. Affects :ref:`:GlobusPollingInterval`. Note that the JVM option :ref:`dataverse.files.globus-monitoring-server` described above must also be enabled on one (and only one, in a multi-node installation) Dataverse instance. - ``Off`` * - index-harvested-metadata-source - Index the nickname or the source name (See the optional ``sourceName`` field in :ref:`create-a-harvesting-client`) of the harvesting client as the "metadata source" of harvested datasets and files. 
If enabled, the Metadata Source facet will show separate groupings of the content harvested from different sources (by harvesting client nickname or source name) instead of the default behavior where there is one "Harvested" grouping for all harvested content. - ``Off`` * - enable-version-note - - Turns on the ability to add/view/edit/delete per-dataset-version notes intended to provide :ref:`provenance` information about why the dataset/version was created. + - Turns on the ability to add/view/edit/delete per-dataset-version notes intended to provide :ref:`provenance` information about why the dataset/version was created. - ``Off`` * - shibboleth-use-wayfinder - This flag allows an instance to use Shibboleth with InCommon federation services. Our original Shibboleth implementation that relies on DiscoFeed can no longer be used since InCommon discontinued their old-style metadata feed. An alternative mechanism had to be implemented in order to use WayFinder service, their recommended replacements, instead. @@ -3944,6 +3989,153 @@ please find all known feature flags below. Any of these flags can be activated u To check the status of feature flags via API, see :ref:`list-all-feature-flags` in the API Guide. +.. _dataverse.feature.api-session-auth: + +dataverse.feature.api-session-auth +++++++++++++++++++++++++++++++++++ + +Enables API authentication via session cookie (JSESSIONID). **Caution: Enabling this feature flag exposes the installation to CSRF risks!** We expect this feature flag to be temporary (only used by frontend developers, see `#9063 `_) and for the feature to be removed in the future. + +.. _dataverse.feature.api-bearer-auth: + +dataverse.feature.api-bearer-auth ++++++++++++++++++++++++++++++++++ + +Enables API authentication via Bearer Token. + +.. 
_dataverse.feature.api-bearer-auth-provide-missing-claims: + +dataverse.feature.api-bearer-auth-provide-missing-claims +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Enables sending missing user claims in the request JSON provided during OIDC user registration, when these claims are not returned by the identity provider and are required for registration. This feature only works when the feature flag ``api-bearer-auth`` is also enabled. **Caution: Enabling this feature flag exposes the installation to potential user impersonation issues.** + +.. _dataverse.feature.api-bearer-auth-handle-tos-acceptance-in-idp: + +dataverse.feature.api-bearer-auth-handle-tos-acceptance-in-idp +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Specifies that Terms of Service acceptance is handled by the IdP, eliminating the need to include ToS acceptance boolean parameter (termsAccepted) in the OIDC user registration request body. This feature only works when the feature flag ``api-bearer-auth`` is also enabled. + +.. _dataverse.feature.api-bearer-auth-use-builtin-user-on-id-match: + +dataverse.feature.api-bearer-auth-use-builtin-user-on-id-match +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Allows the use of a built-in user account when an identity match is found during API bearer authentication. This feature enables automatic association of an incoming IdP identity with an existing built-in user account, bypassing the need for additional user registration steps. This feature only works when the feature flag ``api-bearer-auth`` is also enabled. **Caution: Enabling this flag could result in impersonation risks if (and only if) used with a misconfigured IdP.** + +.. 
_dataverse.feature.api-bearer-auth-use-shib-user-on-id-match: + +dataverse.feature.api-bearer-auth-use-shib-user-on-id-match ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Allows the use of a Shibboleth user account when an identity match is found during API bearer authentication. This feature enables automatic association of an incoming IdP identity with an existing Shibboleth user account, bypassing the need for additional user registration steps. This feature only works when the feature flag ``api-bearer-auth`` is also enabled. **Caution: Enabling this flag could result in impersonation risks if (and only if) used with a misconfigured IdP.** + +.. _dataverse.feature.api-bearer-auth-use-oauth-user-on-id-match: + +dataverse.feature.api-bearer-auth-use-oauth-user-on-id-match +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Allows the use of an OAuth user account (GitHub, Google, or ORCID) when an identity match is found during API bearer authentication. This feature enables automatic association of an incoming IdP identity with an existing OAuth user account, bypassing the need for additional user registration steps. This feature only works when the feature flag ``api-bearer-auth`` is also enabled. **Caution: Enabling this flag could result in impersonation risks if (and only if) used with a misconfigured IdP.** + +.. _dataverse.feature.avoid-expensive-solr-join: + +dataverse.feature.avoid-expensive-solr-join ++++++++++++++++++++++++++++++++++++++++++++ + +Changes the way Solr queries are constructed for public content (published Collections, Datasets and Files). It removes a very expensive Solr join on all such documents, improving overall performance, especially for large instances under heavy load. Before this feature flag is enabled, the corresponding indexing feature (see next feature flag) must be turned on and a full reindex performed (otherwise public objects are not going to be shown in search results). 
See :doc:`/admin/solr-search-index`. + +.. _dataverse.feature.add-publicobject-solr-field: + +dataverse.feature.add-publicobject-solr-field ++++++++++++++++++++++++++++++++++++++++++++++ + +Adds an extra boolean field `PublicObject_b:true` for public content (published Collections, Datasets and Files). Once reindexed with these fields, we can rely on it to remove a very expensive Solr join on all such documents in Solr queries, significantly improving overall performance (by enabling the feature flag above, `avoid-expensive-solr-join`). These two flags are separate so that an instance can reindex their holdings before enabling the optimization in searches, thus avoiding having their public objects temporarily disappear from search results while the reindexing is in progress. + +.. _dataverse.feature.reduce-solr-deletes: + +dataverse.feature.reduce-solr-deletes ++++++++++++++++++++++++++++++++++++++ + +Avoids deleting and recreating solr documents for dataset files when reindexing. + +.. _dataverse.feature.disable-return-to-author-reason: + +dataverse.feature.disable-return-to-author-reason ++++++++++++++++++++++++++++++++++++++++++++++++++ + +Removes the reason field in the `Publish/Return To Author` dialog that was added as a required field in v6.2 and makes the reason an optional parameter in the :ref:`return-a-dataset` API call. + +.. _dataverse.feature.disable-dataset-thumbnail-autoselect: + +dataverse.feature.disable-dataset-thumbnail-autoselect +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Turns off automatic selection of a dataset thumbnail from image files in that dataset. When set to ``On``, a user can still manually pick a thumbnail image or upload a dedicated thumbnail image. + +.. 
_dataverse.feature.globus-use-experimental-async-framework: + +dataverse.feature.globus-use-experimental-async-framework ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Activates a new experimental implementation of Globus polling of ongoing remote data transfers that does not rely on the instance staying up continuously for the duration of the transfers and saves the state information about Globus upload requests in the database. Added in v6.4; extended in v6.6 to cover download transfers, in addition to uploads. Affects :ref:`:GlobusPollingInterval`. Note that the JVM option :ref:`dataverse.files.globus-monitoring-server` described above must also be enabled on one (and only one, in a multi-node installation) Dataverse instance. + +.. _dataverse.feature.index-harvested-metadata-source: + +dataverse.feature.index-harvested-metadata-source ++++++++++++++++++++++++++++++++++++++++++++++++++ + +Index the nickname or the source name (See the optional ``sourceName`` field in :ref:`create-a-harvesting-client`) of the harvesting client as the "metadata source" of harvested datasets and files. If enabled, the Metadata Source facet will show separate groupings of the content harvested from different sources (by harvesting client nickname or source name) instead of the default behavior where there is one "Harvested" grouping for all harvested content. + +.. _dataverse.feature.enable-version-note: + +dataverse.feature.enable-version-note ++++++++++++++++++++++++++++++++++++++ + +Turns on the ability to add/view/edit/delete per-dataset-version notes intended to provide :ref:`provenance` information about why the dataset/version was created. + +.. _dataverse.feature.shibboleth-use-wayfinder: + +dataverse.feature.shibboleth-use-wayfinder +++++++++++++++++++++++++++++++++++++++++++ + +This flag allows an instance to use Shibboleth with InCommon federation services. 
Our original Shibboleth implementation that relies on DiscoFeed can no longer be used since InCommon discontinued their old-style metadata feed. An alternative mechanism had to be implemented in order to use WayFinder service, their recommended replacements, instead. + +.. _dataverse.feature.shibboleth-use-localhost: + +dataverse.feature.shibboleth-use-localhost +++++++++++++++++++++++++++++++++++++++++++ + +A Shibboleth-using Dataverse instance needs to make network calls to the locally-running ``shibd`` service. The default behavior is to use the address configured via the ``siteUrl`` setting. There are however situations (firewalls, etc.) where localhost would be preferable. + +.. _dataverse.feature.add-local-contexts-permission-check: + +dataverse.feature.add-local-contexts-permission-check ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Adds a permission check to ensure that the user calling the /api/localcontexts/datasets/{id} API can edit the dataset with that id. This is currently the only use case - see https://github.com/gdcc/dataverse-external-vocab-support/tree/main/packages/local_contexts. The flag adds additional security to stop other uses, but would currently have to be used in conjunction with the api-session-auth feature flag (the security implications of which have not been fully investigated) to still allow adding Local Contexts metadata to a dataset. + +.. _dataverse.feature.enable-pid-failure-log: + +dataverse.feature.enable-pid-failure-log +++++++++++++++++++++++++++++++++++++++++ + +Turns on creation of a monthly log file (logs/PIDFailures_.log) showing failed requests for dataset/file PIDs. Can be used directly or with scripts at https://github.com/gdcc/dataverse-recipes/python/pid_reports to alert admins. + +.. 
_dataverse.feature.role-assignment-history: + +dataverse.feature.role-assignment-history ++++++++++++++++++++++++++++++++++++++++++ + +Turns on tracking/display of role assignments and revocations for collections, datasets, and files + +.. _dataverse.feature.only-update-datacite-when-needed: + +dataverse.feature.only-update-datacite-when-needed +++++++++++++++++++++++++++++++++++++++++++++++++++ + +Only contact DataCite to update a DOI after checking to see if DataCite has outdated information (for efficiency, lighter load on DataCite, especially when using file DOIs). + .. _:ApplicationServerSettings: Application Server Settings @@ -5243,6 +5435,11 @@ This setting specifies which storage system to use by identifying the particular For examples, see the specific configuration above in :ref:`BagIt Export`. +:ArchiveOnlyIfEarlierVersionsAreArchived +++++++++++++++++++++++++++++++++++++++++ + +This setting, if true, only allows creation of an archival Bag for a dataset version if all prior versions have been successfully archived. 
The default is false (any version can be archived independently as long as other settings allow it) + :ArchiverSettings +++++++++++++++++ diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 20617160a1c..34cf0e900a3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -42,6 +42,7 @@ import edu.harvard.iq.dataverse.engine.command.impl.UpdateDatasetVersionCommand; import edu.harvard.iq.dataverse.export.ExportService; import edu.harvard.iq.dataverse.util.cache.CacheFactoryBean; +import edu.harvard.iq.dataverse.util.json.JsonUtil; import io.gdcc.spi.export.ExportException; import io.gdcc.spi.export.Exporter; import edu.harvard.iq.dataverse.ingest.IngestRequest; @@ -105,6 +106,8 @@ import jakarta.faces.view.ViewScoped; import jakarta.inject.Inject; import jakarta.inject.Named; +import jakarta.json.Json; +import jakarta.json.JsonObjectBuilder; import jakarta.persistence.OptimisticLockException; import org.apache.commons.lang3.StringUtils; @@ -160,6 +163,7 @@ import edu.harvard.iq.dataverse.search.SearchFields; import edu.harvard.iq.dataverse.search.SearchUtil; import edu.harvard.iq.dataverse.search.SolrClientService; +import edu.harvard.iq.dataverse.settings.FeatureFlags; import edu.harvard.iq.dataverse.settings.JvmSettings; import edu.harvard.iq.dataverse.util.SignpostingResources; import edu.harvard.iq.dataverse.util.FileMetadataUtil; @@ -387,7 +391,9 @@ public void setSelectedHostDataverse(Dataverse selectedHostDataverse) { private boolean showIngestSuccess; private Boolean archivable = null; - private Boolean versionArchivable = null; + private Boolean checkForArchivalCopy; + private Boolean supportsDelete; + private HashMap versionArchivable = new HashMap<>(); private Boolean someVersionArchived = null; public boolean isShowIngestSuccess() { @@ -2992,27 +2998,38 @@ public String updateCurrentVersion() { 
String className = settingsService.get(SettingsServiceBean.Key.ArchiverClassName.toString()); AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, dvRequestService.getDataverseRequest(), updateVersion); if (archiveCommand != null) { - // Delete the record of any existing copy since it is now out of date/incorrect - updateVersion.setArchivalCopyLocation(null); - /* - * Then try to generate and submit an archival copy. Note that running this - * command within the CuratePublishedDatasetVersionCommand was causing an error: - * "The attribute [id] of class - * [edu.harvard.iq.dataverse.DatasetFieldCompoundValue] is mapped to a primary - * key column in the database. Updates are not allowed." To avoid that, and to - * simplify reporting back to the GUI whether this optional step succeeded, I've - * pulled this out as a separate submit(). - */ - try { - updateVersion = commandEngine.submit(archiveCommand); - if (!updateVersion.getArchivalCopyLocationStatus().equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)) { - successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.success"); - } else { - errorMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure"); + //There is an archiver configured, so now decide what to do: + // If a successful copy exists, don't automatically update, just note the old copy is obsolete (and enable the superadmin button in the display to allow a ~manual update if desired) + // If pending or an obsolete copy exists, do nothing (nominally if a pending run succeeds and we're updating the current version here, it should be marked as obsolete - ignoring for now since updates within the time an archiving run is pending should be rare
If a failure is due to an existing copy in the repo, we'll fail again + String status = updateVersion.getArchivalCopyLocationStatus(); + if((status==null) || status.equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE) || (JvmSettings.BAGIT_ARCHIVE_ON_VERSION_UPDATE.lookupOptional(Boolean.class).orElse(false) && archiveCommand.canDelete())){ + // Delete the record of any existing copy since it is now out of date/incorrect + JsonObjectBuilder job = Json.createObjectBuilder(); + job.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_PENDING); + updateVersion.setArchivalCopyLocation(JsonUtil.prettyPrint(job.build())); + //Persist to db now + datasetVersionService.persistArchivalCopyLocation(updateVersion); + /* + * Then try to generate and submit an archival copy. Note that running this + * command within the CuratePublishedDatasetVersionCommand was causing an error: + * "The attribute [id] of class + * [edu.harvard.iq.dataverse.DatasetFieldCompoundValue] is mapped to a primary + * key column in the database. Updates are not allowed." To avoid that, and to + * simplify reporting back to the GUI whether this optional step succeeded, I've + * pulled this out as a separate submit().
+ */ + try { + commandEngine.submitAsync(archiveCommand); + JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("datasetversion.archive.inprogress")); + } catch (CommandException ex) { + errorMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure") + " - " + ex.toString(); + logger.severe(ex.getMessage()); } - } catch (CommandException ex) { - errorMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure") + " - " + ex.toString(); - logger.severe(ex.getMessage()); + } else if(status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)) { + //Not automatically replacing the old archival copy as creating it is expensive + updateVersion.setArchivalStatusOnly(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE); + datasetVersionService.persistArchivalCopyLocation(updateVersion); } } } @@ -3062,7 +3079,7 @@ public String refresh() { } //dataset = datasetService.find(dataset.getId()); dataset = null; - workingVersion = null; + workingVersion = null; logger.fine("refreshing working version"); @@ -5554,7 +5571,7 @@ public boolean isRequestAccessPopupRequired() { return FileUtil.isRequestAccessPopupRequired(workingVersion); } - public boolean isGuestbookAndTermsPopupRequired() { + public boolean isGuestbookAndTermsPopupRequired() { return FileUtil.isGuestbookAndTermsPopupRequired(workingVersion); } @@ -6087,33 +6104,33 @@ public void refreshPaginator() { /** * This method can be called from *.xhtml files to allow archiving of a dataset - * version from the user interface. It is not currently (11/18) used in the IQSS/develop - * branch, but is used by QDR and is kept here in anticipation of including a - * GUI option to archive (already published) versions after other dataset page - * changes have been completed. + * version from the user interface. * * @param id - the id of the datasetversion to archive. 
*/ - public void archiveVersion(Long id) { + public void archiveVersion(Long id, boolean force) { if (session.getUser() instanceof AuthenticatedUser) { DatasetVersion dv = datasetVersionService.retrieveDatasetVersionByVersionId(id).getDatasetVersion(); String className = settingsWrapper.getValueForKey(SettingsServiceBean.Key.ArchiverClassName, null); AbstractSubmitToArchiveCommand cmd = ArchiverUtil.createSubmitToArchiveCommand(className, dvRequestService.getDataverseRequest(), dv); if (cmd != null) { try { - DatasetVersion version = commandEngine.submit(cmd); - if (!version.getArchivalCopyLocationStatus().equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)) { + String status = dv.getArchivalCopyLocationStatus(); + if (status == null || (force && cmd.canDelete())) { + + // Set initial pending status + JsonObjectBuilder job = Json.createObjectBuilder(); + job.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_PENDING); + dv.setArchivalCopyLocation(JsonUtil.prettyPrint(job.build())); + //Persist now + datasetVersionService.persistArchivalCopyLocation(dv); + commandEngine.submitAsync(cmd); + logger.info( - "DatasetVersion id=" + version.getId() + " submitted to Archive, status: " + dv.getArchivalCopyLocationStatus()); - } else { - logger.severe("Error submitting version " + version.getId() + " due to conflict/error at Archive"); - } - if (version.getArchivalCopyLocation() != null) { + "DatasetVersion id=" + dv.getId() + " submitted to Archive, status: " + dv.getArchivalCopyLocationStatus()); setVersionTabList(resetVersionTabList()); this.setVersionTabListForPostLoad(getVersionTabList()); - JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("datasetversion.archive.success")); - } else { - JsfHelper.addErrorMessage(BundleUtil.getStringFromBundle("datasetversion.archive.failure")); + JsfHelper.addSuccessMessage(BundleUtil.getStringFromBundle("datasetversion.archive.inprogress")); } } catch (CommandException ex) { logger.log(Level.SEVERE, "Unexpected 
Exception calling submit archive command", ex); @@ -6147,41 +6164,85 @@ public boolean isArchivable() { return archivable; } - public boolean isVersionArchivable() { - if (versionArchivable == null) { + /** Method to decide if a 'Submit' button should be enabled for archiving a dataset version. */ + public boolean isVersionArchivable(Long id) { + Boolean thisVersionArchivable = versionArchivable.get(id); + if (thisVersionArchivable == null) { // If this dataset isn't in an archivable collection return false - versionArchivable = false; + thisVersionArchivable = false; + boolean requiresEarlierVersionsToBeArchived = settingsWrapper.isTrueForKey(SettingsServiceBean.Key.ArchiveOnlyIfEarlierVersionsAreArchived, false); if (isArchivable()) { - boolean checkForArchivalCopy = false; - // Otherwise, we need to know if the archiver is single-version-only - // If it is, we have to check for an existing archived version to answer the - // question + String className = settingsWrapper.getValueForKey(SettingsServiceBean.Key.ArchiverClassName, null); if (className != null) { try { - Class clazz = Class.forName(className); - Method m = clazz.getMethod("isSingleVersion", SettingsWrapper.class); - Object[] params = { settingsWrapper }; - checkForArchivalCopy = (Boolean) m.invoke(null, params); + DatasetVersion targetVersion = dataset.getVersions().stream() + .filter(v -> v.getId().equals(id)).findFirst().orElse(null); + if (requiresEarlierVersionsToBeArchived) {// Find the specific version by id + // Check all prior versions to ensure they are successfully archived + boolean allPriorVersionsArchived = true; + boolean foundTarget = false; + List versions = dataset.getVersions(); + + for (DatasetVersion versionInLoop : versions) { + // Once we find the target version, start checking subsequent versions (which are prior versions) + if (foundTarget) { + // Check if this prior version has been successfully archived + String archivalStatus = versionInLoop.getArchivalCopyLocationStatus(); 
+ if (archivalStatus == null || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)) { + allPriorVersionsArchived = false; + break; + } + } + if (versionInLoop.equals(targetVersion)) { + foundTarget = true; + } + } + if (allPriorVersionsArchived) { + thisVersionArchivable = true; + // This check has been passed, so we go on to check other conditions + } else { + // Store the false value and skip further checks + versionArchivable.put(id, thisVersionArchivable); + return thisVersionArchivable; + } + } + // Otherwise, we need to know if the archiver is single-version-only + // If it is, we have to check for an existing archived version to answer the + // question + if (checkForArchivalCopy == null) { + //Only check once + Class clazz = Class.forName(className); + Method m = clazz.getMethod("isSingleVersion", SettingsWrapper.class); + Method m2 = clazz.getMethod("supportsDelete"); + Object[] params = { settingsWrapper }; + checkForArchivalCopy = (Boolean) m.invoke(null, params); + supportsDelete = (Boolean) m2.invoke(null); + } if (checkForArchivalCopy) { // If we have to check (single version archiving), we can't allow archiving if // one version is already archived (or attempted - any non-null status) - versionArchivable = !isSomeVersionArchived(); + thisVersionArchivable = !isSomeVersionArchived(); } else { - // If we allow multiple versions or didn't find one that has had archiving run - // on it, we can archive, so return true - versionArchivable = true; + // If we didn't find one that has had archiving run + // on it, or archiving per version is supported and either + // the status is null or the archiver can delete prior runs and status isn't success, + // we can archive, so return true + // Find the specific version by id + String status = targetVersion.getArchivalCopyLocationStatus(); + thisVersionArchivable = (status == null) || ((!status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) && (!status.equals(DatasetVersion.ARCHIVAL_STATUS_PENDING)) && 
supportsDelete)); } } catch (ClassNotFoundException | IllegalAccessException | IllegalArgumentException | InvocationTargetException | NoSuchMethodException | SecurityException e) { - logger.warning("Failed to call isSingleVersion on configured archiver class: " + className); + logger.warning("Failed to call methods on configured archiver class: " + className); e.printStackTrace(); } } } + versionArchivable.put(id, thisVersionArchivable); } - return versionArchivable; + return thisVersionArchivable; } public boolean isSomeVersionArchived() { diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java index a58dad4f4c7..fc07adc2e9e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java @@ -30,6 +30,7 @@ import edu.harvard.iq.dataverse.workflows.WorkflowComment; import java.io.*; +import java.sql.Timestamp; import java.text.SimpleDateFormat; import java.util.*; import java.util.logging.FileHandler; @@ -1140,4 +1141,45 @@ public void saveStorageQuota(Dataset target, Long allocation) { } em.flush(); } + + @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) + public void setLastExportTimeInNewTransaction(Long datasetId, Date lastExportTime) { + try { + Dataset currentDataset = find(datasetId); + if (currentDataset != null) { + currentDataset.setLastExportTime(lastExportTime); + merge(currentDataset); + } else { + logger.log(Level.SEVERE, "Could not find Dataset with id={0} to retry persisting archival copy location after OptimisticLockException.", datasetId); + } + } catch (Exception e) { + logger.log(Level.SEVERE, "Failed to retry export after OptimisticLockException for dataset id=" + datasetId, e); + } + } + + @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) + public void updateIndexingAndExportTimes(Dataset dataset) { + Query timestampQuery = em.createNativeQuery( + "SELECT 
dvo.indextime, dvo.permissionindextime, d.lastexporttime " + + "FROM dvobject dvo, dataset d WHERE dvo.id = d.id AND dvo.id = ?"); + timestampQuery.setParameter(1, dataset.getId()); + + Object[] timestamps = (Object[]) timestampQuery.getSingleResult(); + + // Cast and apply the fresh timestamps to the current dataset + Timestamp freshIndexTime = (Timestamp) timestamps[0]; + Timestamp freshPermissionIndexTime = (Timestamp) timestamps[1]; + Timestamp freshLastExportTime = (Timestamp) timestamps[2]; + + + logger.fine("Updating index time from " + dataset.getIndexTime() + " to " + freshIndexTime); + dataset.setIndexTime(freshIndexTime); + + logger.fine("Updating permission index time from " + dataset.getPermissionIndexTime() + " to " + freshPermissionIndexTime); + dataset.setPermissionIndexTime(freshPermissionIndexTime); + + logger.fine("Updating last export time from " + dataset.getLastExportTime() + " to " + freshLastExportTime); + dataset.setLastExportTime(freshLastExportTime); + + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 93b0ccfef61..92bab58e8d6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -132,6 +132,7 @@ public enum VersionState { public static final String ARCHIVAL_STATUS_PENDING = "pending"; public static final String ARCHIVAL_STATUS_SUCCESS = "success"; public static final String ARCHIVAL_STATUS_FAILURE = "failure"; + public static final String ARCHIVAL_STATUS_OBSOLETE = "obsolete"; @Id @GeneratedValue(strategy = GenerationType.IDENTITY) @@ -231,8 +232,9 @@ public enum VersionState { @Transient private DatasetVersionDifference dvd; + //The Json version of the archivalCopyLocation string @Transient - private JsonObject archivalStatus; + private JsonObject archivalCopyLocationJson; public Long getId() { return this.id; @@ -383,25 +385,25 @@ public String 
getArchivalCopyLocation() { public String getArchivalCopyLocationStatus() { populateArchivalStatus(false); - if(archivalStatus!=null) { - return archivalStatus.getString(ARCHIVAL_STATUS); + if(archivalCopyLocationJson!=null) { + return archivalCopyLocationJson.getString(ARCHIVAL_STATUS); } return null; } public String getArchivalCopyLocationMessage() { populateArchivalStatus(false); - if(archivalStatus!=null) { - return archivalStatus.getString(ARCHIVAL_STATUS_MESSAGE); + if(archivalCopyLocationJson!=null && archivalCopyLocationJson.containsKey(ARCHIVAL_STATUS_MESSAGE)) { + return archivalCopyLocationJson.getString(ARCHIVAL_STATUS_MESSAGE); } return null; } private void populateArchivalStatus(boolean force) { - if(archivalStatus ==null || force) { + if(archivalCopyLocationJson ==null || force) { if(archivalCopyLocation!=null) { try { - archivalStatus = JsonUtil.getJsonObject(archivalCopyLocation); - } catch(Exception e) { + archivalCopyLocationJson = JsonUtil.getJsonObject(archivalCopyLocation); + } catch (Exception e) { logger.warning("DatasetVersion id: " + id + "has a non-JsonObject value, parsing error: " + e.getMessage()); logger.fine(archivalCopyLocation); } @@ -414,6 +416,15 @@ public void setArchivalCopyLocation(String location) { populateArchivalStatus(true); } + // Convenience method to just change the status without changing the location + public void setArchivalStatusOnly(String status) { + populateArchivalStatus(false); + JsonObjectBuilder job = Json.createObjectBuilder(archivalCopyLocationJson); + job.add(DatasetVersion.ARCHIVAL_STATUS, status); + archivalCopyLocationJson = job.build(); + archivalCopyLocation = JsonUtil.prettyPrint(archivalCopyLocationJson); + } + public String getDeaccessionLink() { return deaccessionLink; } diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java index 60df1fd3dfd..a5dd724104f 100644 --- 
a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java @@ -28,11 +28,14 @@ import jakarta.ejb.EJB; import jakarta.ejb.EJBException; import jakarta.ejb.Stateless; +import jakarta.ejb.TransactionAttribute; +import jakarta.ejb.TransactionAttributeType; import jakarta.inject.Named; import jakarta.json.Json; import jakarta.json.JsonObjectBuilder; import jakarta.persistence.EntityManager; import jakarta.persistence.NoResultException; +import jakarta.persistence.OptimisticLockException; import jakarta.persistence.PersistenceContext; import jakarta.persistence.Query; import jakarta.persistence.TypedQuery; @@ -1333,4 +1336,24 @@ public Long getDatasetVersionCount(Long datasetId, boolean canViewUnpublishedVer return em.createQuery(cq).getSingleResult(); } + + + /** + * Update the archival copy location for a specific version of a dataset. + * Archiving can be long-running and other parallel updates to the datasetversion have likely occurred + * so this method will just re-find the version rather than risking an + * OptimisticLockException and then having to retry in yet another transaction (since the OLE rolls this one back). + * + * @param dv + * The dataset version whose archival copy location we want to update. Must not be {@code null}. 
+ */ + @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) + public void persistArchivalCopyLocation(DatasetVersion dv) { + DatasetVersion currentVersion = find(dv.getId()); + if (currentVersion != null) { + currentVersion.setArchivalCopyLocation(dv.getArchivalCopyLocation()); + } else { + logger.log(Level.SEVERE, "Could not find DatasetVersion with id={0} to retry persisting archival copy location after OptimisticLockException.", dv.getId()); + } + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/DvObjectServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DvObjectServiceBean.java index 91c507774e9..607ffc6b958 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DvObjectServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DvObjectServiceBean.java @@ -222,6 +222,7 @@ public DvObject updateContentIndexTime(DvObject dvObject) { * @todo DRY! Perhaps we should merge this with the older * updateContentIndexTime method. */ + @TransactionAttribute(REQUIRES_NEW) public DvObject updatePermissionIndexTime(DvObject dvObject) { /** * @todo to avoid a possible OptimisticLockException, should we merge diff --git a/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java b/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java index 4d6d59cb013..4fa85a543d8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java +++ b/src/main/java/edu/harvard/iq/dataverse/EjbDataverseEngine.java @@ -31,6 +31,9 @@ import java.util.Map; import java.util.Set; + +import jakarta.ejb.AsyncResult; +import jakarta.ejb.Asynchronous; import jakarta.ejb.EJB; import jakarta.ejb.Stateless; import jakarta.inject.Named; @@ -45,6 +48,7 @@ import java.util.Arrays; import java.util.EnumSet; import java.util.Stack; +import java.util.concurrent.Future; import java.util.logging.Level; import java.util.logging.Logger; import jakarta.annotation.Resource; @@ -348,6 +352,27 @@ public R submit(Command aCommand) throws CommandException { logSvc.log(logRec); } 
} + + /** + * Submits a command for asynchronous execution. + * The command will be executed in a separate thread and won't block the caller. + * + * @param The return type of the command + * @param aCommand The command to execute + * @return A Future representing the pending result + * @throws CommandException if the command cannot be submitted + */ + @Asynchronous + public Future submitAsync(Command aCommand) throws CommandException { + try { + logger.log(Level.INFO, "Submitting async command: {0}", aCommand.getClass().getSimpleName()); + R result = submit(aCommand); + return new AsyncResult<>(result); + } catch (Exception e) { + logger.log(Level.SEVERE, "Async command execution failed: " + aCommand.getClass().getSimpleName(), e); + throw e; + } + } protected void completeCommand(Command command, Object r, Stack called) { diff --git a/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java b/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java index 4d408a72c8c..cc632054642 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse; import edu.harvard.iq.dataverse.authorization.Permission; +import edu.harvard.iq.dataverse.dataset.DatasetUtil; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import jakarta.ejb.EJB; import jakarta.ejb.Stateless; @@ -95,18 +96,7 @@ private FileMetadata getPreviousFileMetadata(FileMetadata fileMetadata, FileMeta //TODO: this could use some refactoring to cut down on the number of for loops! 
private FileMetadata getPreviousFileMetadata(FileMetadata fileMetadata, DatasetVersion currentversion) { List allfiles = allRelatedFiles(fileMetadata); - boolean foundCurrent = false; - DatasetVersion priorVersion = null; - for (DatasetVersion versionLoop : fileMetadata.getDatasetVersion().getDataset().getVersions()) { - if (foundCurrent) { - priorVersion = versionLoop; - break; - } - if (versionLoop.equals(currentversion)) { - foundCurrent = true; - } - - } + DatasetVersion priorVersion = DatasetUtil.getPriorVersion(fileMetadata.getDatasetVersion()); if (priorVersion != null && priorVersion.getFileMetadatasSorted() != null) { for (FileMetadata fmdTest : priorVersion.getFileMetadatasSorted()) { for (DataFile fileTest : allfiles) { diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java index 18f28569d7d..10aadde57b6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java @@ -2067,6 +2067,7 @@ public Response submitDatasetVersionToArchive(@Context ContainerRequestContext c if(dv==null) { return error(Status.BAD_REQUEST, "Requested version not found."); } + //ToDo - allow forcing with a non-success status for archivers that supportsDelete() if (dv.getArchivalCopyLocation() == null) { String className = settingsService.getValueForKey(SettingsServiceBean.Key.ArchiverClassName); // Note - the user is being sent via the createDataverseRequest(au) call to the @@ -2132,7 +2133,7 @@ public Response archiveAllUnarchivedDatasetVersions(@Context ContainerRequestCon try { AuthenticatedUser au = getRequestAuthenticatedUserOrDie(crc); - + //ToDo - allow forcing with a non-success status for archivers that supportsDelete() List dsl = datasetversionService.getUnarchivedDatasetVersions(); if (dsl != null) { if (listonly) { diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java 
b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 12dd984775d..691e32e83e7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -1271,27 +1271,35 @@ public Response publishDataset(@Context ContainerRequestContext crc, @PathParam( DatasetVersion updateVersion = ds.getLatestVersion(); AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, createDataverseRequest(user), updateVersion); if (archiveCommand != null) { - // Delete the record of any existing copy since it is now out of date/incorrect - updateVersion.setArchivalCopyLocation(null); - /* - * Then try to generate and submit an archival copy. Note that running this - * command within the CuratePublishedDatasetVersionCommand was causing an error: - * "The attribute [id] of class - * [edu.harvard.iq.dataverse.DatasetFieldCompoundValue] is mapped to a primary - * key column in the database. Updates are not allowed." To avoid that, and to - * simplify reporting back to the GUI whether this optional step succeeded, I've - * pulled this out as a separate submit(). 
- */ - try { - updateVersion = commandEngine.submit(archiveCommand); - if (!updateVersion.getArchivalCopyLocationStatus().equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)) { - successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.success"); - } else { - successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure"); + String status = updateVersion.getArchivalCopyLocationStatus(); + if ((status == null) || status.equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE)) { + // Delete the record of any existing copy since it is now out of + // date/incorrect + JsonObjectBuilder job = Json.createObjectBuilder(); + job.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_PENDING); + updateVersion.setArchivalCopyLocation(JsonUtil.prettyPrint(job.build())); + datasetVersionSvc.persistArchivalCopyLocation(updateVersion); + /* + * Then try to generate and submit an archival copy. Note that running this + * command within the CuratePublishedDatasetVersionCommand was causing an error: + * "The attribute [id] of class + * [edu.harvard.iq.dataverse.DatasetFieldCompoundValue] is mapped to a primary + * key column in the database. Updates are not allowed." To avoid that, and to + * simplify reporting back to the GUI whether this optional step succeeded, I've + * pulled this out as a separate submit(). 
+ */ + try { + commandEngine.submitAsync(archiveCommand); + successMsg = BundleUtil.getStringFromBundle("datasetversion.archive.inprogress"); + } catch (CommandException ex) { + successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure") + + " - " + ex.toString(); + logger.severe(ex.getMessage()); } - } catch (CommandException ex) { - successMsg = BundleUtil.getStringFromBundle("datasetversion.update.archive.failure") + " - " + ex.toString(); - logger.severe(ex.getMessage()); + } else if (status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)) { + // Not automatically replacing the old archival copy as creating it is expensive + updateVersion.setArchivalStatusOnly(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE); + datasetVersionSvc.persistArchivalCopyLocation(updateVersion); } } } catch (CommandException ex) { @@ -1381,17 +1389,18 @@ public Response publishMigratedDataset(@Context ContainerRequestContext crc, Str */ String errorMsg = null; Optional prePubWf = wfService.getDefaultWorkflow(TriggerType.PrePublishDataset); - + DataverseRequest dataverseRequest = createDataverseRequest(user); try { - // ToDo - should this be in onSuccess()? 
May relate to todo above if (prePubWf.isPresent()) { + // Build context + WorkflowContext context = new WorkflowContext(dataverseRequest, ds, TriggerType.PrePublishDataset, !contactPIDProvider); // Start the workflow, the workflow will call FinalizeDatasetPublication later wfService.start(prePubWf.get(), - new WorkflowContext(createDataverseRequest(user), ds, TriggerType.PrePublishDataset, !contactPIDProvider), + new WorkflowContext(dataverseRequest, ds, TriggerType.PrePublishDataset, !contactPIDProvider), false); } else { FinalizeDatasetPublicationCommand cmd = new FinalizeDatasetPublicationCommand(ds, - createDataverseRequest(user), !contactPIDProvider); + dataverseRequest, !contactPIDProvider); ds = commandEngine.submit(cmd); } } catch (CommandException ex) { @@ -5006,7 +5015,7 @@ public Response getDatasetVersionArchivalStatus(@Context ContainerRequestContext } DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers, true); + headers); if (dsv.getArchivalCopyLocation() == null) { return error(Status.NOT_FOUND, "This dataset version has not been archived"); @@ -5048,7 +5057,7 @@ public Response setDatasetVersionArchivalStatus(@Context ContainerRequestContext DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), - uriInfo, headers, true); + uriInfo, headers); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); @@ -5095,7 +5104,7 @@ public Response deleteDatasetVersionArchivalStatus(@Context ContainerRequestCont DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers, true); + headers); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); } diff --git 
a/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java b/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java index 2ce5471a523..79451a61a84 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java @@ -740,4 +740,21 @@ public static String getLocaleCurationStatusLabelFromString(String label) { } return localizedName; } + + // Find the prior version - relies on version sorting by major/minor numbers + public static DatasetVersion getPriorVersion(DatasetVersion version) { + boolean foundCurrent = false; + DatasetVersion priorVersion = null; + for (DatasetVersion versionLoop : version.getDataset().getVersions()) { + if (foundCurrent) { + priorVersion = versionLoop; + break; + } + if (versionLoop.equals(version)) { + foundCurrent = true; + } + + } + return priorVersion; + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 29c27d0396d..e7ccfcf46b7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -2,8 +2,9 @@ import edu.harvard.iq.dataverse.DataCitation; import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetFieldConstant; +import edu.harvard.iq.dataverse.DatasetLock.Reason; import edu.harvard.iq.dataverse.DatasetVersion; -import edu.harvard.iq.dataverse.DvObject; import edu.harvard.iq.dataverse.SettingsWrapper; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; @@ -15,55 +16,160 @@ import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.pidproviders.doi.datacite.DOIDataCiteRegisterService; import 
edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key; +import edu.harvard.iq.dataverse.util.ListSplitUtil; import edu.harvard.iq.dataverse.util.bagit.BagGenerator; import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.workflow.step.Failure; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; +import jakarta.ejb.TransactionAttribute; +import jakarta.ejb.TransactionAttributeType; +import jakarta.json.JsonObject; +import jakarta.json.Json; +import jakarta.json.JsonObjectBuilder; import java.io.IOException; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.security.DigestInputStream; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.logging.Logger; @RequiredPermissions(Permission.PublishDataset) public abstract class AbstractSubmitToArchiveCommand extends AbstractCommand { - private final DatasetVersion version; - private final Map requestedSettings = new HashMap(); + protected final DatasetVersion version; + protected final Map requestedSettings = new HashMap(); + protected String spaceName = null; protected boolean success=false; private static final Logger logger = Logger.getLogger(AbstractSubmitToArchiveCommand.class.getName()); private static final int MAX_ZIP_WAIT = 20000; private static final int DEFAULT_THREADS = 2; - + public AbstractSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { super(aRequest, version.getDataset()); this.version = version; } @Override + @TransactionAttribute(TransactionAttributeType.REQUIRED) public DatasetVersion execute(CommandContext ctxt) throws CommandException { + // Check for locks while we're still in a transaction + Dataset dataset = version.getDataset(); + if (dataset.getLockFor(Reason.finalizePublication) != null + || dataset.getLockFor(Reason.FileValidationFailed) != 
null) { + throw new CommandException("Dataset is locked and cannot be archived", this); + } + String settings = ctxt.settings().getValueForKey(SettingsServiceBean.Key.ArchiverSettings); - String[] settingsArray = settings.split(","); - for (String setting : settingsArray) { - setting = setting.trim(); - if (!setting.startsWith(":")) { - logger.warning("Invalid Archiver Setting: " + setting); + List settingsList = ListSplitUtil.split(settings); + for (String settingName : settingsList) { + Key setting = Key.parse(settingName); + if (setting == null) { + logger.warning("Invalid Archiver Setting: " + settingName); } else { - requestedSettings.put(setting, ctxt.settings().get(setting)); + requestedSettings.put(settingName, ctxt.settings().getValueForKey(setting)); } } - + AuthenticatedUser user = getRequest().getAuthenticatedUser(); ApiToken token = ctxt.authentication().findApiTokenByUser(user); if (token == null) { //No un-expired token token = ctxt.authentication().generateApiTokenForUser(user); } - performArchiveSubmission(version, token, requestedSettings); - return ctxt.em().merge(version); + if (!preconditionsMet(version, token, requestedSettings)) { + JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder(); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + "Successful archiving of earlier versions is required."); + version.setArchivalCopyLocation(statusObjectBuilder.build().toString()); + // Persist the failure status + persistResult(ctxt, version); + } else { + + String dataCiteXml = getDataCiteXml(version); + OREMap oreMap = new OREMap(version, false); + JsonObject ore = oreMap.getOREMap(); + Map terms = getJsonLDTerms(oreMap); + performArchivingAndPersist(ctxt, version, dataCiteXml, ore, terms, token, requestedSettings); + } + return ctxt.datasetVersion().find(version.getId()); + } + + // While we have a transaction context, get the 
terms needed to create the baginfo file + public static Map getJsonLDTerms(OREMap oreMap) { + Map terms = new HashMap(); + terms.put(DatasetFieldConstant.datasetContact, oreMap.getContactTerm()); + terms.put(DatasetFieldConstant.datasetContactName, oreMap.getContactNameTerm()); + terms.put(DatasetFieldConstant.datasetContactEmail, oreMap.getContactEmailTerm()); + terms.put(DatasetFieldConstant.description, oreMap.getDescriptionTerm()); + terms.put(DatasetFieldConstant.descriptionText, oreMap.getDescriptionTextTerm()); + + return terms; + } + + /** + * Note that this method may be called from the execute method above OR from a + * workflow in which execute() is never called and therefore in which all + * variables must be sent as method parameters. (Nominally version is set in the + * constructor and could be dropped from the parameter list.) + * + * + * @param version - the DatasetVersion to archive + * @param token - an API Token for the user performing this action + * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans).
+ */ + public boolean preconditionsMet(DatasetVersion version, ApiToken token, Map requestedSettings) { + // Check if earlier versions must be archived first + String requireEarlierArchivedValue = requestedSettings.get(SettingsServiceBean.Key.ArchiveOnlyIfEarlierVersionsAreArchived.toString()); + boolean requireEarlierArchived = Boolean.parseBoolean(requireEarlierArchivedValue); + if (requireEarlierArchived) { + + Dataset dataset = version.getDataset(); + List versions = dataset.getVersions(); + + boolean foundCurrent = false; + + // versions are ordered, all versions after the current one have lower + // major/minor version numbers + for (DatasetVersion versionInLoop : versions) { + if (foundCurrent) { + // Once foundCurrent is true, we are looking at prior versions + // Check if this earlier version has been successfully archived + String archivalStatus = versionInLoop.getArchivalCopyLocationStatus(); + if (archivalStatus == null || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) + // || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE) + ) { + return false; + } + } + if (versionInLoop.equals(version)) { + foundCurrent = true; + } + + } + } + return true; + } + + @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) + public WorkflowStepResult performArchivingAndPersist(CommandContext ctxt, DatasetVersion version, String dataCiteXml, JsonObject ore, Map terms, ApiToken token, Map requestedSetttings) { + // This runs OUTSIDE any transaction + BagGenerator.setNumConnections(getNumberOfBagGeneratorThreads()); + WorkflowStepResult wfsr = performArchiveSubmission(version, dataCiteXml, ore, terms, token, requestedSettings); + persistResult(ctxt, version); + return wfsr; + } + + @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) + private void persistResult(CommandContext ctxt, DatasetVersion versionWithStatus) { + // New transaction just for this quick operation + 
ctxt.datasetVersion().persistArchivalCopyLocation(versionWithStatus); } /** @@ -72,12 +178,16 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException { * workflow in which execute() is never called and therefore in which all * variables must be sent as method parameters. (Nominally version is set in the * constructor and could be dropped from the parameter list.) - * + * * @param version - the DatasetVersion to archive + * @param dataCiteXml + * @param ore + * @param terms * @param token - an API Token for the user performing this action * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). */ - abstract public WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token, Map requestedSetttings); + abstract public WorkflowStepResult performArchiveSubmission(DatasetVersion version, String dataCiteXml, JsonObject ore, Map terms, ApiToken token, Map requestedSetttings); + protected int getNumberOfBagGeneratorThreads() { if (requestedSettings.get(BagGenerator.BAG_GENERATOR_THREADS) != null) { @@ -85,7 +195,7 @@ protected int getNumberOfBagGeneratorThreads() { return Integer.valueOf(requestedSettings.get(BagGenerator.BAG_GENERATOR_THREADS)); } catch (NumberFormatException nfe) { logger.warning("Can't parse the value of setting " + BagGenerator.BAG_GENERATOR_THREADS - + " as an integer - using default:" + DEFAULT_THREADS); + + " as an integer - using default:" + DEFAULT_THREADS); } } return DEFAULT_THREADS; @@ -94,24 +204,24 @@ protected int getNumberOfBagGeneratorThreads() { @Override public String describe() { return super.describe() + "DatasetVersion: [" + version.getId() + " (v" - + version.getFriendlyVersionNumber()+")]"; + + version.getFriendlyVersionNumber()+")]"; } - - String getDataCiteXml(DatasetVersion dv) { + + public String getDataCiteXml(DatasetVersion dv) { 
DataCitation dc = new DataCitation(dv); Map metadata = dc.getDataCiteMetadata(); return DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, - dv.getDataset()); + dv.getDataset()); } public Thread startBagThread(DatasetVersion dv, PipedInputStream in, DigestInputStream digestInputStream2, - String dataciteXml, ApiToken token) throws IOException, InterruptedException { + String dataciteXml, JsonObject ore, Map terms, ApiToken token) throws IOException, InterruptedException { Thread bagThread = new Thread(new Runnable() { public void run() { try (PipedOutputStream out = new PipedOutputStream(in)) { // Generate bag - BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); - bagger.setNumConnections(getNumberOfBagGeneratorThreads()); + BagGenerator.setNumConnections(getNumberOfBagGeneratorThreads()); + BagGenerator bagger = new BagGenerator(ore, dataciteXml, terms); bagger.setAuthenticationKey(token.getTokenString()); bagger.generateBag(out); success = true; @@ -123,9 +233,9 @@ public void run() { digestInputStream2.close(); } catch (Exception ex) { logger.warning(ex.getLocalizedMessage()); - } + } throw new RuntimeException("Error creating bag: " + e.getMessage()); - } + } } }); bagThread.start(); @@ -147,7 +257,7 @@ public void run() { * the pipe. (Note the PipedInputStream buffer is set at 100K above - I didn't * want to test whether that means that exactly 100K bytes will be available() * for large datasets or not, so the test below is at 90K.) - * + * * An additional sanity check limits the wait to 20K (MAX_ZIP_WAIT) seconds. 
The BagGenerator * has been used to archive >120K files, 2K directories, and ~600GB files on the * SEAD project (streaming content to disk rather than over an internet @@ -172,15 +282,43 @@ public void run() { public static boolean isArchivable(Dataset dataset, SettingsWrapper settingsWrapper) { return true; - } - - //Check if the chosen archiver imposes single-version-only archiving - in a View context - public static boolean isSingleVersion(SettingsWrapper settingsWrapper) { - return false; - } - - //Check if the chosen archiver imposes single-version-only archiving - in the API - public static boolean isSingleVersion(SettingsServiceBean settingsService) { - return false; - } + } + + //Check if the chosen archiver imposes single-version-only archiving - in a View context + public static boolean isSingleVersion(SettingsWrapper settingsWrapper) { + return false; + } + + //Check if the chosen archiver imposes single-version-only archiving - in the API + public static boolean isSingleVersion(SettingsServiceBean settingsService) { + return false; + } + + /** Whether the archiver can delete existing archival files (and thus can retry when the existing files are incomplete/obsolete) + * A static version supports calls via reflection while the instance method supports inheritance for use on actual command instances (see DatasetPage for both use cases). 
+ * @return true if this archiver can delete/replace existing archival copies + */ + public static boolean supportsDelete() { + return false; + } + + public boolean canDelete() { + return supportsDelete(); + } + + protected String getDataCiteFileName(String spaceName, DatasetVersion dv) { + return spaceName + "_datacite.v" + dv.getFriendlyVersionNumber(); + } + + protected String getFileName(String spaceName, DatasetVersion dv) { + return spaceName + ".v" + dv.getFriendlyVersionNumber(); + } + + protected String getSpaceName(Dataset dataset) { + if (spaceName == null) { + spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-') + .toLowerCase(); + } + return spaceName; + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java index 78e8454255b..903fab63ba9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java @@ -4,13 +4,19 @@ import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.SettingsWrapper; +import edu.harvard.iq.dataverse.DatasetLock.Reason; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; +import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser; import edu.harvard.iq.dataverse.branding.BrandingUtil; import edu.harvard.iq.dataverse.engine.command.Command; +import edu.harvard.iq.dataverse.engine.command.CommandContext; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import
edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; @@ -34,6 +40,8 @@ import java.util.Set; import java.util.logging.Logger; +import jakarta.ejb.TransactionAttribute; +import jakarta.ejb.TransactionAttributeType; import jakarta.json.Json; import jakarta.json.JsonObject; import jakarta.json.JsonObjectBuilder; @@ -70,20 +78,89 @@ public class DRSSubmitToArchiveCommand extends S3SubmitToArchiveCommand implemen private static final String PACKAGE_ID = "package_id"; private static final String SINGLE_VERSION = "single_version"; private static final String DRS_ENDPOINT = "DRS_endpoint"; - + private static final String RSA_KEY = "dataverse.archiver.drs.rsa_key"; private static final String TRUST_CERT = "trust_cert"; private static final String TIMEOUT = "timeout"; + private String archivableAncestorAlias; + public DRSSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { super(aRequest, version); } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + @TransactionAttribute(TransactionAttributeType.REQUIRED) + public DatasetVersion execute(CommandContext ctxt) throws CommandException { + + + // Check for locks while we're still in a transaction + Dataset dataset = version.getDataset(); + if (dataset.getLockFor(Reason.finalizePublication) != null + || dataset.getLockFor(Reason.FileValidationFailed) != null) { + throw new CommandException("Dataset is locked and cannot be archived", this); + } + + String settings = ctxt.settings().getValueForKey(SettingsServiceBean.Key.ArchiverSettings); + String[] settingsArray = settings.split(","); + for (String setting : settingsArray) { + setting = setting.trim(); + if (!setting.startsWith(":")) { + logger.warning("Invalid Archiver Setting: " + setting); + } else { + 
requestedSettings.put(setting, ctxt.settings().get(setting)); + } + } + + // Compute archivable ancestor while we're in a transaction and entities are managed + JsonObject drsConfigObject = null; + try { + drsConfigObject = JsonUtil.getJsonObject(requestedSettings.get(DRS_CONFIG)); + } catch (Exception e) { + logger.warning("Unable to parse " + DRS_CONFIG + " setting as a Json object"); + } + + if (drsConfigObject != null) { + JsonObject adminMetadata = drsConfigObject.getJsonObject(ADMIN_METADATA); + if (adminMetadata != null) { + JsonObject collectionsObj = adminMetadata.getJsonObject(COLLECTIONS); + if (collectionsObj != null) { + Set collections = collectionsObj.keySet(); + Dataverse ancestor = dataset.getOwner(); + // Compute this while entities are still managed + archivableAncestorAlias = getArchivableAncestor(ancestor, collections); + } + } + } + + AuthenticatedUser user = getRequest().getAuthenticatedUser(); + ApiToken token = ctxt.authentication().findApiTokenByUser(user); + if (token == null) { + //No un-expired token + token = ctxt.authentication().generateApiTokenForUser(user); + } + if (!preconditionsMet(version, token, requestedSettings)) { + JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder(); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + "Successful archiving of earlier versions is required."); + version.setArchivalCopyLocation(statusObjectBuilder.build().toString()); + } else { + + String dataCiteXml = getDataCiteXml(version); + OREMap oreMap = new OREMap(version, false); + JsonObject ore = oreMap.getOREMap(); + Map terms = getJsonLDTerms(oreMap); + performArchivingAndPersist(ctxt, version, dataCiteXml, ore, terms, token, requestedSettings); + } + return ctxt.em().merge(version); + } + + @Override + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dataciteXml, JsonObject ore, + Map 
terms, ApiToken token, Map requestedSettings) { logger.fine("In DRSSubmitToArchiveCommand..."); JsonObject drsConfigObject = null; @@ -97,7 +174,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t Set collections = adminMetadata.getJsonObject(COLLECTIONS).keySet(); Dataset dataset = dv.getDataset(); Dataverse ancestor = dataset.getOwner(); - String alias = getArchivableAncestor(ancestor, collections); + String alias = archivableAncestorAlias; // Use the pre-computed alias instead of calling getArchivableAncestor again String spaceName = getSpaceName(dataset); String packageId = getFileName(spaceName, dv); @@ -106,14 +183,14 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t for (DatasetVersion version : dataset.getVersions()) { if (version.getArchivalCopyLocation() != null) { return new Failure("DRS Archiver fail: version " + version.getFriendlyVersionNumber() - + " already archived."); + + " already archived."); } } } JsonObject collectionConfig = adminMetadata.getJsonObject(COLLECTIONS).getJsonObject(alias); - WorkflowStepResult s3Result = super.performArchiveSubmission(dv, token, requestedSettings); + WorkflowStepResult s3Result = super.performArchiveSubmission(dv, dataciteXml, ore, terms, token, requestedSettings); JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); @@ -157,9 +234,9 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t // use the TrustSelfSignedStrategy to allow Self Signed Certificates try { SSLContext sslContext = SSLContextBuilder.create().loadTrustMaterial(new TrustAllStrategy()) - .build(); + .build(); client = HttpClients.custom().setSSLContext(sslContext) - .setSSLHostnameVerifier(NoopHostnameVerifier.INSTANCE).build(); + .setSSLHostnameVerifier(NoopHostnameVerifier.INSTANCE).build(); } catch (KeyManagementException e) { // TODO Auto-generated 
catch block e.printStackTrace(); @@ -187,14 +264,14 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t //RSAPublicKey publicKey; /* * If public key is needed: encoded = Base64.decodeBase64(publicKeyPEM); - * + * * KeyFactory keyFactory = KeyFactory.getInstance("RS256"); X509EncodedKeySpec * keySpec = new X509EncodedKeySpec(encoded); return (RSAPublicKey) * keyFactory.generatePublic(keySpec); RSAPublicKey publicKey = new * RSAPublicKey(System.getProperty(RS256_KEY)); */ Algorithm algorithmRSA = Algorithm.RSA256(null, privKey); - + String body = drsConfigString; String jwtString = createJWTString(algorithmRSA, BrandingUtil.getInstallationBrandName(), body, jwtTimeout); logger.fine("JWT: " + jwtString); @@ -208,41 +285,41 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t try (CloseableHttpResponse response = client.execute(ingestPost)) { int code = response.getStatusLine().getStatusCode(); String responseBody = new String(response.getEntity().getContent().readAllBytes(), - StandardCharsets.UTF_8); + StandardCharsets.UTF_8); if (code == 202) { logger.fine("Status: " + code); logger.fine("Response" + responseBody); JsonObject responseObject = JsonUtil.getJsonObject(responseBody); if (responseObject.containsKey(DatasetVersion.ARCHIVAL_STATUS) - && responseObject.containsKey(DatasetVersion.ARCHIVAL_STATUS_MESSAGE)) { + && responseObject.containsKey(DatasetVersion.ARCHIVAL_STATUS_MESSAGE)) { String status = responseObject.getString(DatasetVersion.ARCHIVAL_STATUS); if (status.equals(DatasetVersion.ARCHIVAL_STATUS_PENDING) || status.equals(DatasetVersion.ARCHIVAL_STATUS_FAILURE) - || status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)) { + || status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)) { statusObject.addAll(Json.createObjectBuilder(responseObject)); switch (status) { - case DatasetVersion.ARCHIVAL_STATUS_PENDING: - logger.info("DRS Ingest successfully started for: " + packageId + " : " + case 
DatasetVersion.ARCHIVAL_STATUS_PENDING: + logger.info("DRS Ingest successfully started for: " + packageId + " : " + responseObject.toString()); - break; - case DatasetVersion.ARCHIVAL_STATUS_FAILURE: - logger.severe("DRS Ingest Failed for: " + packageId + " : " + break; + case DatasetVersion.ARCHIVAL_STATUS_FAILURE: + logger.severe("DRS Ingest Failed for: " + packageId + " : " + responseObject.toString()); - return new Failure("DRS Archiver fail in Ingest call"); - case DatasetVersion.ARCHIVAL_STATUS_SUCCESS: - // We don't expect this from DRS - logger.warning("Unexpected Status: " + status); + return new Failure("DRS Archiver fail in Ingest call"); + case DatasetVersion.ARCHIVAL_STATUS_SUCCESS: + // We don't expect this from DRS + logger.warning("Unexpected Status: " + status); } } else { logger.severe("DRS Ingest Failed for: " + packageId + " with returned status: " - + status); + + status); return new Failure( - "DRS Archiver fail in Ingest call with returned status: " + status); + "DRS Archiver fail in Ingest call with returned status: " + status); } } else { logger.severe("DRS Ingest Failed for: " + packageId - + " - response does not include status and message"); + + " - response does not include status and message"); return new Failure( - "DRS Archiver fail in Ingest call \" - response does not include status and message"); + "DRS Archiver fail in Ingest call - response does not include status and message"); } } else { logger.severe("DRS Ingest Failed for: " + packageId + " with status code: " + code); @@ -256,11 +333,11 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t } } catch (URISyntaxException e) { return new Failure( - "DRS Archiver workflow step failed: unable to parse " + DRS_ENDPOINT ); + "DRS Archiver workflow step failed: unable to parse " + DRS_ENDPOINT ); } catch (JWTCreationException exception) { // Invalid Signing configuration / Couldn't convert Claims. 
return new Failure( - "DRS Archiver JWT Creation failure: " + exception.getMessage() ); + "DRS Archiver JWT Creation failure: " + exception.getMessage() ); } // execute @@ -279,7 +356,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t dv.setArchivalCopyLocation(statusObject.build().toString()); return new Failure("DRS Archiver fail in initial S3 Archiver transfer"); } - + } else { logger.fine("DRS Archiver: No matching collection found - will not archive: " + packageId); return WorkflowStepResult.OK; @@ -301,7 +378,7 @@ protected String getDataCiteFileName(String spaceName, DatasetVersion dv) { return spaceName + ("_datacite.v" + dv.getFriendlyVersionNumber()).replace('.','_'); } - + public static String createJWTString(Algorithm algorithmRSA, String installationBrandName, String body, int expirationInMinutes) throws IOException { String canonicalBody = new JsonCanonicalizer(body).getEncodedString(); logger.fine("Canonical body: " + canonicalBody); @@ -310,8 +387,8 @@ public static String createJWTString(Algorithm algorithmRSA, String installation installationBrandName = BrandingUtil.getInstallationBrandName(); } return JWT.create().withIssuer(installationBrandName).withIssuedAt(Date.from(Instant.now())) - .withExpiresAt(Date.from(Instant.now().plusSeconds(60 * expirationInMinutes))) - .withKeyId("defaultDataverse").withClaim("bodySHA256Hash", digest).sign(algorithmRSA); + .withExpiresAt(Date.from(Instant.now().plusSeconds(60 * expirationInMinutes))) + .withKeyId("defaultDataverse").withClaim("bodySHA256Hash", digest).sign(algorithmRSA); } private static String getArchivableAncestor(Dataverse ancestor, Set collections) { @@ -351,16 +428,16 @@ public static boolean isArchivable(Dataset d, SettingsWrapper sw) { } return false; } - + // DRS Archiver supports single-version semantics if the SINGLE_VERSION key in // the DRS_CONFIG is true // These methods make that choices visible on the page (cached via // SettingsWrapper) or in the 
API (using SettingServiceBean), both using the // same underlying logic - + public static boolean isSingleVersion(SettingsWrapper sw) { - String config = sw.get(DRS_CONFIG, null); - return isSingleVersion(config); + String config = sw.get(DRS_CONFIG, null); + return isSingleVersion(config); } public static boolean isSingleVersion(SettingsServiceBean ss) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index fe4a25091d7..5384d8b668c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -2,7 +2,6 @@ import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetVersion; -import edu.harvard.iq.dataverse.DatasetLock.Reason; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; @@ -10,13 +9,21 @@ import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudContext; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudHost; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudPort; + +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; import java.security.DigestInputStream; import 
java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -49,188 +56,213 @@ public DuraCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dataciteXml, jakarta.json.JsonObject ore, + Map terms, ApiToken token, Map requestedSettings) { String port = requestedSettings.get(DURACLOUD_PORT) != null ? requestedSettings.get(DURACLOUD_PORT) - : DEFAULT_PORT; + : DEFAULT_PORT; String dpnContext = requestedSettings.get(DURACLOUD_CONTEXT) != null ? requestedSettings.get(DURACLOUD_CONTEXT) - : DEFAULT_CONTEXT; + : DEFAULT_CONTEXT; String host = requestedSettings.get(DURACLOUD_HOST); - + if (host != null) { Dataset dataset = dv.getDataset(); // ToDo - change after HDC 3A changes to status reporting // This will make the archivalCopyLocation non-null after a failure which should // stop retries - - if (dataset.getLockFor(Reason.finalizePublication) == null - && dataset.getLockFor(Reason.FileValidationFailed) == null) { - // Use Duracloud client classes to login - ContentStoreManager storeManager = new ContentStoreManagerImpl(host, port, dpnContext); - Credential credential = new Credential(System.getProperty("duracloud.username"), - System.getProperty("duracloud.password")); - storeManager.login(credential); + + // Use Duracloud client classes to login + ContentStoreManager storeManager = new ContentStoreManagerImpl(host, port, dpnContext); + Credential credential = new Credential(System.getProperty("duracloud.username"), + System.getProperty("duracloud.password")); + storeManager.login(credential); + /* + * Aliases can contain upper case characters which are not allowed in space + * names. Similarly, aliases can contain '_' which isn't allowed in a space + * name. 
The line below replaces any upper case chars with lowercase and + * replaces any '_' with '.-' . The '-' after the dot assures we don't break the + * rule that + * "The last period in a aspace may not immediately be followed by a number". + * (Although we could check, it seems better to just add '.-' all the time.As + * written the replaceAll will also change any chars not valid in a spaceName to + * '.' which would avoid code breaking if the alias constraints change. That + * said, this line may map more than one alias to the same spaceName, e.g. + * "test" and "Test" aliases both map to the "test" space name. This does not + * break anything but does potentially put bags from more than one collection in + * the same space. + */ + String spaceName = dataset.getOwner().getAlias().toLowerCase().replaceAll("[^a-z0-9-]", ".dcsafe"); + //This archiver doesn't use the standard spaceName, but does use it to generate the file name + String baseFileName = getFileName(getSpaceName(dataset), dv); + + ContentStore store; + // Set a failure status that will be updated if we succeed + JsonObjectBuilder statusObject = Json.createObjectBuilder(); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); + + Path tempBagFile = null; + + try { /* - * Aliases can contain upper case characters which are not allowed in space - * names. Similarly, aliases can contain '_' which isn't allowed in a space - * name. The line below replaces any upper case chars with lowercase and - * replaces any '_' with '.-' . The '-' after the dot assures we don't break the - * rule that - * "The last period in a aspace may not immediately be followed by a number". - * (Although we could check, it seems better to just add '.-' all the time.As - * written the replaceAll will also change any chars not valid in a spaceName to - * '.' 
which would avoid code breaking if the alias constraints change. That - * said, this line may map more than one alias to the same spaceName, e.g. - * "test" and "Test" aliases both map to the "test" space name. This does not - * break anything but does potentially put bags from more than one collection in - * the same space. + * If there is a failure in creating a space, it is likely that a prior version + * has not been fully processed (snapshot created, archiving completed and files + * and space deleted - currently manual operations done at the project's + * duracloud website) */ - String spaceName = dataset.getOwner().getAlias().toLowerCase().replaceAll("[^a-z0-9-]", ".dcsafe"); - String baseFileName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') - .replace('.', '-').toLowerCase() + "_v" + dv.getFriendlyVersionNumber(); - - ContentStore store; - //Set a failure status that will be updated if we succeed - JsonObjectBuilder statusObject = Json.createObjectBuilder(); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); - - try { - /* - * If there is a failure in creating a space, it is likely that a prior version - * has not been fully processed (snapshot created, archiving completed and files - * and space deleted - currently manual operations done at the project's - * duracloud website) - */ - store = storeManager.getPrimaryContentStore(); - // Create space to copy archival files to - if (!store.spaceExists(spaceName)) { - store.createSpace(spaceName); - } - String dataciteXml = getDataCiteXml(dv); - - MessageDigest messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream dataciteIn = new PipedInputStream(); - DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) { - // Add datacite.xml file - - Thread dcThread = new Thread(new Runnable() { - public void run() { - try 
(PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) { - - dataciteOut.write(dataciteXml.getBytes(StandardCharsets.UTF_8)); - dataciteOut.close(); - success=true; - } catch (Exception e) { - logger.severe("Error creating datacite.xml: " + e.getMessage()); - // TODO Auto-generated catch block - e.printStackTrace(); - } + store = storeManager.getPrimaryContentStore(); + // Create space to copy archival files to + if (!store.spaceExists(spaceName)) { + store.createSpace(spaceName); + } + + MessageDigest messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream dataciteIn = new PipedInputStream(); + DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) { + // Add datacite.xml file + + Thread dcThread = new Thread(new Runnable() { + public void run() { + try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) { + + dataciteOut.write(dataciteXml.getBytes(StandardCharsets.UTF_8)); + dataciteOut.close(); + success = true; + } catch (Exception e) { + logger.severe("Error creating datacite.xml: " + e.getMessage()); + // TODO Auto-generated catch block + e.printStackTrace(); } - }); - dcThread.start(); - // Have seen Pipe Closed errors for other archivers when used as a workflow - // without this delay loop - int i = 0; - while (digestInputStream.available() <= 0 && i < 100) { - Thread.sleep(10); - i++; } - String checksum = store.addContent(spaceName, baseFileName + "_datacite.xml", digestInputStream, - -1l, null, null, null); - logger.fine("Content: datacite.xml added with checksum: " + checksum); - dcThread.join(); - String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); - if (!success || !checksum.equals(localchecksum)) { - logger.severe("Failure on " + baseFileName); - logger.severe(success ? 
checksum + " not equal to " + localchecksum : "failed to transfer to DuraCloud"); + }); + dcThread.start(); + // Have seen Pipe Closed errors for other archivers when used as a workflow + // without this delay loop + int i = 0; + while (digestInputStream.available() <= 0 && i < 100) { + Thread.sleep(10); + i++; + } + String checksum = store.addContent(spaceName, baseFileName + "_datacite.xml", digestInputStream, + -1l, null, null, null); + logger.fine("Content: datacite.xml added with checksum: " + checksum); + dcThread.join(); + String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); + if (!success || !checksum.equals(localchecksum)) { + logger.severe("Failure on " + baseFileName); + logger.severe(success ? checksum + " not equal to " + localchecksum + : "failed to transfer to DuraCloud"); + try { + store.deleteContent(spaceName, baseFileName + "_datacite.xml"); + } catch (ContentStoreException cse) { + logger.warning(cse.getMessage()); + } + return new Failure("Error in transferring DataCite.xml file to DuraCloud", + "DuraCloud Submission Failure: incomplete metadata transfer"); + } + + // Store BagIt file + success = false; + String fileName = baseFileName + ".zip"; + + // Add BagIt ZIP file + // Although DuraCloud uses SHA-256 internally, it's API uses MD5 to verify the + // transfer + Path bagFile = null; + + tempBagFile = Files.createTempFile("dataverse-bag-", ".zip"); + logger.fine("Creating bag in temporary file: " + tempBagFile.toString()); + // Generate bag + BagGenerator bagger = new BagGenerator(ore, dataciteXml, terms); + bagger.setAuthenticationKey(token.getTokenString()); + + // Generate bag to temporary file using the provided ore JsonObject + try (FileOutputStream fos = new FileOutputStream(tempBagFile.toFile())) { + if (!bagger.generateBag(fos)) { + throw new IOException("Bag generation failed"); + } + } + + // Store BagIt file + long bagSize = Files.size(tempBagFile); + logger.fine("Bag created successfully, 
size: " + bagSize + " bytes"); + + // Now upload the bag file + messageDigest = MessageDigest.getInstance("MD5"); + try (InputStream is = Files.newInputStream(bagFile); + DigestInputStream bagDigestInputStream = new DigestInputStream(is, messageDigest)) { + checksum = store.addContent(spaceName, fileName, bagDigestInputStream, + bagFile.toFile().length(), "application/zip", null, null); + localchecksum = Hex.encodeHexString(bagDigestInputStream.getMessageDigest().digest()); + + if (checksum != null && checksum.equals(localchecksum)) { + logger.fine("Content: " + fileName + " added with checksum: " + checksum); + success = true; + } else { + logger.severe("Failure on " + fileName); + logger.severe(checksum + " not equal to " + localchecksum); try { + store.deleteContent(spaceName, fileName); store.deleteContent(spaceName, baseFileName + "_datacite.xml"); } catch (ContentStoreException cse) { logger.warning(cse.getMessage()); } - return new Failure("Error in transferring DataCite.xml file to DuraCloud", - "DuraCloud Submission Failure: incomplete metadata transfer"); - } - - // Store BagIt file - success = false; - String fileName = baseFileName + ".zip"; - - // Add BagIt ZIP file - // Although DuraCloud uses SHA-256 internally, it's API uses MD5 to verify the - // transfer - - messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream in = new PipedInputStream(100000); - DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { - Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, token); - checksum = store.addContent(spaceName, fileName, digestInputStream2, -1l, null, null, null); - bagThread.join(); - if (success) { - logger.fine("Content: " + fileName + " added with checksum: " + checksum); - localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); - } - if (!success || !checksum.equals(localchecksum)) { - logger.severe("Failure on " + fileName); - logger.severe(success ? 
checksum + " not equal to " + localchecksum : "failed to transfer to DuraCloud"); - try { - store.deleteContent(spaceName, fileName); - store.deleteContent(spaceName, baseFileName + "_datacite.xml"); - } catch (ContentStoreException cse) { - logger.warning(cse.getMessage()); - } - return new Failure("Error in transferring Zip file to DuraCloud", - "DuraCloud Submission Failure: incomplete archive transfer"); - } + return new Failure("Error in transferring Zip file to DuraCloud", + "DuraCloud Submission Failure: incomplete archive transfer"); } + } - logger.fine("DuraCloud Submission step: Content Transferred"); + logger.fine("DuraCloud Submission step: Content Transferred"); - // Document the location of dataset archival copy location (actually the URL - // where you can - // view it as an admin) - StringBuffer sb = new StringBuffer("https://"); - sb.append(host); - if (!port.equals("443")) { - sb.append(":" + port); - } - sb.append("/duradmin/spaces/sm/"); - sb.append(store.getStoreId()); - sb.append("/" + spaceName + "/" + fileName); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, sb.toString()); - - logger.fine("DuraCloud Submission step complete: " + sb.toString()); - } catch (ContentStoreException | IOException e) { - // TODO Auto-generated catch block - logger.warning(e.getMessage()); - e.printStackTrace(); - return new Failure("Error in transferring file to DuraCloud", - "DuraCloud Submission Failure: archive file not transferred"); - } catch (InterruptedException e) { - logger.warning(e.getLocalizedMessage()); - e.printStackTrace(); + // Document the location of dataset archival copy location (actually the URL + // where you can + // view it as an admin) + StringBuffer sb = new StringBuffer("https://"); + sb.append(host); + if (!port.equals(DEFAULT_PORT)) { + sb.append(":" + port); } - } catch (ContentStoreException e) { + 
sb.append("/duradmin/spaces/sm/"); + sb.append(store.getStoreId()); + sb.append("/" + spaceName + "/" + fileName); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, sb.toString()); + + logger.fine("DuraCloud Submission step complete: " + sb.toString()); + } catch (ContentStoreException | IOException e) { + // TODO Auto-generated catch block logger.warning(e.getMessage()); e.printStackTrace(); - String mesg = "DuraCloud Submission Failure"; - if (!(1 == dv.getVersion()) || !(0 == dv.getMinorVersionNumber())) { - mesg = mesg + ": Prior Version archiving not yet complete?"; - } - return new Failure("Unable to create DuraCloud space with name: " + baseFileName, mesg); - } catch (NoSuchAlgorithmException e) { - logger.severe("MD5 MessageDigest not available!"); + return new Failure("Error in transferring file to DuraCloud", + "DuraCloud Submission Failure: archive file not transferred"); + } catch (InterruptedException e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); } - finally { - dv.setArchivalCopyLocation(statusObject.build().toString()); + } catch (ContentStoreException e) { + logger.warning(e.getMessage()); + e.printStackTrace(); + String mesg = "DuraCloud Submission Failure"; + if (!(1 == dv.getVersion()) || !(0 == dv.getMinorVersionNumber())) { + mesg = mesg + ": Prior Version archiving not yet complete?"; + } + return new Failure("Unable to create DuraCloud space with name: " + baseFileName, mesg); + } catch (NoSuchAlgorithmException e) { + logger.severe("MD5 MessageDigest not available!"); + } catch (Exception e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); + return new Failure("Error in transferring file to DuraCloud", + "DuraCloud Submission Failure: internal error"); + } finally { + if (tempBagFile != null) { + try { + Files.deleteIfExists(tempBagFile); + } catch (IOException e) { + logger.warning("Failed to 
delete temporary bag file: " + tempBagFile + " : " + e.getMessage()); + } } - } else { - logger.warning( - "DuraCloud Submision Workflow aborted: Dataset locked for finalizePublication, or because file validation failed"); - return new Failure("Dataset locked"); + dv.setArchivalCopyLocation(statusObject.build().toString()); } return WorkflowStepResult.OK; } else { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java index 1ef68ae4853..51e37efe2b2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java @@ -22,15 +22,12 @@ import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.engine.command.exception.CommandException; -import edu.harvard.iq.dataverse.export.ExportService; import edu.harvard.iq.dataverse.pidproviders.PidProvider; -import edu.harvard.iq.dataverse.pidproviders.PidUtil; import edu.harvard.iq.dataverse.privateurl.PrivateUrl; -import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.BundleUtil; +import edu.harvard.iq.dataverse.workflow.WorkflowContext; import edu.harvard.iq.dataverse.workflow.WorkflowContext.TriggerType; -import java.awt.datatransfer.StringSelection; import java.io.IOException; import java.sql.Timestamp; import java.util.*; @@ -47,7 +44,6 @@ import org.apache.logging.log4j.util.Strings; import org.apache.solr.client.solrj.SolrServerException; - /** * * Takes the last internal steps in publishing a dataset. 
@@ -245,18 +241,10 @@ public Dataset execute(CommandContext ctxt) throws CommandException { //Remove any pre-pub workflow lock (not needed as WorkflowServiceBean.workflowComplete() should already have removed it after setting the finalizePublication lock?) ctxt.datasets().removeDatasetLocks(ds, DatasetLock.Reason.Workflow); - //Should this be in onSuccess()? - ctxt.workflows().getDefaultWorkflow(TriggerType.PostPublishDataset).ifPresent(wf -> { - try { - ctxt.workflows().start(wf, buildContext(ds, TriggerType.PostPublishDataset, datasetExternallyReleased), false); - } catch (CommandException ex) { - ctxt.datasets().removeDatasetLocks(ds, DatasetLock.Reason.Workflow); - logger.log(Level.SEVERE, "Error invoking post-publish workflow: " + ex.getMessage(), ex); - } - }); - Dataset readyDataset = ctxt.em().merge(ds); + setDataset(readyDataset); + // Finally, unlock the dataset (leaving any post-publish workflow lock in place) ctxt.datasets().removeDatasetLocks(readyDataset, DatasetLock.Reason.finalizePublication); if (readyDataset.isLockedFor(DatasetLock.Reason.InReview) ) { @@ -288,6 +276,21 @@ public boolean onSuccess(CommandContext ctxt, Object r) { } catch (Exception e) { logger.warning("Failure to send dataset published messages for : " + dataset.getId() + " : " + e.getMessage()); } + + final Dataset ds = dataset; + ctxt.workflows().getDefaultWorkflow(TriggerType.PostPublishDataset).ifPresent(wf -> { + // Build context with the lock attached + WorkflowContext context = buildContext(ds, TriggerType.PostPublishDataset, datasetExternallyReleased); + try { + ctxt.workflows().start(wf, context, false); + } catch (CommandException e) { + logger.log(Level.SEVERE, "Error invoking post-publish workflow: " + e.getMessage(), e); + } + }); + // Metadata export: + ctxt.datasets().reExportDatasetAsync(dataset); + + ctxt.index().asyncIndexDataset(dataset, true); //re-indexing dataverses that have additional subjects if (!dataversesToIndex.isEmpty()){ @@ -303,23 +306,6 @@ public 
boolean onSuccess(CommandContext ctxt, Object r) { } } - // Metadata export: - - try { - ExportService instance = ExportService.getInstance(); - instance.exportAllFormats(dataset); - dataset = ctxt.datasets().merge(dataset); - } catch (Exception ex) { - // Something went wrong! - // Just like with indexing, a failure to export is not a fatal - // condition. We'll just log the error as a warning and keep - // going: - logger.log(Level.WARNING, "Finalization: exception caught while exporting: "+ex.getMessage(), ex); - // ... but it is important to only update the export time stamp if the - // export was indeed successful. - } - ctxt.index().asyncIndexDataset(dataset, true); - return retVal; } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 7dfb9f07e19..5bf46970984 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -7,7 +7,6 @@ import com.google.cloud.storage.StorageException; import com.google.cloud.storage.StorageOptions; import edu.harvard.iq.dataverse.Dataset; -import edu.harvard.iq.dataverse.DatasetLock.Reason; import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; @@ -16,18 +15,27 @@ import edu.harvard.iq.dataverse.settings.JvmSettings; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.GoogleCloudBucket; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.GoogleCloudProject; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import 
edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; import org.apache.commons.codec.binary.Hex; +import org.apache.commons.compress.parallel.InputStreamSupplier; import jakarta.json.Json; +import jakarta.json.JsonObject; import jakarta.json.JsonObjectBuilder; import java.io.File; import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; import java.security.DigestInputStream; import java.security.MessageDigest; import java.util.Map; @@ -44,144 +52,239 @@ public GoogleCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersi super(aRequest, version); } + public static boolean supportsDelete() { + return true; + } + @Override + public boolean canDelete() { + return supportsDelete(); + } + @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dataciteXml, JsonObject ore, + Map terms, ApiToken token, Map requestedSettings) { logger.fine("In GoogleCloudSubmitToArchiveCommand..."); String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET); String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT); logger.fine("Project: " + projectName + " Bucket: " + bucketName); if (bucketName != null && projectName != null) { Storage storage; - //Set a failure status that will be updated if we succeed + // Set a failure status that will be updated if we succeed JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); - + String cloudKeyFile = 
JvmSettings.FILES_DIRECTORY.lookup() + File.separator + "googlecloudkey.json"; - + + // Create temporary file for bag + Path tempBagFile = null; + try (FileInputStream cloudKeyStream = new FileInputStream(cloudKeyFile)) { storage = StorageOptions.newBuilder() - .setCredentials(ServiceAccountCredentials.fromStream(cloudKeyStream)) - .setProjectId(projectName) - .build() - .getService(); + .setCredentials(ServiceAccountCredentials.fromStream(cloudKeyStream)).setProjectId(projectName) + .build().getService(); Bucket bucket = storage.get(bucketName); Dataset dataset = dv.getDataset(); - if (dataset.getLockFor(Reason.finalizePublication) == null) { - - String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') - .replace('.', '-').toLowerCase(); - - String dataciteXml = getDataCiteXml(dv); - MessageDigest messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream dataciteIn = new PipedInputStream(); - DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) { - // Add datacite.xml file - - Thread dcThread = new Thread(new Runnable() { - public void run() { - try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) { - - dataciteOut.write(dataciteXml.getBytes(StandardCharsets.UTF_8)); - dataciteOut.close(); - success = true; - } catch (Exception e) { - logger.severe("Error creating datacite.xml: " + e.getMessage()); - // TODO Auto-generated catch block - e.printStackTrace(); - // throw new RuntimeException("Error creating datacite.xml: " + e.getMessage()); - } - } - }); - dcThread.start(); - // Have seen Pipe Closed errors for other archivers when used as a workflow - // without this delay loop - int i = 0; - while (digestInputStream.available() <= 0 && i < 100) { - Thread.sleep(10); - i++; - } - Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml", digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); - - dcThread.join(); - 
String checksum = dcXml.getMd5ToHexString(); - logger.fine("Content: datacite.xml added with checksum: " + checksum); - String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); - if (!success || !checksum.equals(localchecksum)) { - logger.severe("Failure on " + spaceName); - logger.severe(success ? checksum + " not equal to " + localchecksum : "datacite.xml transfer did not succeed"); - try { - dcXml.delete(Blob.BlobSourceOption.generationMatch()); - } catch (StorageException se) { - logger.warning(se.getMessage()); + + String spaceName = getSpaceName(dataset); + + // Check for and delete existing files for this version + String dataciteFileName = spaceName + "/" + getDataCiteFileName(spaceName, dv) + ".xml"; + String bagFileName = spaceName + "/" + getFileName(spaceName,dv) + ".zip"; + + logger.fine("Checking for existing files in archive..."); + + try { + Blob existingDatacite = bucket.get(dataciteFileName); + if (existingDatacite != null && existingDatacite.exists()) { + logger.fine("Found existing datacite.xml, deleting: " + dataciteFileName); + existingDatacite.delete(); + logger.fine("Deleted existing datacite.xml"); + } + } catch (StorageException se) { + logger.warning("Error checking/deleting existing datacite.xml: " + se.getMessage()); + } + + try { + Blob existingBag = bucket.get(bagFileName); + if (existingBag != null && existingBag.exists()) { + logger.fine("Found existing bag file, deleting: " + bagFileName); + existingBag.delete(); + logger.fine("Deleted existing bag file"); + } + } catch (StorageException se) { + logger.warning("Error checking/deleting existing bag file: " + se.getMessage()); + } + + // Upload datacite.xml + MessageDigest messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream dataciteIn = new PipedInputStream(); + DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) { + // Add datacite.xml file + + Thread dcThread = new Thread(new Runnable() { + 
public void run() { + try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) { + + dataciteOut.write(dataciteXml.getBytes(StandardCharsets.UTF_8)); + dataciteOut.close(); + success = true; + } catch (Exception e) { + logger.severe("Error creating datacite.xml: " + e.getMessage()); + e.printStackTrace(); } - return new Failure("Error in transferring DataCite.xml file to GoogleCloud", - "GoogleCloud Submission Failure: incomplete metadata transfer"); } + }); + dcThread.start(); + // Have seen Pipe Closed errors for other archivers when used as a workflow + // without this delay loop + int i = 0; + while (digestInputStream.available() <= 0 && i < 100) { + Thread.sleep(10); + i++; + } + Blob dcXml = bucket.create(dataciteFileName, digestInputStream, "text/xml", + Bucket.BlobWriteOption.doesNotExist()); - // Store BagIt file - success = false; - String fileName = spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; - - // Add BagIt ZIP file - // Google uses MD5 as one way to verify the - // transfer - messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream in = new PipedInputStream(100000); - DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { - Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, token); - Blob bag = bucket.create(spaceName + "/" + fileName, digestInputStream2, "application/zip", - Bucket.BlobWriteOption.doesNotExist()); - if (bag.getSize() == 0) { - throw new IOException("Empty Bag"); - } - bagThread.join(); - - checksum = bag.getMd5ToHexString(); - logger.fine("Bag: " + fileName + " added with checksum: " + checksum); - localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); - if (!success || !checksum.equals(localchecksum)) { - logger.severe(success ? 
checksum + " not equal to " + localchecksum - : "bag transfer did not succeed"); - try { - bag.delete(Blob.BlobSourceOption.generationMatch()); - } catch (StorageException se) { - logger.warning(se.getMessage()); - } - return new Failure("Error in transferring Zip file to GoogleCloud", - "GoogleCloud Submission Failure: incomplete archive transfer"); - } + dcThread.join(); + String checksum = dcXml.getMd5ToHexString(); + logger.fine("Content: datacite.xml added with checksum: " + checksum); + String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); + if (!success || !checksum.equals(localchecksum)) { + logger.severe("Failure on " + spaceName); + logger.severe(success ? checksum + " not equal to " + localchecksum + : "datacite.xml transfer did not succeed"); + try { + dcXml.delete(Blob.BlobSourceOption.generationMatch()); + } catch (StorageException se) { + logger.warning(se.getMessage()); } + return new Failure("Error in transferring DataCite.xml file to GoogleCloud", + "GoogleCloud Submission Failure: incomplete metadata transfer"); + } + } - logger.fine("GoogleCloud Submission step: Content Transferred"); + tempBagFile = Files.createTempFile("dataverse-bag-", ".zip"); + logger.fine("Creating bag in temporary file: " + tempBagFile.toString()); - // Document the location of dataset archival copy location (actually the URL - // where you can view it as an admin) - // Changed to point at bucket where the zip and datacite.xml are visible + BagGenerator bagger = new BagGenerator(ore, dataciteXml, terms); + bagger.setAuthenticationKey(token.getTokenString()); + // Generate bag to temporary file using the provided ore JsonObject + try (FileOutputStream fos = new FileOutputStream(tempBagFile.toFile())) { + if (!bagger.generateBag(fos)) { + throw new IOException("Bag generation failed"); + } + } + + // Store BagIt file + long bagSize = Files.size(tempBagFile); + logger.fine("Bag created successfully, size: " + bagSize + " bytes"); + + if 
(bagSize == 0) { + throw new IOException("Generated bag file is empty"); + } + + // Upload bag file and calculate checksum during upload + messageDigest = MessageDigest.getInstance("MD5"); + String localChecksum; + + try (FileInputStream fis = new FileInputStream(tempBagFile.toFile()); + DigestInputStream dis = new DigestInputStream(fis, messageDigest)) { + + logger.fine("Uploading bag to GoogleCloud: " + bagFileName); + + Blob bag = bucket.create(bagFileName, dis, "application/zip", + Bucket.BlobWriteOption.doesNotExist()); + + if (bag.getSize() == 0) { + throw new IOException("Uploaded bag has zero size"); + } + + // Get checksum after upload completes + localChecksum = Hex.encodeHexString(dis.getMessageDigest().digest()); + String remoteChecksum = bag.getMd5ToHexString(); + + logger.fine("Bag: " + bagFileName + " uploaded"); + logger.fine("Local checksum: " + localChecksum); + logger.fine("Remote checksum: " + remoteChecksum); + + if (!localChecksum.equals(remoteChecksum)) { + logger.severe("Bag checksum mismatch!"); + logger.severe("Local: " + localChecksum + " != Remote: " + remoteChecksum); + try { + bag.delete(Blob.BlobSourceOption.generationMatch()); + } catch (StorageException se) { + logger.warning(se.getMessage()); + } + return new Failure("Error in transferring Zip file to GoogleCloud", + "GoogleCloud Submission Failure: bag checksum mismatch"); + } + } + + logger.fine("GoogleCloud Submission step: Content Transferred Successfully"); - StringBuffer sb = new StringBuffer("https://console.cloud.google.com/storage/browser/"); - sb.append(bucketName + "/" + spaceName); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, sb.toString()); - + // Now upload any files that were too large for the bag + for (FileEntry entry : bagger.getOversizedFiles()) { + String childPath = entry.getChildPath(entry.getChildTitle()); + String fileKey = spaceName + "/" + childPath; + 
logger.fine("Uploading oversized file to GoogleCloud: " + fileKey); + messageDigest = MessageDigest.getInstance("MD5"); + InputStreamSupplier supplier = bagger.getInputStreamSupplier(entry.getDataUrl()); + try (InputStream is = supplier.get(); + DigestInputStream dis = new DigestInputStream(is, messageDigest)) { + Blob oversizedFileBlob = bucket.create(fileKey, dis, Bucket.BlobWriteOption.doesNotExist()); + if (oversizedFileBlob.getSize() == 0) { + throw new IOException("Uploaded oversized file has zero size: " + fileKey); + } + localChecksum = Hex.encodeHexString(dis.getMessageDigest().digest()); + String remoteChecksum = oversizedFileBlob.getMd5ToHexString(); + logger.fine("Oversized file: " + fileKey + " uploaded"); + logger.fine("Local checksum: " + localChecksum); + logger.fine("Remote checksum: " + remoteChecksum); + if (!localChecksum.equals(remoteChecksum)) { + logger.severe("Oversized file checksum mismatch!"); + logger.severe("Local: " + localChecksum + " != Remote: " + remoteChecksum); + try { + oversizedFileBlob.delete(Blob.BlobSourceOption.generationMatch()); + } catch (StorageException se) { + logger.warning(se.getMessage()); + } + return new Failure("Error in transferring oversized file to GoogleCloud", + "GoogleCloud Submission Failure: oversized file transfer incomplete"); + } + } catch (IOException e) { + logger.warning("Failed to upload oversized file: " + childPath + " : " + e.getMessage()); + return new Failure("Error uploading oversized file to Google Cloud: " + childPath); } - } else { - logger.warning("GoogleCloud Submision Workflow aborted: Dataset locked for pidRegister"); - return new Failure("Dataset locked"); } + + // Document the location of dataset archival copy location (actually the URL + // to the bucket). 
+ statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + String.format("https://storage.cloud.google.com/%s/%s", bucketName, spaceName)); + } catch (Exception e) { logger.warning(e.getLocalizedMessage()); e.printStackTrace(); return new Failure("GoogleCloud Submission Failure", - e.getLocalizedMessage() + ": check log for details"); + e.getLocalizedMessage() + ": check log for details"); } finally { + if (tempBagFile != null) { + try { + Files.deleteIfExists(tempBagFile); + } catch (IOException e) { + logger.warning("Failed to delete temporary bag file: " + tempBagFile + " : " + e.getMessage()); + } + } dv.setArchivalCopyLocation(statusObject.build().toString()); } return WorkflowStepResult.OK; } else { - return new Failure("GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\"."); + return new Failure( + "GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\"."); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java index 462879f2ec9..1bf1e1be48d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java @@ -2,7 +2,6 @@ import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetVersion; -import edu.harvard.iq.dataverse.DatasetLock.Reason; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; import edu.harvard.iq.dataverse.engine.command.Command; @@ -10,7 +9,8 @@ import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import static 
edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagItLocalPath; import edu.harvard.iq.dataverse.util.bagit.BagGenerator; -import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; @@ -19,10 +19,12 @@ import java.util.logging.Logger; import jakarta.json.Json; +import jakarta.json.JsonObject; import jakarta.json.JsonObjectBuilder; import java.io.File; import java.io.FileOutputStream; +import java.io.InputStream; import org.apache.commons.io.FileUtils; @@ -35,62 +37,115 @@ public LocalSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion ver super(aRequest, version); } + public static boolean supportsDelete() { + return true; + } + @Override + public boolean canDelete() { + return supportsDelete(); + } + @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { - logger.fine("In LocalCloudSubmitToArchive..."); + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dataciteXml, JsonObject ore, + Map terms, ApiToken token, Map requestedSettings) { + logger.fine("In LocalSubmitToArchive..."); String localPath = requestedSettings.get(BagItLocalPath.toString()); String zipName = null; - - //Set a failure status that will be updated if we succeed + + // Set a failure status that will be updated if we succeed JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); - + try { Dataset dataset = dv.getDataset(); - if (dataset.getLockFor(Reason.finalizePublication) == null - && dataset.getLockFor(Reason.FileValidationFailed) == null) { - - String spaceName = 
dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') - .replace('.', '-').toLowerCase(); - - String dataciteXml = getDataCiteXml(dv); - - FileUtils.writeStringToFile( - new File(localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"), - dataciteXml, StandardCharsets.UTF_8); - BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); - bagger.setNumConnections(getNumberOfBagGeneratorThreads()); - bagger.setAuthenticationKey(token.getTokenString()); - zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip"; - //ToDo: generateBag(File f, true) seems to do the same thing (with a .tmp extension) - since we don't have to use a stream here, could probably just reuse the existing code? - bagger.generateBag(new FileOutputStream(zipName + ".partial")); - - File srcFile = new File(zipName + ".partial"); - File destFile = new File(zipName); - - if (srcFile.renameTo(destFile)) { - logger.fine("Localhost Submission step: Content Transferred"); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "file://" + zipName); + String spaceName = getSpaceName(dataset); + + // Define file paths + String dataciteFileName = localPath + "/" + getDataCiteFileName(spaceName, dv) + ".xml"; + zipName = localPath + "/" + getFileName(spaceName, dv) + ".zip"; + + // Check for and delete existing files for this version + logger.fine("Checking for existing files in archive..."); + + File existingDatacite = new File(dataciteFileName); + if (existingDatacite.exists()) { + logger.fine("Found existing datacite.xml, deleting: " + dataciteFileName); + if (existingDatacite.delete()) { + logger.fine("Deleted existing datacite.xml"); + } else { + logger.warning("Failed to delete existing datacite.xml: " + dataciteFileName); + } + } + + File existingBag = new File(zipName); + if (existingBag.exists()) { + 
logger.fine("Found existing bag file, deleting: " + zipName); + if (existingBag.delete()) { + logger.fine("Deleted existing bag file"); + } else { + logger.warning("Failed to delete existing bag file: " + zipName); + } + } + + // Also check for and delete the .partial file if it exists + File existingPartial = new File(zipName + ".partial"); + if (existingPartial.exists()) { + logger.fine("Found existing partial bag file, deleting: " + zipName + ".partial"); + if (existingPartial.delete()) { + logger.fine("Deleted existing partial bag file"); } else { - logger.warning("Unable to move " + zipName + ".partial to " + zipName); + logger.warning("Failed to delete existing partial bag file: " + zipName + ".partial"); + } + } + + // Write datacite.xml file + FileUtils.writeStringToFile(new File(dataciteFileName), dataciteXml, StandardCharsets.UTF_8); + logger.fine("Datacite XML written to: " + dataciteFileName); + + // Generate bag + BagGenerator bagger = new BagGenerator(ore, dataciteXml, terms); + bagger.setAuthenticationKey(token.getTokenString()); + + boolean bagSuccess = bagger.generateBag(new FileOutputStream(zipName + ".partial")); + + if (!bagSuccess) { + logger.severe("Bag generation failed for " + zipName); + return new Failure("Local Submission Failure", "Bag generation failed"); + } + // Now download any files that were too large for the bag + for (FileEntry entry : bagger.getOversizedFiles()) { + String childPath = entry.getChildPath(entry.getChildTitle()); + File destFile = new File(localPath, + localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + "/" + childPath); + logger.fine("Downloading oversized file to " + destFile.getAbsolutePath()); + destFile.getParentFile().mkdirs(); + try (InputStream is = bagger.getInputStreamSupplier(entry.getDataUrl()).get()) { + FileUtils.copyInputStreamToFile(is, destFile); } + } + + File srcFile = new File(zipName + ".partial"); + File destFile = new File(zipName); + + if (srcFile.renameTo(destFile)) { + 
logger.fine("Localhost Submission step: Content Transferred to " + zipName); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "file://" + zipName); } else { - logger.warning( - "Localhost Submision Workflow aborted: Dataset locked for finalizePublication, or because file validation failed"); - return new Failure("Dataset locked"); + logger.severe("Unable to move " + zipName + ".partial to " + zipName); + return new Failure("Local Submission Failure", "Unable to rename partial file to final file"); } } catch (Exception e) { logger.warning("Failed to archive " + zipName + " : " + e.getLocalizedMessage()); e.printStackTrace(); + return new Failure("Local Submission Failure", e.getLocalizedMessage() + ": check log for details"); } finally { dv.setArchivalCopyLocation(statusObject.build().toString()); } - + return WorkflowStepResult.OK; } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java index 915ef6ea2a1..b0a5b9cd3a0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java @@ -12,11 +12,13 @@ import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException; import edu.harvard.iq.dataverse.util.BundleUtil; import edu.harvard.iq.dataverse.workflow.Workflow; +import edu.harvard.iq.dataverse.workflow.WorkflowContext; import edu.harvard.iq.dataverse.workflow.WorkflowContext.TriggerType; import jakarta.persistence.OptimisticLockException; import java.util.Optional; +import java.util.logging.Level; import java.util.logging.Logger; import static java.util.stream.Collectors.joining; import static edu.harvard.iq.dataverse.engine.command.impl.PublishDatasetResult.Status; @@ -106,20 +108,17 @@ 
public PublishDatasetResult execute(CommandContext ctxt) throws CommandException } } - //ToDo - should this be in onSuccess()? May relate to todo above Optional prePubWf = ctxt.workflows().getDefaultWorkflow(TriggerType.PrePublishDataset); - if ( prePubWf.isPresent() ) { + if (prePubWf.isPresent()) { // We start a workflow try { theDataset = ctxt.em().merge(theDataset); ctxt.em().flush(); - ctxt.workflows().start(prePubWf.get(), - buildContext(theDataset, TriggerType.PrePublishDataset, datasetExternallyReleased), true); + return new PublishDatasetResult(theDataset, Status.Workflow); } catch (OptimisticLockException e) { throw new CommandException(e.getMessage(), e, this); } - } else{ // We will skip trying to register the global identifiers for datafiles // if "dependent" file-level identifiers are requested, AND the naming @@ -131,7 +130,7 @@ public PublishDatasetResult execute(CommandContext ctxt) throws CommandException // than the configured limit number of files, then call Finalize // asychronously (default is 10) // ... - // Additionaly in 4.9.3 we have added a system variable to disable + // Additionally in 4.9.3 we have added a system variable to disable // registering file PIDs on the installation level. boolean registerGlobalIdsForFiles = ctxt.systemConfig().isFilePIDsEnabledForCollection(getDataset().getOwner()) && @@ -257,10 +256,22 @@ public boolean onSuccess(CommandContext ctxt, Object r) { dataset = ((PublishDatasetResult) r).getDataset(); } + final Dataset ds = dataset; + if (dataset != null) { + Optional prePubWf = ctxt.workflows().getDefaultWorkflow(TriggerType.PrePublishDataset); - //A pre-publication workflow will call FinalizeDatasetPublicationCommand itself when it completes - if (! 
prePubWf.isPresent() ) { + // A pre-publication workflow will call FinalizeDatasetPublicationCommand itself when it completes + if (prePubWf.isPresent()) { + WorkflowContext context = buildContext(ds, TriggerType.PrePublishDataset, datasetExternallyReleased); + try { + ctxt.workflows().start(prePubWf.get(), context, true); + } catch (CommandException e) { + logger.log(Level.SEVERE, "Error invoking pre-publish workflow: " + e.getMessage(), e); + return false; + } + } + else { logger.fine("From onSuccess, calling FinalizeDatasetPublicationCommand for dataset " + dataset.getGlobalId().asString()); ctxt.datasets().callFinalizePublishCommandAsynchronously(dataset.getId(), ctxt, request, datasetExternallyReleased); } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java index 65531d775c8..9d79d813476 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java @@ -2,24 +2,28 @@ import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetVersion; -import edu.harvard.iq.dataverse.DatasetLock.Reason; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.S3ArchiverConfig; import edu.harvard.iq.dataverse.util.bagit.BagGenerator; -import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry; import edu.harvard.iq.dataverse.util.json.JsonUtil; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.step.Failure; import 
edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; -import java.io.ByteArrayInputStream; import java.io.File; -import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.util.List; import java.util.Map; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.logging.Level; import java.util.logging.Logger; import jakarta.annotation.Resource; @@ -28,6 +32,7 @@ import jakarta.json.JsonObject; import jakarta.json.JsonObjectBuilder; +import org.apache.commons.compress.parallel.InputStreamSupplier; import org.eclipse.microprofile.config.Config; import org.eclipse.microprofile.config.ConfigProvider; @@ -38,25 +43,25 @@ import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider; import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; import software.amazon.awssdk.core.async.AsyncRequestBody; -import software.amazon.awssdk.core.sync.RequestBody; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.s3.S3AsyncClient; import software.amazon.awssdk.services.s3.S3AsyncClientBuilder; -import software.amazon.awssdk.services.s3.S3Client; -import software.amazon.awssdk.services.s3.model.GetObjectAttributesRequest; -import software.amazon.awssdk.services.s3.model.GetObjectAttributesResponse; -import software.amazon.awssdk.services.s3.model.ObjectAttributes; +import software.amazon.awssdk.services.s3.model.DeleteObjectRequest; +import software.amazon.awssdk.services.s3.model.DeleteObjectResponse; +import software.amazon.awssdk.services.s3.model.HeadObjectRequest; +import software.amazon.awssdk.services.s3.model.NoSuchKeyException; import software.amazon.awssdk.services.s3.model.PutObjectRequest; import software.amazon.awssdk.services.s3.model.PutObjectResponse; -import software.amazon.awssdk.services.s3.S3ClientBuilder; -import 
software.amazon.awssdk.services.s3.S3Configuration; import software.amazon.awssdk.http.async.SdkAsyncHttpClient; import software.amazon.awssdk.http.nio.netty.NettyNioAsyncHttpClient; import software.amazon.awssdk.utils.StringUtils; import software.amazon.awssdk.transfer.s3.S3TransferManager; import software.amazon.awssdk.transfer.s3.model.CompletedFileUpload; +import software.amazon.awssdk.transfer.s3.model.CompletedUpload; import software.amazon.awssdk.transfer.s3.model.FileUpload; +import software.amazon.awssdk.transfer.s3.model.Upload; import software.amazon.awssdk.transfer.s3.model.UploadFileRequest; +import software.amazon.awssdk.transfer.s3.model.UploadRequest; @RequiredPermissions(Permission.PublishDataset) public class S3SubmitToArchiveCommand extends AbstractSubmitToArchiveCommand { @@ -70,16 +75,24 @@ public class S3SubmitToArchiveCommand extends AbstractSubmitToArchiveCommand { private static final Config config = ConfigProvider.getConfig(); protected S3AsyncClient s3 = null; private S3TransferManager tm = null; - private String spaceName = null; + protected String bucketName = null; public S3SubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { super(aRequest, version); } + public static boolean supportsDelete() { + return true; + } + @Override + public boolean canDelete() { + return supportsDelete(); + } + @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, String dataciteXml, JsonObject ore, + Map terms, ApiToken token, Map requestedSettings) { logger.fine("In S3SubmitToArchiveCommand..."); JsonObject configObject = null; @@ -98,83 +111,171 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); 
statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); + ExecutorService executor = Executors.newCachedThreadPool(); try { Dataset dataset = dv.getDataset(); - if (dataset.getLockFor(Reason.finalizePublication) == null) { + spaceName = getSpaceName(dataset); + + // Define keys for datacite.xml and bag file + String dcKey = spaceName + "/" + getDataCiteFileName(spaceName, dv) + ".xml"; + String bagKey = spaceName + "/" + getFileName(spaceName, dv) + ".zip"; + + // Check for and delete existing files for this version + logger.fine("Checking for existing files in archive..."); + + try { + HeadObjectRequest headDcRequest = HeadObjectRequest.builder() + .bucket(bucketName) + .key(dcKey) + .build(); + + s3.headObject(headDcRequest).join(); + + // If we get here, the object exists, so delete it + logger.fine("Found existing datacite.xml, deleting: " + dcKey); + DeleteObjectRequest deleteDcRequest = DeleteObjectRequest.builder() + .bucket(bucketName) + .key(dcKey) + .build(); - spaceName = getSpaceName(dataset); - String dataciteXml = getDataCiteXml(dv); - // Add datacite.xml file - String dcKey = spaceName + "/" + getDataCiteFileName(spaceName, dv) + ".xml"; + CompletableFuture deleteDcFuture = s3.deleteObject(deleteDcRequest); + DeleteObjectResponse deleteDcResponse = deleteDcFuture.join(); - PutObjectRequest putRequest = PutObjectRequest.builder() - .bucket(bucketName) - .key(dcKey) - .build(); + if (deleteDcResponse.sdkHttpResponse().isSuccessful()) { + logger.fine("Deleted existing datacite.xml"); + } else { + logger.warning("Failed to delete existing datacite.xml: " + dcKey); + } + } catch (Exception e) { + if (e.getCause() instanceof NoSuchKeyException) { + logger.fine("No existing datacite.xml found"); + } else { + logger.warning("Error checking/deleting existing datacite.xml: " + e.getMessage()); + } + } - CompletableFuture putFuture = s3.putObject(putRequest, - AsyncRequestBody.fromString(dataciteXml, StandardCharsets.UTF_8)); + try { + 
HeadObjectRequest headBagRequest = HeadObjectRequest.builder() + .bucket(bucketName) + .key(bagKey) + .build(); - // Wait for the put operation to complete - PutObjectResponse putResponse = putFuture.join(); + s3.headObject(headBagRequest).join(); - if (!putResponse.sdkHttpResponse().isSuccessful()) { - logger.warning("Could not write datacite xml to S3"); - return new Failure("S3 Archiver failed writing datacite xml file"); + // If we get here, the object exists, so delete it + logger.fine("Found existing bag file, deleting: " + bagKey); + DeleteObjectRequest deleteBagRequest = DeleteObjectRequest.builder() + .bucket(bucketName) + .key(bagKey) + .build(); + + CompletableFuture deleteBagFuture = s3.deleteObject(deleteBagRequest); + DeleteObjectResponse deleteBagResponse = deleteBagFuture.join(); + + if (deleteBagResponse.sdkHttpResponse().isSuccessful()) { + logger.fine("Deleted existing bag file"); + } else { + logger.warning("Failed to delete existing bag file: " + bagKey); + } + } catch (Exception e) { + if (e.getCause() instanceof NoSuchKeyException) { + logger.fine("No existing bag file found"); + } else { + logger.warning("Error checking/deleting existing bag file: " + e.getMessage()); } + } + + // Add datacite.xml file + PutObjectRequest putRequest = PutObjectRequest.builder() + .bucket(bucketName) + .key(dcKey) + .build(); + + CompletableFuture putFuture = s3.putObject(putRequest, + AsyncRequestBody.fromString(dataciteXml, StandardCharsets.UTF_8)); + + // Wait for the put operation to complete + PutObjectResponse putResponse = putFuture.join(); + + if (!putResponse.sdkHttpResponse().isSuccessful()) { + logger.warning("Could not write datacite xml to S3"); + return new Failure("S3 Archiver failed writing datacite xml file"); + } + + // Store BagIt file + String fileName = getFileName(spaceName, dv); + + // Generate bag + BagGenerator bagger = new BagGenerator(ore, dataciteXml, terms); + bagger.setAuthenticationKey(token.getTokenString()); + if 
(bagger.generateBag(fileName, false)) { + File bagFile = bagger.getBagFile(fileName); + + UploadFileRequest uploadFileRequest = UploadFileRequest.builder() + .putObjectRequest(req -> req.bucket(bucketName).key(bagKey)).source(bagFile.toPath()) + .build(); + + FileUpload fileUpload = tm.uploadFile(uploadFileRequest); - // Store BagIt file - String fileName = getFileName(spaceName, dv); + CompletedFileUpload uploadResult = fileUpload.completionFuture().join(); - String bagKey = spaceName + "/" + fileName + ".zip"; - // Add BagIt ZIP file - // Google uses MD5 as one way to verify the - // transfer + if (uploadResult.response().sdkHttpResponse().isSuccessful()) { + logger.fine("S3 Submission step: Content Transferred"); - // Generate bag - BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); - bagger.setAuthenticationKey(token.getTokenString()); - if (bagger.generateBag(fileName, false)) { - File bagFile = bagger.getBagFile(fileName); + List bigFiles = bagger.getOversizedFiles(); - UploadFileRequest uploadFileRequest = UploadFileRequest.builder() - .putObjectRequest(req -> req.bucket(bucketName).key(bagKey)).source(bagFile.toPath()) - .build(); + for (FileEntry entry : bigFiles) { + String childPath = entry.getChildPath(entry.getChildTitle()); + String fileKey = spaceName + "/" + childPath; + InputStreamSupplier supplier = bagger.getInputStreamSupplier(entry.getDataUrl()); + try (InputStream is = supplier.get()) { - FileUpload fileUpload = tm.uploadFile(uploadFileRequest); + PutObjectRequest filePutRequest = PutObjectRequest.builder().bucket(bucketName) + .key(fileKey).build(); - CompletedFileUpload uploadResult = fileUpload.completionFuture().join(); + UploadRequest uploadRequest = UploadRequest.builder().putObjectRequest(filePutRequest) + .requestBody(AsyncRequestBody.fromInputStream(is, entry.getSize(), executor)) + .build(); - if (uploadResult.response().sdkHttpResponse().isSuccessful()) { - logger.fine("S3 Submission step: Content 
Transferred"); + Upload upload = tm.upload(uploadRequest); + CompletedUpload completedUpload = upload.completionFuture().join(); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, - String.format("https://%s.s3.amazonaws.com/%s", bucketName, bagKey)); - } else { - logger.severe("Error sending file to S3: " + fileName); - return new Failure("Error in transferring Bag file to S3", - "S3 Submission Failure: incomplete transfer"); + if (completedUpload.response().sdkHttpResponse().isSuccessful()) { + logger.fine("Successfully uploaded oversized file: " + fileKey); + } else { + logger.warning("Failed to upload oversized file: " + fileKey); + return new Failure("Error uploading oversized file to S3: " + fileKey); + } + } catch (IOException e) { + logger.log(Level.WARNING, "Failed to get input stream for oversized file: " + fileKey, + e); + return new Failure("Error getting input stream for oversized file: " + fileKey); + } } + + statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + String.format("https://%s.s3.amazonaws.com/%s", bucketName, bagKey)); } else { - logger.warning("Could not write local Bag file " + fileName); - return new Failure("S3 Archiver fail writing temp local bag"); + logger.severe("Error sending file to S3: " + fileName); + return new Failure("Error in transferring Bag file to S3", + "S3 Submission Failure: incomplete transfer"); } - } else { - logger.warning( - "S3 Archiver Submision Workflow aborted: Dataset locked for publication/pidRegister"); - return new Failure("Dataset locked"); + logger.warning("Could not write local Bag file " + fileName); + return new Failure("S3 Archiver fail writing temp local bag"); } + } catch (Exception e) { logger.warning(e.getLocalizedMessage()); e.printStackTrace(); return new Failure("S3 Archiver Submission Failure", - 
e.getLocalizedMessage() + ": check log for details"); + e.getLocalizedMessage() + ": check log for details"); } finally { + executor.shutdown(); if (tm != null) { tm.close(); } @@ -183,24 +284,8 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t return WorkflowStepResult.OK; } else { return new Failure( - "S3 Submission not configured - no \":S3ArchivalProfile\" and/or \":S3ArchivalConfig\" or no bucket-name defined in config."); - } - } - - protected String getDataCiteFileName(String spaceName, DatasetVersion dv) { - return spaceName + "_datacite.v" + dv.getFriendlyVersionNumber(); - } - - protected String getFileName(String spaceName, DatasetVersion dv) { - return spaceName + ".v" + dv.getFriendlyVersionNumber(); - } - - protected String getSpaceName(Dataset dataset) { - if (spaceName == null) { - spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-') - .toLowerCase(); + "S3 Submission not configured - no \":S3ArchivalProfile\" and/or \":S3ArchivalConfig\" or no bucket-name defined in config."); } - return spaceName; } private S3AsyncClient createClient(JsonObject configObject) { @@ -232,11 +317,11 @@ private S3AsyncClient createClient(JsonObject configObject) { String accessKey = config.getOptionalValue("dataverse.s3archiver.access-key", String.class).orElse(""); String secretKey = config.getOptionalValue("dataverse.s3archiver.secret-key", String.class).orElse(""); AwsCredentialsProvider staticCredentials = StaticCredentialsProvider - .create(AwsBasicCredentials.create(accessKey, secretKey)); + .create(AwsBasicCredentials.create(accessKey, secretKey)); AwsCredentialsProvider credentialsProviderChain = AwsCredentialsProviderChain.builder() - .addCredentialsProvider(profileCredentials).addCredentialsProvider(staticCredentials) - .addCredentialsProvider(DefaultCredentialsProvider.create()).build(); + .addCredentialsProvider(profileCredentials).addCredentialsProvider(staticCredentials) + 
.addCredentialsProvider(DefaultCredentialsProvider.create()).build(); s3CB.credentialsProvider(credentialsProviderChain); s3 = s3CB.build(); diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java index cc15d4c978b..1128746c06b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java @@ -26,6 +26,7 @@ import static jakarta.ejb.TransactionAttributeType.REQUIRES_NEW; import jakarta.inject.Named; import jakarta.persistence.EntityManager; +import jakarta.persistence.OptimisticLockException; import jakarta.persistence.PersistenceContext; import jakarta.persistence.TypedQuery; import jakarta.persistence.TemporalType; @@ -262,7 +263,9 @@ public void exportAllFormatsInNewTransaction(Dataset dataset) throws ExportExcep try { ExportService exportServiceInstance = ExportService.getInstance(); exportServiceInstance.exportAllFormats(dataset); - dataset = datasetService.merge(dataset); + datasetService.setLastExportTimeInNewTransaction(dataset.getId(), dataset.getLastExportTime()); + } catch (OptimisticLockException ole) { + datasetService.setLastExportTimeInNewTransaction(dataset.getId(), dataset.getLastExportTime()); } catch (Exception e) { logger.log(Level.FINE, "Caught unknown exception while trying to export", e); throw new ExportException(e.getMessage()); diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 63ae3625a96..0d0687507df 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -165,6 +165,9 @@ public class IndexServiceBean { @EJB DatasetFieldServiceBean datasetFieldService; + @EJB + IndexServiceBean self; + @Inject 
DatasetVersionFilesServiceBean datasetVersionFilesServiceBean; @@ -502,7 +505,7 @@ public void indexDvObject(DvObject objectIn) throws SolrServerException, IOExce public void indexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) throws SolrServerException, IOException { doIndexDataset(dataset, doNormalSolrDocCleanUp); - updateLastIndexedTime(dataset.getId()); + self.updateLastIndexedTime(dataset.getId()); } private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) throws SolrServerException, IOException { @@ -1874,15 +1877,15 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d } @Asynchronous - private void updateLastIndexedTime(Long id) { + public void updateLastIndexedTime(Long id) { // indexing is often in a transaction with update statements // if we flush on query (flush-mode auto), we want to prevent locking // -> update the dataset asynchronously in a new transaction - updateLastIndexedTimeInNewTransaction(id); + self.updateLastIndexedTimeInNewTransaction(id); } @TransactionAttribute(REQUIRES_NEW) - private void updateLastIndexedTimeInNewTransaction(Long id) { + public void updateLastIndexedTimeInNewTransaction(Long id) { /// Dataset updatedDataset = /// (Dataset)dvObjectService.updateContentIndexTime(dataset); /// updatedDataset = null; diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java index 2e86fae610e..fdbdb257dbe 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java @@ -249,6 +249,19 @@ public enum FeatureFlags { * @since Dataverse 6.9 */ ONLY_UPDATE_DATACITE_WHEN_NEEDED("only-update-datacite-when-needed"), + /** + * Indicates whether archival bag creation should be triggered (if configured) when a version + * is updated and was already successfully archived, i.e via the Update-Current-Version publication 
option. + * Since archiving can be resource intensive, it may not be worthwhile to automatically re-archive for the + * types of minor changes "Update-Current-Version" is intended for. Note that this flag is only effective + * for archivers that support deletion of existing files. When the flag is false, or the archiver cannot + * delete, the existing archival status will be changed to "Obsolete". + * + * @apiNote Raise flag by setting "dataverse.feature.archive-on-version-update" + * + * @since Dataverse 6.10 + */ + ARCHIVE_ON_VERSION_UPDATE("archive-on-version-update"), ; diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index 8816ffb52c4..1799b2ef6d5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -275,6 +275,11 @@ public enum JvmSettings { BAGIT_SOURCE_ORG_NAME(SCOPE_BAGIT_SOURCEORG, "name"), BAGIT_SOURCEORG_ADDRESS(SCOPE_BAGIT_SOURCEORG, "address"), BAGIT_SOURCEORG_EMAIL(SCOPE_BAGIT_SOURCEORG, "email"), + SCOPE_BAGIT_ZIP(SCOPE_BAGIT, "zip"), + BAGIT_ZIP_MAX_FILE_SIZE(SCOPE_BAGIT_ZIP, "max-file-size"), + BAGIT_ZIP_MAX_DATA_SIZE(SCOPE_BAGIT_ZIP, "max-data-size"), + BAGIT_ZIP_HOLEY(SCOPE_BAGIT_ZIP, "holey"), + BAGIT_ARCHIVE_ON_VERSION_UPDATE(SCOPE_BAGIT, "archive-on-version-update"), // STORAGE USE SETTINGS SCOPE_STORAGEUSE(PREFIX, "storageuse"), diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index 3786c7222a3..e8ab462e8e9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -485,6 +485,12 @@ Whether Harvesting (OAI) service is enabled */ ArchiverClassName, + /* + * Only create an archival Bag for a dataset version if all prior versions have
+ * been successfully archived + */ + ArchiveOnlyIfEarlierVersionsAreArchived, + /** * Custom settings for each archiver. See list below. */ @@ -800,16 +806,13 @@ public static SettingsServiceBean.Key parse(String key) { // Cut off the ":" we verified is present before String normalizedKey = key.substring(1); - // Iterate through all the known keys and return on match (case sensitive!) // We are case sensitive here because Dataverse implicitely uses case sensitive keys everywhere! - for (SettingsServiceBean.Key k : SettingsServiceBean.Key.values()) { - if (k.name().equals(normalizedKey)) { - return k; - } + try { + return SettingsServiceBean.Key.valueOf(normalizedKey); + } catch (IllegalArgumentException e) { + // Fall through on no match - return null for invalid keys + return null; } - - // Fall through on no match - return null; } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java index 18ec6243d5a..7d03004f3f7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java @@ -71,5 +71,16 @@ public static boolean isSomeVersionArchived(Dataset dataset) { return someVersionArchived; } + + /** + * Checks if a version has been successfully archived. 
+ * + * @param version the version to check + * @return true if the version has been successfully archived, false otherwise + */ + public static boolean isVersionArchived(DatasetVersion version) { + String status = version.getArchivalCopyLocationStatus(); + return status != null && status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + } } \ No newline at end of file diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index bf48787cdd7..f0f863dcd38 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -23,10 +23,13 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.Map.Entry; @@ -48,7 +51,6 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.parallel.InputStreamSupplier; -import org.apache.commons.compress.utils.IOUtils; import org.apache.hc.client5.http.ClientProtocolException; import org.apache.hc.client5.http.classic.methods.HttpGet; import org.apache.hc.client5.http.config.RequestConfig; @@ -76,6 +78,7 @@ import com.google.gson.JsonSyntaxException; import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.DatasetFieldConstant; import edu.harvard.iq.dataverse.DataFile.ChecksumType; import edu.harvard.iq.dataverse.pidproviders.PidUtil; import edu.harvard.iq.dataverse.settings.JvmSettings; @@ -84,7 +87,6 @@ import edu.harvard.iq.dataverse.util.SystemConfig; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import 
jakarta.enterprise.inject.spi.CDI; -import java.util.Optional; /** * Creates an archival zipped Bag for long-term storage. It is intended to @@ -99,6 +101,10 @@ public class BagGenerator { private static final Logger logger = Logger.getLogger(BagGenerator.class.getCanonicalName()); + static final String CRLF = "\r\n"; + + protected static final int MAX_RETRIES = 5; + private ParallelScatterZipCreator scatterZipCreator = null; private ScatterZipOutputStream dirs = null; @@ -110,9 +116,9 @@ public class BagGenerator { private int timeout = 300; private RequestConfig config = RequestConfig.custom() - .setConnectionRequestTimeout(Timeout.ofSeconds(timeout)) - .setResponseTimeout(Timeout.ofSeconds(timeout)) - .build(); + .setConnectionRequestTimeout(Timeout.ofSeconds(timeout)) + .setResponseTimeout(Timeout.ofSeconds(timeout)) + .build(); protected CloseableHttpClient client; private PoolingHttpClientConnectionManager cm = null; @@ -137,13 +143,22 @@ public class BagGenerator { private boolean usetemp = false; + private Map terms; + private static int numConnections = 2; public static final String BAG_GENERATOR_THREADS = BagGeneratorThreads.toString(); - private OREMap oremap; - static PrintWriter pw = null; + // Size limits and holey Bags + private long maxDataFileSize = Long.MAX_VALUE; + private long maxTotalDataSize = Long.MAX_VALUE; + private long currentBagDataSize = 0; + private StringBuilder fetchFileContent = new StringBuilder(); + private boolean usingFetchFile = false; + private boolean createHoleyBag = false; + private List oversizedFiles = new ArrayList<>(); + // Bag-info.txt field labels private static final String CONTACT_NAME = "Contact-Name: "; private static final String CONTACT_EMAIL = "Contact-Email: "; @@ -156,9 +171,11 @@ public class BagGenerator { private static final String BAG_SIZE = "Bag-Size: "; private static final String PAYLOAD_OXUM = "Payload-Oxum: "; private static final String INTERNAL_SENDER_IDENTIFIER = "Internal-Sender-Identifier: "; - 
private static final String DATAVERSE_BAG_VERSION = "Dataverse-Bag-Version: "; - // Implement exponential backoff with jitter + /** THIS NUMBER SHOULD CHANGE ANY TIME THE BAG CONTENTS ARE CHANGED */ + private static final String DATAVERSE_BAG_VERSION = "Dataverse-Bag-Version: 1.0"; + + // Implement exponential backoff with jitter static final long baseWaitTimeMs = 1000; // Start with 1 second static final long maxWaitTimeMs = 30000; // Cap at 30 seconds @@ -175,15 +192,18 @@ public class BagGenerator { * and zipping are done in parallel, using a connection pool. The required space * on disk is ~ n+1/n of the final bag size, e.g. 125% of the bag size for a * 4-way parallel zip operation. + * @param oremapObject - OAI-ORE Map file as a JSON object + * @param dataciteXml - DataCite XML file as a string + * @param terms - Map of schema.org/terms to their corresponding JsonLDTerm objects * * @throws Exception * @throws JsonSyntaxException */ - public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxException, Exception { - this.oremap = oreMap; - this.oremapObject = oreMap.getOREMap(); + public BagGenerator(jakarta.json.JsonObject oremapObject, String dataciteXml, Map terms) throws JsonSyntaxException, Exception { + this.oremapObject = oremapObject; this.dataciteXml = dataciteXml; + this.terms = terms; try { /* @@ -206,23 +226,33 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio ); Registry registry = RegistryBuilder.create() - .register("http", PlainConnectionSocketFactory.getSocketFactory()) - .register("https", sslConnectionFactory).build(); + .register("http", PlainConnectionSocketFactory.getSocketFactory()) + .register("https", sslConnectionFactory).build(); cm = new PoolingHttpClientConnectionManager(registry); cm.setDefaultMaxPerRoute(numConnections); cm.setMaxTotal(numConnections > 20 ? 
numConnections : 20); client = HttpClients.custom() - .setConnectionManager(cm) - .setDefaultRequestConfig(config) - .build(); + .setConnectionManager(cm) + .setDefaultRequestConfig(config) + .build(); scatterZipCreator = new ParallelScatterZipCreator(Executors.newFixedThreadPool(numConnections)); } catch (NoSuchAlgorithmException | KeyManagementException e) { logger.warning("Failed to initialize HTTP client"); e.printStackTrace(); } + initializeHoleyBagLimits(); + } + + private void initializeHoleyBagLimits() { + this.maxDataFileSize = JvmSettings.BAGIT_ZIP_MAX_FILE_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + this.maxTotalDataSize = JvmSettings.BAGIT_ZIP_MAX_DATA_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + this.createHoleyBag = JvmSettings.BAGIT_ZIP_HOLEY.lookupOptional(Boolean.class).orElse(false); + logger.fine("BagGenerator size limits - maxDataFileSize: " + maxDataFileSize + + ", maxTotalDataSize: " + maxTotalDataSize + + ", createHoleyBag: " + createHoleyBag); } public void setIgnoreHashes(boolean val) { @@ -242,7 +272,7 @@ public static void println(String s) { /* * Full workflow to generate new BagIt bag from ORE Map Url and to write the bag * to the provided output stream (Ex: File OS, FTP OS etc.). 
- * + * * @return success true/false */ public boolean generateBag(OutputStream outputStream) throws Exception { @@ -252,7 +282,7 @@ public boolean generateBag(OutputStream outputStream) throws Exception { // The oremapObject is javax.json.JsonObject and we need // com.google.gson.JsonObject for the aggregation object aggregation = (JsonObject) JsonParser - .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); + .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); String pidUrlString = aggregation.get("@id").getAsString(); String pidString = PidUtil.parseAsGlobalID(pidUrlString).asString(); @@ -283,7 +313,15 @@ public boolean generateBag(OutputStream outputStream) throws Exception { resourceUsed = new Boolean[aggregates.size() + 1]; // Process current container (the aggregation itself) and its // children - processContainer(aggregation, currentPath); + // Recursively collect all files from the entire tree, start with an empty set of processedContainers + List allFiles = new ArrayList<>(); + collectAllFiles(aggregation, currentPath, allFiles, false); + + // Sort files by size (smallest first) + Collections.sort(allFiles); + + // Process all files in sorted order + processAllFiles(allFiles); } // Create manifest files // pid-mapping.txt - a DataOne recommendation to connect ids and @@ -292,7 +330,7 @@ public boolean generateBag(OutputStream outputStream) throws Exception { boolean first = true; for (Entry pidEntry : pidMap.entrySet()) { if (!first) { - pidStringBuffer.append("\r\n"); + pidStringBuffer.append(CRLF); } else { first = false; } @@ -307,14 +345,14 @@ public boolean generateBag(OutputStream outputStream) throws Exception { first = true; for (Entry sha1Entry : checksumMap.entrySet()) { if (!first) { - sha1StringBuffer.append("\r\n"); + sha1StringBuffer.append(CRLF); } else { first = false; } String path = sha1Entry.getKey(); sha1StringBuffer.append(sha1Entry.getValue() + " " + 
path); } - if (hashtype == null) { // No files - still want to send an empty manifest to nominally comply with BagIT specification requirement. + if(hashtype == null) { // No files - still want to send an empty manifest to nominally comply with BagIT specification requirement. try { // Use the current type if we can retrieve it hashtype = CDI.current().select(SystemConfig.class).get().getFileFixityChecksumAlgorithm(); @@ -364,6 +402,8 @@ public boolean generateBag(OutputStream outputStream) throws Exception { logger.fine("Creating bag: " + bagName); + writeFetchFile(); + ZipArchiveOutputStream zipArchiveOutputStream = new ZipArchiveOutputStream(outputStream); /* @@ -442,57 +482,54 @@ public boolean generateBag(String bagName, boolean temp) { public void validateBag(String bagId) { logger.info("Validating Bag"); - ZipFile zf = null; - InputStream is = null; try { File bagFile = getBagFile(bagId); - zf = ZipFile.builder().setFile(bagFile).get(); - ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); - if (entry != null) { - logger.info("SHA1 hashes used"); - hashtype = DataFile.ChecksumType.SHA1; - } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-sha512.txt"); + try (ZipFile zf = ZipFile.builder().setFile(bagFile).get()) { + ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); if (entry != null) { - logger.info("SHA512 hashes used"); - hashtype = DataFile.ChecksumType.SHA512; + logger.info("SHA1 hashes used"); + hashtype = DataFile.ChecksumType.SHA1; } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-sha256.txt"); + entry = zf.getEntry(getValidName(bagId) + "/manifest-sha512.txt"); if (entry != null) { - logger.info("SHA256 hashes used"); - hashtype = DataFile.ChecksumType.SHA256; + logger.info("SHA512 hashes used"); + hashtype = DataFile.ChecksumType.SHA512; } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-md5.txt"); + entry = zf.getEntry(getValidName(bagId) + 
"/manifest-sha256.txt"); if (entry != null) { - logger.info("MD5 hashes used"); - hashtype = DataFile.ChecksumType.MD5; + logger.info("SHA256 hashes used"); + hashtype = DataFile.ChecksumType.SHA256; + } else { + entry = zf.getEntry(getValidName(bagId) + "/manifest-md5.txt"); + if (entry != null) { + logger.info("MD5 hashes used"); + hashtype = DataFile.ChecksumType.MD5; + } } } } + if (entry == null) + throw new IOException("No manifest file found"); + try (InputStream is = zf.getInputStream(entry)) { + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + String line = br.readLine(); + while (line != null) { + logger.fine("Hash entry: " + line); + int breakIndex = line.indexOf(' '); + String hash = line.substring(0, breakIndex); + String path = line.substring(breakIndex + 1); + logger.fine("Adding: " + path + " with hash: " + hash); + checksumMap.put(path, hash); + line = br.readLine(); + } + } } - if (entry == null) - throw new IOException("No manifest file found"); - is = zf.getInputStream(entry); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - String line = br.readLine(); - while (line != null) { - logger.fine("Hash entry: " + line); - int breakIndex = line.indexOf(' '); - String hash = line.substring(0, breakIndex); - String path = line.substring(breakIndex + 1); - logger.fine("Adding: " + path + " with hash: " + hash); - checksumMap.put(path, hash); - line = br.readLine(); - } - IOUtils.closeQuietly(is); logger.info("HashMap Map contains: " + checksumMap.size() + " entries"); checkFiles(checksumMap, bagFile); } catch (IOException io) { logger.log(Level.SEVERE, "Could not validate Hashes", io); } catch (Exception e) { logger.log(Level.SEVERE, "Could not validate Hashes", e); - } finally { - IOUtils.closeQuietly(zf); } return; } @@ -515,7 +552,7 @@ public File getBagFile(String bagID) throws Exception { private void validateBagFile(File bagFile) throws IOException { // Run a confirmation test - should verify all files and 
hashes - + // Check files calculates the hashes and file sizes and reports on // whether hashes are correct checkFiles(checksumMap, bagFile); @@ -529,18 +566,31 @@ public static String getValidName(String bagName) { return bagName.replaceAll("\\W", "-"); } - private void processContainer(JsonObject item, String currentPath) throws IOException { + // Collect all files recursively and process containers to create dirs in the zip + private void collectAllFiles(JsonObject item, String currentPath, List allFiles, boolean addTitle) + throws IOException { JsonArray children = getChildren(item); - HashSet titles = new HashSet(); + + if (addTitle) { //For any sub-collections (non-Dataverse) + String title = null; + if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { + title = item.get("Title").getAsString(); + } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { + title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + } + logger.fine("Collecting files from " + title + "/ at path " + currentPath); + currentPath = currentPath + title + "/"; + } + // Mark this container as processed + String containerId = item.get("@id").getAsString(); + + // Create directory and update tracking for this container int containerIndex = -1; try { createDir(currentPath); - // Add containers to pid map and mark as 'used', but no sha1 hash - // value - containerIndex = getUnusedIndexOf(item.get("@id").getAsString()); + containerIndex = getUnusedIndexOf(containerId); resourceUsed[containerIndex] = true; - pidMap.put(item.get("@id").getAsString(), currentPath); - + pidMap.put(containerId, currentPath); } catch (InterruptedException | IOException | ExecutionException e) { e.printStackTrace(); logger.severe(e.getMessage()); @@ -548,8 +598,8 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce resourceUsed[containerIndex] = false; } throw new IOException("Unable to create bag"); - } + for (int i = 0; i < children.size(); i++) { // Find 
the ith child in the overall array of aggregated @@ -564,115 +614,188 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce // Aggregation is at index 0, so need to shift by 1 for aggregates // entries JsonObject child = aggregates.get(index - 1).getAsJsonObject(); + // Dataverse does not currently use containers - this is for other variants/future use if (childIsContainer(child)) { - // create dir and process children - // processContainer will mark this item as used - processContainer(child, currentPath); + // Recursively collect files from this container + collectAllFiles(child, currentPath, allFiles, true); } else { - resourceUsed[index] = true; - // add item - // ToDo - String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); - logger.fine("File url: " + dataUrl); - String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); - if (titles.contains(childTitle)) { - logger.warning("**** Multiple items with the same title in: " + currentPath); - logger.warning("**** Will cause failure in hash and size validation in: " + bagID); - } else { - titles.add(childTitle); + + // Get file size + Long fileSize = null; + if (child.has(JsonLDTerm.filesize.getLabel())) { + fileSize = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); } - String childPath = currentPath + childTitle; - JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); - if (directoryLabel != null) { - childPath = currentPath + directoryLabel.getAsString() + "/" + childTitle; + if (fileSize == null) { + logger.severe("File size missing for child: " + childId); + throw new IOException("Unable to create bag due to missing file size"); } - - - String childHash = null; - if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType - .fromUri(child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); - if (hashtype == null) { - // If one 
wasn't set as a default, pick up what the first child with one uses - hashtype = childHashType; - } - if (hashtype != null && !hashtype.equals(childHashType)) { - logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() - + " hashes for " + childTitle); - } else { - childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); - if (checksumMap.containsValue(childHash)) { - // Something else has this hash - logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + " has SHA1 Hash: " - + childHash + " in: " + bagID); - } - logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); - checksumMap.put(childPath, childHash); - } + + // Store minimal info for sorting - JsonObject is just a reference + allFiles.add(new FileEntry(fileSize, child, currentPath, index)); + } + } + } + + + // Process all files in sorted order + private void processAllFiles(List sortedFiles) + throws IOException, ExecutionException, InterruptedException { + + // Track titles to detect duplicates + Set titles = new HashSet<>(); + + if ((hashtype == null) | ignorehashes) { + hashtype = DataFile.ChecksumType.SHA512; + } + + for (FileEntry entry : sortedFiles) { + // Extract all needed information from the JsonObject reference + JsonObject child = entry.jsonObject; + + String childTitle = entry.getChildTitle(); + + // Check for duplicate titles + if (titles.contains(childTitle)) { + logger.warning("**** Multiple items with the same title in: " + entry.currentPath); + logger.warning("**** Will cause failure in hash and size validation in: " + bagID); + } else { + titles.add(childTitle); + } + + String childPath= entry.getChildPath(childTitle); + + // Get hash if exists + String childHash = null; + if (child.has(JsonLDTerm.checksum.getLabel())) { + ChecksumType childHashType = ChecksumType + .fromUri(child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); + if (hashtype == 
null) { + hashtype = childHashType; } - if ((hashtype == null) | ignorehashes) { - // Pick sha512 when ignoring hashes or none exist - hashtype = DataFile.ChecksumType.SHA512; + if (hashtype != null && !hashtype.equals(childHashType)) { + logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() + + " hashes for " + childTitle); + } else { + childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); } - try { - if ((childHash == null) | ignorehashes) { - // Generate missing hashInputStream inputStream = null; - try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()) { - - if (hashtype != null) { - if (hashtype.equals(DataFile.ChecksumType.SHA1)) { - childHash = DigestUtils.sha1Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.SHA256)) { - childHash = DigestUtils.sha256Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.SHA512)) { - childHash = DigestUtils.sha512Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.MD5)) { - childHash = DigestUtils.md5Hex(inputStream); - } - } + } - } catch (IOException e) { - logger.severe("Failed to read " + childPath); - throw e; - } - if (childHash != null) { - JsonObject childHashObject = new JsonObject(); - childHashObject.addProperty("@type", hashtype.toString()); - childHashObject.addProperty("@value", childHash); - child.add(JsonLDTerm.checksum.getLabel(), (JsonElement) childHashObject); + resourceUsed[entry.resourceIndex] = true; + String dataUrl = entry.getDataUrl(); - checksumMap.put(childPath, childHash); - } else { - logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); + try { + if ((childHash == null) | ignorehashes) { + // Generate missing hash + + try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()){ + if (hashtype != null) { + if (hashtype.equals(DataFile.ChecksumType.SHA1)) { + childHash = DigestUtils.sha1Hex(inputStream); + } else if 
(hashtype.equals(DataFile.ChecksumType.SHA256)) { + childHash = DigestUtils.sha256Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.SHA512)) { + childHash = DigestUtils.sha512Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.MD5)) { + childHash = DigestUtils.md5Hex(inputStream); + } } + + } catch (IOException e) { + logger.severe("Failed to read " + childPath); + throw e; } - logger.fine("Requesting: " + childPath + " from " + dataUrl); - createFileFromURL(childPath, dataUrl); - dataCount++; - if (dataCount % 1000 == 0) { - logger.info("Retrieval in progress: " + dataCount + " files retrieved"); + if (childHash != null) { + JsonObject childHashObject = new JsonObject(); + childHashObject.addProperty("@type", hashtype.toString()); + childHashObject.addProperty("@value", childHash); + child.add(JsonLDTerm.checksum.getLabel(), (JsonElement) childHashObject); + + checksumMap.put(childPath, childHash); + } else { + logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); } - if (child.has(JsonLDTerm.filesize.getLabel())) { - Long size = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); - totalDataSize += size; - if (size > maxFileSize) { - maxFileSize = size; - } + } else { + // Hash already exists, add to checksumMap + if (checksumMap.containsValue(childHash)) { + logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + + " has hash: " + childHash + " in: " + bagID); } - if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { - mimetypes.add(child.get(JsonLDTerm.schemaOrg("fileFormat").getLabel()).getAsString()); + logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); + checksumMap.put(childPath, childHash); + } + // Add file to bag or fetch file + if (!addToZip(entry.size)) { + if(createHoleyBag) { + logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl + + " (size: " + entry.size + " bytes)"); + addToFetchFile(dataUrl, entry.size, 
childPath); + usingFetchFile = true; + } else { + // Add to list for archiver to retrieve + oversizedFiles.add(entry); + logger.fine("Adding " + childPath + " to oversized files list for archiver"); } + } else { + logger.fine("Requesting: " + childPath + " from " + dataUrl + + " (size: " + entry.size + " bytes)"); + createFileFromURL(childPath, dataUrl); + currentBagDataSize += entry.size; + } - } catch (Exception e) { - resourceUsed[index] = false; - e.printStackTrace(); - throw new IOException("Unable to create bag"); + dataCount++; + if (dataCount % 1000 == 0) { + logger.info("Retrieval in progress: " + dataCount + " files retrieved"); } - // Check for nulls! - pidMap.put(child.get("@id").getAsString(), childPath); + totalDataSize += entry.size; + if (entry.size > maxFileSize) { + maxFileSize = entry.size; + } + + if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { + mimetypes.add(child.get(JsonLDTerm.schemaOrg("fileFormat").getLabel()).getAsString()); + } + } catch (Exception e) { + resourceUsed[entry.resourceIndex] = false; + e.printStackTrace(); + throw new IOException("Unable to create bag"); } + + pidMap.put(child.get("@id").getAsString(), childPath); + } + } + + // Helper method to determine if file should go to fetch file + private boolean addToZip(long fileSize) { + + // Check individual file size limit + if (fileSize > maxDataFileSize) { + logger.fine("File size " + fileSize + " exceeds max data file size " + maxDataFileSize); + return false; + } + + // Check total bag size limit + if (currentBagDataSize + fileSize > maxTotalDataSize) { + logger.fine("Adding file would exceed max total data size. 
Current: " + currentBagDataSize + + ", File: " + fileSize + ", Max: " + maxTotalDataSize); + return false; + } + + return true; + } + + // Method to append to fetch file content + private void addToFetchFile(String url, long size, String filename) { + // Format: URL size filename + fetchFileContent.append(url).append(" ").append(Long.toString(size)).append(" ").append(filename).append(CRLF); + } + + // Method to write fetch file to bag (call this before finalizing the bag) + private void writeFetchFile() throws IOException, ExecutionException, InterruptedException { + if (usingFetchFile && fetchFileContent.length() > 0) { + logger.info("Creating fetch.txt file for holey bag"); + createFileFromString("fetch.txt", fetchFileContent.toString()); } } @@ -719,7 +842,7 @@ public InputStream get() { } private void createFileFromString(final String relPath, final String content) - throws IOException, ExecutionException, InterruptedException { + throws IOException, ExecutionException, InterruptedException { ZipArchiveEntry archiveEntry = new ZipArchiveEntry(bagName + "/" + relPath); archiveEntry.setMethod(ZipEntry.DEFLATED); @@ -733,7 +856,7 @@ public InputStream get() { } private void createFileFromURL(final String relPath, final String uri) - throws IOException, ExecutionException, InterruptedException { + throws IOException, ExecutionException, InterruptedException { ZipArchiveEntry archiveEntry = new ZipArchiveEntry(bagName + "/" + relPath); archiveEntry.setMethod(ZipEntry.DEFLATED); @@ -782,7 +905,7 @@ public void addEntry(ZipArchiveEntry zipArchiveEntry, InputStreamSupplier stream } public void writeTo(ZipArchiveOutputStream zipArchiveOutputStream) - throws IOException, ExecutionException, InterruptedException { + throws IOException, ExecutionException, InterruptedException { logger.fine("Writing dirs"); dirs.writeTo(zipArchiveOutputStream); dirs.close(); @@ -791,8 +914,6 @@ public void writeTo(ZipArchiveOutputStream zipArchiveOutputStream) logger.fine("Files 
written"); } - static final String CRLF = "\r\n"; - private String generateInfoFile() { logger.fine("Generating info file"); StringBuffer info = new StringBuffer(); @@ -802,13 +923,13 @@ private String generateInfoFile() { * formal vocabulary and label in the oremap may change so we need to find the * labels used. */ - JsonLDTerm contactTerm = oremap.getContactTerm(); + JsonLDTerm contactTerm = terms.get(DatasetFieldConstant.datasetContact); if ((contactTerm != null) && aggregation.has(contactTerm.getLabel())) { JsonElement contacts = aggregation.get(contactTerm.getLabel()); - JsonLDTerm contactNameTerm = oremap.getContactNameTerm(); - JsonLDTerm contactEmailTerm = oremap.getContactEmailTerm(); - + JsonLDTerm contactNameTerm = terms.get(DatasetFieldConstant.datasetContactName); + JsonLDTerm contactEmailTerm = terms.get(DatasetFieldConstant.datasetContactEmail); + if (contacts.isJsonArray()) { JsonArray contactsArray = contacts.getAsJsonArray(); for (int i = 0; i < contactsArray.size(); i++) { @@ -852,7 +973,7 @@ private String generateInfoFile() { } String orgName = JvmSettings.BAGIT_SOURCE_ORG_NAME.lookupOptional(String.class) - .orElse("Dataverse Installation ()"); + .orElse("Dataverse Installation ()"); String orgAddress = JvmSettings.BAGIT_SOURCEORG_ADDRESS.lookupOptional(String.class).orElse(""); String orgEmail = JvmSettings.BAGIT_SOURCEORG_EMAIL.lookupOptional(String.class).orElse(""); @@ -873,13 +994,13 @@ private String generateInfoFile() { * a formal vocabulary and label in the oremap may change so we need to find the * labels used. 
*/ - JsonLDTerm descriptionTerm = oremap.getDescriptionTerm(); - JsonLDTerm descriptionTextTerm = oremap.getDescriptionTextTerm(); + JsonLDTerm descriptionTerm = terms.get(DatasetFieldConstant.description); + JsonLDTerm descriptionTextTerm = terms.get(DatasetFieldConstant.descriptionText); if (descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { info.append(multilineWrap(EXTERNAL_DESCRIPTION - + getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()))); + + getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()))); info.append(CRLF); } @@ -900,18 +1021,17 @@ private String generateInfoFile() { info.append(Long.toString(dataCount)); info.append(CRLF); - info.append("Internal-Sender-Identifier: "); String catalog = orgName + " Catalog"; if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } info.append(multilineWrap(INTERNAL_SENDER_IDENTIFIER + catalog + ":" - + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); + + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); info.append(CRLF); // Add a version number for our bag type - should be updated with any change to // the bag content/structure - info.append(DATAVERSE_BAG_VERSION + "1.0"); + info.append(DATAVERSE_BAG_VERSION); info.append(CRLF); return info.toString(); @@ -919,7 +1039,10 @@ private String generateInfoFile() { static private String multilineWrap(String value) { // Normalize line breaks and ensure all lines after the first are indented - String[] lines = value.split("\\r?\\n"); + // Handle various line separator characters: + // LF (U+000A), CR (U+000D), CR+LF, VT (U+000B), FF (U+000C), + // NEL (U+0085), LS (U+2028), PS (U+2029) + String[] lines = value.split("\\r\\n|\\r|\\n|\\u000B|\\u000C|\\u0085|\\u2028|\\u2029"); 
StringBuilder wrappedValue = new StringBuilder(); for (int i = 0; i < lines.length; i++) { // Skip empty lines - RFC8493 (section 7.3) doesn't allow truly empty lines, @@ -974,7 +1097,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt int spaceToWrapAt = -1; Matcher matcher = patternToWrapOn.matcher(str.substring(offset, - Math.min((int) Math.min(Integer.MAX_VALUE, offset + currentWrapLength + 1L), inputLineLength))); + Math.min((int) Math.min(Integer.MAX_VALUE, offset + currentWrapLength + 1L), inputLineLength))); if (matcher.find()) { if (matcher.start() == 0) { matcherSize = matcher.end(); @@ -1004,41 +1127,41 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt isFirstLine = false; } else // really long word or URL - if (wrapLongWords) { - if (matcherSize == 0) { - offset--; - } - // wrap really long word one line at a time - wrappedLine.append(str, offset, currentWrapLength + offset); - wrappedLine.append(newLineStr); - offset += currentWrapLength; - matcherSize = -1; - isFirstLine = false; - } else { - // do not wrap really long word, just extend beyond limit - matcher = patternToWrapOn.matcher(str.substring(offset + currentWrapLength)); - if (matcher.find()) { - matcherSize = matcher.end() - matcher.start(); - spaceToWrapAt = matcher.start() + offset + currentWrapLength; - } - - if (spaceToWrapAt >= 0) { - if (matcherSize == 0 && offset != 0) { + if (wrapLongWords) { + if (matcherSize == 0) { offset--; } - wrappedLine.append(str, offset, spaceToWrapAt); + // wrap really long word one line at a time + wrappedLine.append(str, offset, currentWrapLength + offset); wrappedLine.append(newLineStr); - offset = spaceToWrapAt + 1; + offset += currentWrapLength; + matcherSize = -1; isFirstLine = false; } else { - if (matcherSize == 0 && offset != 0) { - offset--; + // do not wrap really long word, just extend beyond limit + matcher = patternToWrapOn.matcher(str.substring(offset + currentWrapLength)); + 
if (matcher.find()) { + matcherSize = matcher.end() - matcher.start(); + spaceToWrapAt = matcher.start() + offset + currentWrapLength; + } + + if (spaceToWrapAt >= 0) { + if (matcherSize == 0 && offset != 0) { + offset--; + } + wrappedLine.append(str, offset, spaceToWrapAt); + wrappedLine.append(newLineStr); + offset = spaceToWrapAt + 1; + isFirstLine = false; + } else { + if (matcherSize == 0 && offset != 0) { + offset--; + } + wrappedLine.append(str, offset, str.length()); + offset = inputLineLength; + matcherSize = -1; } - wrappedLine.append(str, offset, str.length()); - offset = inputLineLength; - matcherSize = -1; } - } } if (matcherSize == 0 && offset < inputLineLength) { @@ -1056,7 +1179,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt * objects containing key/values whereas a single value is sent as one object. * For cases where multiple values are sent, create a concatenated string so * that information is not lost. - * + * * @param jsonElement - the root json object * @param key - the key to find a value(s) for * @return - a single string @@ -1064,9 +1187,10 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt String getSingleValue(JsonElement jsonElement, String key) { String val = ""; if (jsonElement.isJsonObject()) { - JsonObject jsonObject=jsonElement.getAsJsonObject(); + JsonObject jsonObject = jsonElement.getAsJsonObject(); val = jsonObject.get(key).getAsString(); } else if (jsonElement.isJsonArray()) { + Iterator iter = jsonElement.getAsJsonArray().iterator(); ArrayList stringArray = new ArrayList(); while (iter.hasNext()) { @@ -1114,6 +1238,7 @@ private static JsonArray getChildren(JsonObject parent) { // Logic to decide if this is a container - // first check for children, then check for source-specific type indicators + // Dataverse does not currently use containers - this is for other variants/future use private static boolean childIsContainer(JsonObject item) { if 
(getChildren(item).size() != 0) { return true; @@ -1176,17 +1301,16 @@ private HttpGet createNewGetRequest(URI url, String returnType) { * * Caller must close the stream when done. */ - InputStreamSupplier getInputStreamSupplier(final String uriString) { + public InputStreamSupplier getInputStreamSupplier(final String uriString) { return new InputStreamSupplier() { public InputStream get() { try { URI uri = new URI(uriString); - int tries = 0; - while (tries < 5) { + while (tries < MAX_RETRIES) { - logger.fine("Get # " + tries + " for " + uriString); + logger.finest("Get # " + tries + " for " + uriString); HttpGet getFile = createNewGetRequest(uri, null); try { @@ -1220,7 +1344,7 @@ public void close() throws IOException { response.close(); logger.warning("Attempt: " + tries + " - Unexpected Status when retrieving " + uriString - + " : " + statusCode); + + " : " + statusCode); tries++; try { // Calculate exponential backoff: 2^tries * baseWaitTimeMs (1 sec) @@ -1238,29 +1362,29 @@ public void close() throws IOException { } catch (InterruptedException ie) { logger.log(Level.SEVERE, "InterruptedException during retry delay for file: " + uriString, ie); Thread.currentThread().interrupt(); // Restore interrupt status - tries += 5; // Skip remaining attempts + tries += MAX_RETRIES; // Skip remaining attempts } } } catch (ClientProtocolException e) { - tries += 5; + tries += MAX_RETRIES; logger.log(Level.SEVERE, "ClientProtocolException when retrieving file: " + uriString + " (attempt " + tries + ")", e); } catch (SocketTimeoutException e) { // Specific handling for timeout exceptions tries++; - logger.log(Level.SEVERE, "SocketTimeoutException when retrieving file: " + uriString + " (attempt " + tries + " of 5) - Request exceeded timeout", e); - if (tries == 5) { + logger.log(Level.SEVERE, "SocketTimeoutException when retrieving file: " + uriString + " (attempt " + tries + " of " + MAX_RETRIES + ") - Request exceeded timeout", e); + if (tries == MAX_RETRIES) { 
logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries due to timeouts: " + uriString, e); } } catch (InterruptedIOException e) { // Catches interruptions during I/O operations - tries += 5; + tries += MAX_RETRIES; logger.log(Level.SEVERE, "InterruptedIOException when retrieving file: " + uriString + " - Operation was interrupted", e); Thread.currentThread().interrupt(); // Restore interrupt status } catch (IOException e) { // Retry if this is a potentially temporary error such as a timeout tries++; - logger.log(Level.WARNING, "IOException when retrieving file: " + uriString + " (attempt " + tries + " of 5)", e); - if (tries == 5) { + logger.log(Level.WARNING, "IOException when retrieving file: " + uriString + " (attempt " + tries + " of " + MAX_RETRIES+ ")", e); + if (tries == MAX_RETRIES) { logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries: " + uriString, e); } } @@ -1275,6 +1399,12 @@ public void close() throws IOException { }; } + + + public List getOversizedFiles() { + return oversizedFiles; + } + /** * Adapted from org/apache/commons/io/FileUtils.java change to SI - add 2 digits * of precision @@ -1322,7 +1452,54 @@ public void setAuthenticationKey(String tokenString) { public static void setNumConnections(int numConnections) { BagGenerator.numConnections = numConnections; - logger.fine("All BagGenerators will use " + numConnections + " threads"); + logger.fine("All BagGenerators will now use " + numConnections + " threads"); } + // Inner class to hold file information before processing + public static class FileEntry implements Comparable { + final long size; + final JsonObject jsonObject; // Direct reference, not a copy + final String currentPath; // Parent directory path + final int resourceIndex; // Still need this for resourceUsed tracking + + FileEntry(long size, JsonObject jsonObject, String currentPath, int resourceIndex) { + this.size = size; + this.jsonObject = jsonObject; + 
this.currentPath = currentPath; + this.resourceIndex = resourceIndex; + } + + public String getDataUrl() { + return suppressDownloadCounts(jsonObject.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString()); + } + + public String getChildTitle() { + return jsonObject.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + } + + public String getChildPath(String title) { + // Build full path using stored currentPath + String childPath = currentPath + title; + JsonElement directoryLabel = jsonObject.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); + if (directoryLabel != null) { + childPath = currentPath + directoryLabel.getAsString() + "/" + title; + } + return childPath; + } + + private String suppressDownloadCounts(String uriString) { + // Adding gbrecs to suppress counting this access as a download (archiving is + // not a download indicating scientific use) + return uriString + (uriString.contains("?") ? "&" : "?") + "gbrecs=true"; + } + + @Override + public int compareTo(FileEntry other) { + return Long.compare(this.size, other.size); + } + + public long getSize() { + return size; + } + } } \ No newline at end of file diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java index 426d5c9aa5f..0d99a5bddd1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java @@ -506,11 +506,16 @@ private static void addCvocValue(String val, JsonArrayBuilder vals, JsonObject c for (String prefix : context.keySet()) { localContext.putIfAbsent(prefix, context.getString(prefix)); } - JsonObjectBuilder job = Json.createObjectBuilder(datasetFieldService.getExternalVocabularyValue(val)); - job.add("@id", val); - JsonObject extVal = job.build(); - logger.fine("Adding: " + extVal); - vals.add(extVal); + JsonObject cachedValue = datasetFieldService.getExternalVocabularyValue(val); + if (cachedValue != 
null) { + JsonObjectBuilder job = Json.createObjectBuilder(cachedValue); + job.add("@id", val); + JsonObject extVal = job.build(); + logger.fine("Adding: " + extVal); + vals.add(extVal); + } else { + vals.add(val); + } } else { vals.add(val); } diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java index ae1175f0e1d..c2281f32fa4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java @@ -42,6 +42,7 @@ import jakarta.inject.Inject; import jakarta.persistence.EntityManager; import jakarta.persistence.PersistenceContext; +import jakarta.persistence.Query; import jakarta.persistence.TypedQuery; /** @@ -133,7 +134,6 @@ public void start(Workflow wf, WorkflowContext ctxt, boolean findDataset) throws * (e.g. if this method is not asynchronous) * */ - if (!findDataset) { /* * Sleep here briefly to make sure the database update from the callers @@ -150,13 +150,12 @@ public void start(Workflow wf, WorkflowContext ctxt, boolean findDataset) throws logger.warning("Failed to sleep for a second."); } } - //Refresh will only em.find the dataset if findDataset is true. 
(otherwise the dataset is em.merged) + ctxt = refresh(ctxt, retrieveRequestedSettings( wf.getRequiredSettings()), getCurrentApiToken(ctxt.getRequest().getAuthenticatedUser()), findDataset); lockDataset(ctxt, new DatasetLock(DatasetLock.Reason.Workflow, ctxt.getRequest().getAuthenticatedUser())); forward(wf, ctxt); } - private ApiToken getCurrentApiToken(AuthenticatedUser au) { if (au != null) { CommandContext ctxt = engine.getContext(); @@ -180,12 +179,12 @@ private Map retrieveRequestedSettings(Map requir break; } case "boolean": { - retrievedSettings.put(setting, settings.isTrue(settingType, false)); + retrievedSettings.put(setting, settings.isTrue(setting, false)); break; } case "long": { retrievedSettings.put(setting, - settings.getValueForKeyAsLong(SettingsServiceBean.Key.valueOf(setting))); + settings.getValueForKeyAsLong(SettingsServiceBean.Key.parse(setting))); break; } } @@ -211,7 +210,6 @@ public void resume(PendingWorkflowInvocation pending, String body) { } - @Asynchronous private void forward(Workflow wf, WorkflowContext ctxt) { executeSteps(wf, ctxt, 0); } @@ -245,7 +243,6 @@ private void doResume(PendingWorkflowInvocation pending, String body) { } } - @Asynchronous private void rollback(Workflow wf, WorkflowContext ctxt, Failure failure, int lastCompletedStepIdx) { ctxt = refresh(ctxt); final List steps = wf.getSteps(); @@ -290,7 +287,7 @@ private void executeSteps(Workflow wf, WorkflowContext ctxt, int initialStepIdx try { if (res == WorkflowStepResult.OK) { logger.log(Level.INFO, "Workflow {0} step {1}: OK", new Object[]{ctxt.getInvocationId(), stepIdx}); - em.merge(ctxt.getDataset()); + // The dataset is merged in refresh(ctxt) ctxt = refresh(ctxt); } else if (res instanceof Failure) { logger.log(Level.WARNING, "Workflow {0} failed: {1}", new Object[]{ctxt.getInvocationId(), ((Failure) res).getReason()}); @@ -309,7 +306,6 @@ private void executeSteps(Workflow wf, WorkflowContext ctxt, int initialStepIdx return; } } - workflowCompleted(wf, ctxt); } 
@@ -318,22 +314,18 @@ private void executeSteps(Workflow wf, WorkflowContext ctxt, int initialStepIdx // Internal methods to run each step in its own transaction. // - @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) WorkflowStepResult runStep( WorkflowStep step, WorkflowContext ctxt ) { return step.run(ctxt); } - @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) WorkflowStepResult resumeStep( WorkflowStep step, WorkflowContext ctxt, Map localData, String externalData ) { return step.resume(ctxt, localData, externalData); } - @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) void rollbackStep( WorkflowStep step, WorkflowContext ctxt, Failure reason ) { step.rollback(ctxt, reason); } - @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) void lockDataset(WorkflowContext ctxt, DatasetLock datasetLock) throws CommandException { /* * Note that this method directly adds a lock to the database rather than adding @@ -351,7 +343,6 @@ void lockDataset(WorkflowContext ctxt, DatasetLock datasetLock) throws CommandEx ctxt.setLockId(datasetLock.getId()); } - @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) void unlockDataset(WorkflowContext ctxt) throws CommandException { /* * Since the lockDataset command above directly persists a lock to the database, @@ -384,42 +375,48 @@ private void pauseAndAwait(Workflow wf, WorkflowContext ctxt, Pending pendingRes private void workflowCompleted(Workflow wf, WorkflowContext ctxt) { logger.log(Level.INFO, "Workflow {0} completed.", ctxt.getInvocationId()); + + // Read fresh timestamps from DB - parallel index/exports may have occurred while the workflow ran + // (Nominally the workflow lock should have stopped other changes). 
+ Dataset dataset = ctxt.getDataset(); + + datasets.updateIndexingAndExportTimes(dataset); + - try { - if ( ctxt.getType() == TriggerType.PrePublishDataset ) { + try { + if (ctxt.getType() == TriggerType.PrePublishDataset) { ctxt = refresh(ctxt); - //Now lock for FinalizePublication - this block mirrors that in PublishDatasetCommand + dataset = ctxt.getDataset(); + // Now lock for FinalizePublication - this block mirrors that in PublishDatasetCommand AuthenticatedUser user = ctxt.getRequest().getAuthenticatedUser(); DatasetLock lock = new DatasetLock(DatasetLock.Reason.finalizePublication, user); - Dataset dataset = ctxt.getDataset(); lock.setDataset(dataset); - boolean registerGlobalIdsForFiles = - systemConfig.isFilePIDsEnabledForCollection(ctxt.getDataset().getOwner()) && - dvObjects.getEffectivePidGenerator(dataset).canCreatePidsLike(dataset.getGlobalId()); - + boolean registerGlobalIdsForFiles = systemConfig.isFilePIDsEnabledForCollection(ctxt.getDataset().getOwner()) && + dvObjects.getEffectivePidGenerator(dataset).canCreatePidsLike(dataset.getGlobalId()); + boolean validatePhysicalFiles = systemConfig.isDatafileValidationOnPublishEnabled(); - String info = "Publishing the dataset; "; + String info = "Publishing the dataset; "; info += registerGlobalIdsForFiles ? "Registering PIDs for Datafiles; " : ""; info += validatePhysicalFiles ? 
"Validating Datafiles Asynchronously" : ""; lock.setInfo(info); lockDataset(ctxt, lock); ctxt.getDataset().addLock(lock); - + unlockDataset(ctxt); - ctxt.setLockId(null); //the workflow lock - //Refreshing merges the dataset + ctxt.setLockId(null); // the workflow lock + // Refreshing merges the dataset ctxt = refresh(ctxt); - //Then call Finalize + // Then call Finalize engine.submit(new FinalizeDatasetPublicationCommand(ctxt.getDataset(), ctxt.getRequest(), ctxt.getDatasetExternallyReleased())); } else { logger.fine("Removing workflow lock"); unlockDataset(ctxt); } - } catch (CommandException ex) { - logger.log(Level.SEVERE, "Exception finalizing workflow " + ctxt.getInvocationId() +": " + ex.getMessage(), ex); - rollback(wf, ctxt, new Failure("Exception while finalizing the publication: " + ex.getMessage()), wf.steps.size()-1); - } - + } catch (CommandException ex) { + logger.log(Level.SEVERE, "Exception finalizing workflow " + ctxt.getInvocationId() + ": " + ex.getMessage(), ex); + rollback(wf, ctxt, new Failure("Exception while finalizing the publication: " + ex.getMessage()), wf.steps.size() - 1); + } + } public List listWorkflows() { diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java index b0567bff107..ccf6bd12a88 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java @@ -1,9 +1,14 @@ package edu.harvard.iq.dataverse.workflow.internalspi; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetLock.Reason; +import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.impl.AbstractSubmitToArchiveCommand; import 
edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.ArchiverUtil; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import edu.harvard.iq.dataverse.workflow.WorkflowContext; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStep; @@ -14,6 +19,7 @@ import java.util.logging.Level; import java.util.logging.Logger; +import jakarta.json.JsonObject; import jakarta.servlet.http.HttpServletRequest; /** @@ -45,11 +51,53 @@ public WorkflowStepResult run(WorkflowContext context) { } } + Dataset d = context.getDataset(); + if (d.isLockedFor(Reason.FileValidationFailed)) { + logger.severe("Dataset locked for file validation failure - will not archive"); + return new Failure("File Validation Lock", "Dataset has file validation problem - will not archive"); + } DataverseRequest dvr = new DataverseRequest(context.getRequest().getAuthenticatedUser(), (HttpServletRequest) null); String className = requestedSettings.get(SettingsServiceBean.Key.ArchiverClassName.toString()); AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, dvr, context.getDataset().getReleasedVersion()); if (archiveCommand != null) { - return (archiveCommand.performArchiveSubmission(context.getDataset().getReleasedVersion(), context.getApiToken(), requestedSettings)); + // Generate the required components for archiving + DatasetVersion version = context.getDataset().getReleasedVersion(); + if (!archiveCommand.preconditionsMet(version, context.getApiToken(), requestedSettings)) { + return new Failure("Earlier versions must be successfully archived first", + "Archival prerequisites not met"); + } + + // Generate DataCite XML + String dataCiteXml = archiveCommand.getDataCiteXml(version); + + // Generate OREMap + OREMap oreMap = new OREMap(version, false); + JsonObject ore = oreMap.getOREMap(); + + // Get JSON-LD 
terms + Map terms = AbstractSubmitToArchiveCommand.getJsonLDTerms(oreMap); + + // Call the updated method with all required parameters + /* + * Note: because this must complete before the workflow can complete and update the version status + * in the db a long-running archive submission via workflow could hit a transaction timeout and fail. + * The commands themselves have been updated to run archive submission outside of any transaction + * and update the status in a separate transaction, so archiving a given version that way could + * succeed where this workflow failed. + * + * Another difference when running in a workflow - this step has no way to set the archiving status to + * pending as is done when running archiving from the UI/API. Instead, there is a generic workflow + * lock on the dataset. + */ + + return archiveCommand.performArchiveSubmission( + version, + dataCiteXml, + ore, + terms, + context.getApiToken(), + requestedSettings + ); } else { logger.severe("No Archiver instance could be created for name: " + className); return new Failure("No Archiver", "Could not create instance of class: " + className); diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index f801c762752..4c1661a4428 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -1643,7 +1643,7 @@ dataset.share.datasetShare=Share Dataset dataset.share.datasetShare.tip=Share this dataset on your favorite social media networks. dataset.share.datasetShare.shareText=View this dataset. dataset.locked.message=Dataset Locked -dataset.locked.message.details=This dataset is locked until publication. +dataset.locked.message.details=This dataset is temporarily locked while background processing related to publication completes. 
dataset.locked.inReview.message=Submitted for Review dataset.locked.ingest.message=The tabular data files uploaded are being processed and converted into the archival format dataset.unlocked.ingest.message=The tabular files have been ingested. @@ -1674,7 +1674,6 @@ dataset.compute.computeBatchListHeader=Compute Batch dataset.compute.computeBatchRestricted=This dataset contains restricted files you may not compute on because you have not been granted access. dataset.delete.error=Could not deaccession the dataset because the {0} update failed. dataset.publish.workflow.message=Publish in Progress -dataset.publish.workflow.inprogress=This dataset is locked until publication. dataset.pidRegister.workflow.inprogress=The dataset is locked while the persistent identifiers are being registered or updated, and/or the physical files are being validated. dataset.versionUI.draft=Draft dataset.versionUI.inReview=In Review @@ -2135,6 +2134,7 @@ file.dataFilesTab.versions.headers.contributors.withheld=Contributor name(s) wit file.dataFilesTab.versions.headers.published=Published on file.dataFilesTab.versions.headers.archived=Archival Status file.dataFilesTab.versions.headers.archived.success=Archived +file.dataFilesTab.versions.headers.archived.obsolete=Original Version Archived file.dataFilesTab.versions.headers.archived.pending=Pending file.dataFilesTab.versions.headers.archived.failure=Failed file.dataFilesTab.versions.headers.archived.notarchived=Not Archived @@ -2693,6 +2693,7 @@ dataset.notlinked.msg=There was a problem linking this dataset to yours: dataset.linking.popop.already.linked.note=Note: This dataset is already linked to the following dataverse(s): dataset.linking.popup.not.linked.note=Note: This dataset is not linked to any of your accessible dataverses datasetversion.archive.success=Archival copy of Version successfully submitted +datasetversion.archive.inprogress=Dataset archiving has been started datasetversion.archive.failure=Error in submitting an
archival copy datasetversion.update.failure=Dataset Version Update failed. Changes are still in the DRAFT version. datasetversion.update.archive.failure=Dataset Version Update succeeded, but the attempt to update the archival copy failed. diff --git a/src/main/webapp/dataset-versions.xhtml b/src/main/webapp/dataset-versions.xhtml index 9e5f0a9b24d..5211ae234e1 100644 --- a/src/main/webapp/dataset-versions.xhtml +++ b/src/main/webapp/dataset-versions.xhtml @@ -169,13 +169,20 @@ + + + + + + - + + - + diff --git a/src/main/webapp/resources/css/structure.css b/src/main/webapp/resources/css/structure.css index cd2e7d33d10..27cb0d7e8bf 100644 --- a/src/main/webapp/resources/css/structure.css +++ b/src/main/webapp/resources/css/structure.css @@ -936,6 +936,9 @@ div.dvnDifferanceTable .versionValue { } div[id$="versionsTable"] tbody {word-break:break-word;} +.archive-submit-link { + display: block; +} /* DATATABLE + DROPDOWN BUTTON + OVERFLOW VISIBLE */ thead.ui-datatable-scrollable-theadclone {display:none} diff --git a/src/test/java/edu/harvard/iq/dataverse/api/BagIT.java b/src/test/java/edu/harvard/iq/dataverse/api/BagIT.java index 16c44003f35..b649ad6bb95 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/BagIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/BagIT.java @@ -87,7 +87,7 @@ public void testBagItExport() throws IOException { .replace('.', '-').toLowerCase(); // spacename: doi-10-5072-fk2-fosg5q - String pathToZip = bagitExportDir + "/" + spaceName + "v1.0" + ".zip"; + String pathToZip = bagitExportDir + "/" + spaceName + ".v1.0" + ".zip"; try { // give the bag time to generate diff --git a/src/test/java/edu/harvard/iq/dataverse/api/SwordIT.java b/src/test/java/edu/harvard/iq/dataverse/api/SwordIT.java index 709908ac6eb..22dfe61da07 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/SwordIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/SwordIT.java @@ -954,7 +954,8 @@ public void testDeleteFiles() { 
reindexDataset4ToFindDatabaseId.then().assertThat() .statusCode(OK.getStatusCode()); Integer datasetId4 = JsonPath.from(reindexDataset4ToFindDatabaseId.asString()).getInt("data.id"); - + UtilIT.sleepForReindex(datasetPersistentId4, apiToken, 5); + Response destroyDataset4 = UtilIT.destroyDataset(datasetId4, apiToken); destroyDataset4.prettyPrint(); destroyDataset4.then().assertThat() diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java index dbbf3241318..05e83b8540d 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse.util.bagit; +import edu.harvard.iq.dataverse.engine.command.impl.AbstractSubmitToArchiveCommand; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -52,7 +53,7 @@ private void initializeBagGenerator() throws Exception { when(mockOreMap.getOREMap()).thenReturn(oremapObject); // Initialize BagGenerator with test data - bagGenerator = new BagGenerator(mockOreMap, ""); + bagGenerator = new BagGenerator(oremapObject, "", AbstractSubmitToArchiveCommand.getJsonLDTerms(mockOreMap)); setPrivateField(bagGenerator, "aggregation", (com.google.gson.JsonObject) JsonParser .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString())); setPrivateField(bagGenerator, "totalDataSize", 1024000L); diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java index 19d478f4b0d..6595404b755 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java +++ 
b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -157,4 +157,101 @@ void wrapsAtWordBoundary_withLabelLength() { String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } -} \ No newline at end of file + + // Tests for additional line separator characters + + @Test + void multiline_withCR_normalizedAndIndented() { + String input = "Line1\rLine2\rLine3"; + String expected = "Line1\r\n Line2\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_withCRLF_normalizedAndIndented() { + String input = "Line1\r\nLine2\r\nLine3"; + String expected = "Line1\r\n Line2\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_withVT_normalizedAndIndented() { + // VT (U+000B) - Vertical Tab + String input = "Line1\u000BLine2\u000BLine3"; + String expected = "Line1\r\n Line2\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_withFF_normalizedAndIndented() { + // FF (U+000C) - Form Feed + String input = "Line1\u000CLine2\u000CLine3"; + String expected = "Line1\r\n Line2\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_withNEL_normalizedAndIndented() { + // NEL (U+0085) - Next Line + String input = "Line1\u0085Line2\u0085Line3"; + String expected = "Line1\r\n Line2\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_withLS_normalizedAndIndented() { + // LS (U+2028) - Line Separator + String input = "Line1\u2028Line2\u2028Line3"; + String expected = "Line1\r\n Line2\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_withPS_normalizedAndIndented() { + // PS (U+2029) - Paragraph Separator + String input = 
"Line1\u2029Line2\u2029Line3"; + String expected = "Line1\r\n Line2\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_mixedSeparators_normalizedAndIndented() { + // Test with a mix of different line separators + String input = "Line1\nLine2\rLine3\r\nLine4\u000BLine5\u000CLine6\u0085Line7\u2028Line8\u2029Line9"; + String expected = "Line1\r\n Line2\r\n Line3\r\n Line4\r\n Line5\r\n Line6\r\n Line7\r\n Line8\r\n Line9"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void emptyLines_withVariousSeparators_trimmedAndSkipped() { + // Test empty lines with different separators + String input = "Line1\n\nLine3\r\rLine5\u000B\u000BLine7"; + String expected = "Line1\r\n Line3\r\n Line5\r\n Line7"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void longLine_withCRLF_wrapsAndIndents() { + String input = "a".repeat(100) + "\r\n" + "b".repeat(100); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(21) + "\r\n " + "b".repeat(79) + "\r\n " + "b".repeat(21); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void longLine_withMixedSeparators_wrapsAndIndents() { + String input = "a".repeat(100) + "\n" + "b".repeat(100) + "\r" + "c".repeat(100); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(21) + "\r\n " + "b".repeat(79) + "\r\n " + "b".repeat(21) + "\r\n " + "c".repeat(79) + "\r\n " + "c".repeat(21); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } +}