diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index d2dd034d1..f27608ffc 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,112 +1,107 @@ -# Architecture of SHARE/Trove -> NOTE: this document requires update (big ol' TODO) - +# Architecture of SHARE/trove This document is a starting point and reference to familiarize yourself with this codebase. ## Bird's eye view -In short, SHARE/Trove takes metadata records (in any supported input format), -ingests them, and makes them available in any supported output format. -``` - ┌───────────────────────────────────────────┐ - │ Ingest │ - │ ┌──────┐ │ - │ ┌─────────────────────────┐ ┌──►Format├─┼────┐ - │ │ Normalize │ │ └──────┘ │ │ - │ │ │ │ │ ▼ -┌───────┐ │ │ ┌─────────┐ ┌────────┐ │ │ ┌──────┐ │ save as -│Harvest├─┬─┼─┼─►Transform├──►Regulate├─┼─┬─┼──►Format├─┼─┬─►FormattedMetadataRecord -└───────┘ │ │ │ └─────────┘ └────────┘ │ │ │ └──────┘ │ │ - │ │ │ │ │ . │ │ ┌───────┐ - │ │ └─────────────────────────┘ │ . │ └──►Indexer│ - │ │ │ . │ └───────┘ - │ └─────────────────────────────┼─────────────┘ some formats also - │ │ indexed separately - ▼ ▼ - save as save as - RawDatum NormalizedData +In short, SHARE/trove holds metadata records that describe things and makes those records available for searching, browsing, and subscribing. + +![overview of shtrove: metadata records in, search/browse/subscribe out](./project/static/img/shtroverview.png) + + +## Parts +a look at the tangles of communication between different parts of the system: + +```mermaid +graph LR; + subgraph shtrove; + subgraph web[api/web server]; + ingest; + search; + browse; + rss; + atom; + oaipmh; + end; + worker["background worker (celery)"]; + indexer["indexer daemon"]; + rabbitmq["task queue (rabbitmq)"]; + postgres["database (postgres)"]; + elasticsearch; + web---rabbitmq; + web---postgres; + web---elasticsearch; + worker---rabbitmq; + worker---postgres; + worker---elasticsearch; + indexer---rabbitmq; + indexer---postgres; + indexer---elasticsearch; + end; + source["metadata source (e.g. osf.io backend)"]; + user["web user, either by browsing directly or via web app (like osf.io)"]; + subscribers["feed subscription tools"]; + source-->ingest; + user-->search; + user-->browse; + subscribers-->rss; + subscribers-->atom; + subscribers-->oaipmh; ``` ## Code map A brief look at important areas of code as they happen to exist now. -### Static configuration - -`share/schema/` describes the "normalized" metadata schema/format that all -metadata records are converted into when ingested. - -`share/sources/` describes a starting set of metadata sources that the system -could harvest metadata from -- these will be put in the database and can be -updated or added to over time. - -`project/settings.py` describes system-level settings which can be set by -environment variables (and their default values), as well as settings -which cannot. - -`share/models/` describes the data layer using the [Django](https://www.djangoproject.com/) ORM. - -`share/subjects.yaml` describes the "central taxonomy" of subjects allowed -in `Subject.name` fields of `NormalizedData`. - -### Harvest and ingest - -`share/harvest/` and `share/harvesters/` describe how metadata records -are pulled from other metadata repositories. - -`share/transform/` and `share/transformers/` describe how raw data (possibly -in any format) are transformed to the "normalized" schema. 
+- `trove`: django app for rdf-based apis + - `trove.digestive_tract`: most of what happens after ingestion + - stores records and identifiers in the database + - initiates indexing + - `trove.extract`: parsing ingested metadata records into resource descriptions + - `trove.derive`: from a given resource description, create special non-rdf serializations + - `trove.render`: from an api response modeled as rdf graph, render the requested mediatype + - `trove.models`: database models for identifiers and resource descriptions + - `trove.trovesearch`: builds rdf-graph responses for trove search apis (using `IndexStrategy` implementations from `share.search`) + - `trove.vocab`: identifies and describes concepts used elsewhere + - `trove.vocab.trove`: describes types, properties, and api paths in the trove api + - `trove.vocab.osfmap`: describes metadata from osf.io (currently the only metadata ingested) + - `trove.openapi`: generate openapi json for the trove api from thesaurus in `trove.vocab.trove` +- `share`: django app with search indexes and remnants of sharev2 + - `share.models`: database models for external sources, users, and other system book-keeping + - `share.oaipmh`: provide data via [OAI-PMH](https://www.openarchives.org/OAI/openarchivesprotocol.html) + - `share.search`: all interaction with elasticsearch + - `share.search.index_strategy`: abstract base class `IndexStrategy` with multiple implementations, for different approaches to indexing the same data + - `share.search.daemon`: the "indexer daemon", an optimized background worker for batch-processing updates and sending to all active index strategies + - `share.search.index_messenger`: for sending messages to the indexer daemon +- `api`: django app with remnants of the legacy sharev2 api + - `api.views.feeds`: allows custom RSS and Atom feeds + - otherwise, subject to possible deprecation +- `osf_oauth2_adapter`: django app for login via osf.io +- `project`: the actual django project + - default settings at `project.settings` + - pulls together code from other directories implemented as django apps (`share`, `trove`, `api`, and `osf_oauth2_adapter`) -`share/regulate/` describes rules which are applied to every normalized datum, -regardless where or what format it originally come from. -`share/metadata_formats/` describes how a normalized datum can be formatted -into any supported output format. - -`share/tasks/` runs the harvest/ingest pipeline and stores each task's status -(including debugging info, if errored) as a `HarvestJob` or `IngestJob`. - -### Outward-facing views - -`share/search/` describes how the search indexes are structured, managed, and -updated when new metadata records are introduced -- this provides a view for -discovering items based on whatever search criteria. - -`share/oaipmh/` describes the [OAI-PMH](https://www.openarchives.org/OAI/openarchivesprotocol.html) -view for harvesting metadata from SHARE/Trove in bulk. - -`api/` describes a mostly REST-ful API that's useful for inspecting records for -a specific item of interest. - -### Internals - -`share/admin/` is a Django-app for administrative access to the SHARE database -and pipeline logs - -`osf_oauth2_adapter/` is a Django app to support logging in to SHARE via OSF +## Cross-cutting concerns -### Testing +### Resource descriptions -`tests/` are tests. 
+Uses the [resource description framework](https://www.w3.org/TR/rdf11-primer/#section-Introduction): +- the content of each ingested metadata record is an rdf graph focused on a specific resource +- all api responses from `trove` views are (experimentally) modeled as rdf graphs, which may be rendered a variety of ways -## Cross-cutting concerns +### Identifiers -### Immutable metadata +Whenever feasible, use full URI strings to identify resources, concepts, types, and properties that may be exposed outwardly. -Metadata records at all stages of the pipeline (`RawDatum`, `NormalizedData`, -`FormattedMetadataRecord`) should be considered immutable -- any updates -result in a new record being created, not an old record being altered. +Prefer using open, standard, well-defined namespaces wherever possible ([DCAT](https://www.w3.org/TR/vocab-dcat-3/) is a good place to start; see `trove.vocab.namespaces` for others already in use). When app-specific concepts must be defined, use the `TROVE` namespace (`https://share.osf.io/vocab/2023/trove/`). -Multiple records which describe the same item/object are grouped by a -"source-unique identifier" or "suid" -- essentially a two-tuple -`(source, identifier)` that uniquely and persistently identifies an item in -the source repository. In most outward-facing views, default to showing only -the most recent record for each suid. +A notable exception (non-URI identifier) is the "source-unique identifier" or "suid" -- essentially a two-tuple `(source, identifier)` that uniquely and persistently identifies a metadata record in a source repository. This `identifier` may be any string value, provided by the external source. ### Conventions (an incomplete list) -- functions prefixed `pls_` ("please") are a request for something to happen +- local variables prefixed with underscore (to consistently distinguish between internal-only names and those imported/built-in) +- prefer full type annotations in python code, wherever reasonably feasible ## Why this? inspired by [this writeup](https://matklad.github.io/2021/02/06/ARCHITECTURE.md.html) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8af0c86a..c92dbfcf6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Change Log +# [25.5.1] - 2025-08-21 +- improve error handling in celery task-result backend +- use logging config in celery worker +- improve code docs (README.md et al.) + # [25.5.0] - 2025-07-15 - use python 3.13 - use `poetry` to manage dependencies diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d14287ddb..ca8dcf691 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,7 +1,18 @@ # CONTRIBUTING -TODO: how do we want to guide community contributors? +> note: this codebase is currently (and historically) rather entangled with [osf.io](https://osf.io), which has its shtrove at https://share.osf.io -- stay tuned for more-reusable open-source libraries and tools that should be more accessible to community contribution -For now, if you're interested in contributing to SHARE/Trove, feel free to +For now, if you're interested in contributing to SHARE/trove, feel free to [open an issue on github](https://github.com/CenterForOpenScience/SHARE/issues) and start a conversation. 
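As a concrete companion to the identifier guidance and conventions in the ARCHITECTURE.md changes above (full URI strings, standard namespaces where possible, underscore-prefixed locals, type annotations), here is a hedged sketch of a resource description as an rdf graph. The focus iri and literal values are made-up examples, and `rdflib` is used only for illustration -- the codebase has its own rdf utilities:

```python
# minimal sketch of a resource description focused on one resource --
# the iri and values below are made-up examples, not real records
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import DCTERMS

# prefer full URIs for anything exposed outwardly
_focus_iri: URIRef = URIRef('https://osf.example/abcde')  # hypothetical focus resource

_description: Graph = Graph()
_description.add((_focus_iri, DCTERMS.title, Literal('an example thing')))
_description.add((_focus_iri, DCTERMS.created, Literal('2025-08-21')))

# turtle is the mediatype currently accepted at /trove/ingest
print(_description.serialize(format='turtle'))
```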
+ +## Required checks + +All changes must pass the following checks with no errors: +- linting: `python -m flake8` +- static type-checking (on `trove/` code only, for now): `python -m mypy trove` +- tests: `python -m pytest -x tests/` +  - note: some tests require other services running -- if [using the provided docker-compose.yml](./how-to/run-locally.md), recommend running in the background (upping worker ups all: `docker compose up -d worker`) and executing tests from within one of the python containers (`indexer`, `worker`, or `web`): +    `docker compose exec indexer python -m pytest -x tests/` + +All new changes should also avoid decreasing test coverage, when reasonably possible (currently checked on github pull requests). diff --git a/README.md b/README.md index 27a21f903..201adfc2b 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,17 @@ -# SHARE/Trove +# SHARE/trove (aka SHARtrove, shtrove) -SHARE is creating a free, open dataset of research (meta)data. +> share (verb): to have or use in common. -> **Note**: SHARE’s open API tools and services help bring together scholarship distributed across research ecosystems for the purpose of greater discoverability. However, SHARE does not guarantee a complete aggregation of searched outputs. For this reason, SHARE results should not be used for methodological analyses, such as systematic reviews. +> trove (noun): a store of valuable or delightful things. -[![Coverage Status](https://coveralls.io/repos/github/CenterForOpenScience/SHARE/badge.svg?branch=develop)](https://coveralls.io/github/CenterForOpenScience/SHARE?branch=develop) +SHARE/trove (aka SHARtrove, shtrove) is a service meant to store (meta)data you wish to keep and offer openly. -## Documentation +note: this codebase is currently (and historically) rather entangled with [osf.io](https://osf.io), which has its shtrove at https://share.osf.io -- stay tuned for more-reusable open-source libraries and tools for working with (meta)data -### What is this? see [WHAT-IS-THIS-EVEN.md](./WHAT-IS-THIS-EVEN.md) +see [ARCHITECTURE.md](./ARCHITECTURE.md) for help navigating this codebase -### How can I use it? see [how-to/use-the-api.md](./how-to/use-the-api.md) +see [CONTRIBUTING.md](./CONTRIBUTING.md) for info about contributing changes -### How do I navigate this codebase? see [ARCHITECTURE.md](./ARCHITECTURE.md) - -### How do I run a copy locally? see [how-to/run-locally.md](./how-to/run-locally.md) - - -## Running Tests - -### Unit test suite - - py.test - -### BDD Suite - - behave +see [how-to/use-the-api.md](./how-to/use-the-api.md) for help using the api to add and access (meta)data +see [how-to/run-locally.md](./how-to/run-locally.md) for help running a shtrove instance for local development diff --git a/TODO.md b/TODO.md new file mode 100644 index 000000000..4b9d41b16 --- /dev/null +++ b/TODO.md @@ -0,0 +1,86 @@ +# TODO: +ways to better this mess + +## better shtrove api experience + +- better web-browsing experience +  - when `Accept` header accepts html, use html regardless of query-params +  - when query param `acceptMediatype` requests another mediatype, display on page in copy/pastable way +    - exception: when given `withFileName`, download without html wrapping +    - exception: `/trove/browse` should still give hypertext with clickable links +  - include more explanatory docs (and better fill out those explanations) +  - more helpful (less erratic) visual design +  - in each html rendering of an api response, include a `
` for adding/editing/viewing query params +- better tsv/csv experience + - set default columns for `index-value-search` (and/or broadly improve `fields` handling) +- better turtle experience + - quoted literal graphs also turtle + - omit unnecessary `^^rdf:string` +- better jsonld experience + - provide `@context` (via header, at least) + - accept jsonld at `/trove/ingest` (or at each `ldp:inbox`...) + + +## modular packaging +move actually-helpful logic into separate packages that can be used and maintained independently of +any particular web app/api/framework (and then use those packages in shtrove and osf) + +- `osfmap`: standalone OSFMAP definition + - define osfmap properties and shapes (following DCTAP) in static tsv files + - use `tapshoes` (below) to generate docs and helpful utility functions + - may replace/simplify: + - `osf.metadata.osf_gathering.OSFMAP` (and related constants) + - `trove.vocab.osfmap` + - `trove.derive.osfmap_json` +- `tapshoes`: for using and packaging [tabular application profiles](https://dcmi.github.io/dctap/) in python + - take a set of tsv/csv files as input + - should support any valid DCTAP (aim to be worth community interest) + - initial/immediate use case `osfmap` + - generate more human-readable docs of properties and shapes/types + - validate a given record (rdf graph) against a profile + - serialize a valid record in a consistent/stable way (according to the profile) + - enable publishing "official" application profiles as installable python packages + - learn from and consider using prior dctap work: + - dctap-python: https://pypi.org/project/dctap/ + - loads tabular files into more immediately usable form + - tap2shacl: https://pypi.org/project/tap2shacl/ + - builds shacl constraints from application profile + - could then validate a given graph with pyshacl: https://pypi.org/project/pyshacl/ +- metadata record crosswalk/serialization + - given a record (as rdf graph) and application profile to which it conforms (like OSFMAP), offer: + - crosswalking to a standard vocab (DCAT, schema.org, ...) + - stable rdf serialization (json-ld, turtle, xml, ...) + - special bespoke serialization (datacite xml/json, oai_dc, ...) 
+ - may replace/simplify: + - `osf.metadata.serializers` + - `trove.derive` +- `shtrove`: reusable package with the good parts of share/trove + - python api and command-line tools + - given application profile + - digestive tract with pluggable storage/indexing interfaces + - methods for ingest, search, browse, subscribe +- `django-shtrove`: django wrapper for `shtrove` functionality + - set application profile via django setting + - django models for storage, elasticsearch for indexing + - django views for ingest, search, browse, subscribe + + +## open web standards +- data catalog vocabulary (DCAT) https://www.w3.org/TR/vocab-dcat-3/ + - an appropriate (and better thought-thru) vocab for a lot of what shtrove does + - already used in some ways, but would benefit from adopting more thoroughly + - replace bespoke types (like `trove:Indexcard`) with better-defined dcat equivalents (like `dcat:CatalogRecord`) + - rename various properties/types/variables similarly + - "catalog" vs "index" + - "record" vs "card" + - replace checksum-iris with `spdx:checksum` (added in dcat 3) +- linked data notifications (LDN) https://www.w3.org/TR/ldn/ + - shtrove incidentally (partially) aligns with linked-data principles -- could lean into that + - replace `/trove/ingest` with one or more `ldp:inbox` urls + - trove index-card like an inbox containing current/past resource descriptions + ``` + <://osf.example/blarg> ldp:inbox <://shtrove.example/index-card/0000-00...> . + <://shtrove.example/index-card/0000-00...> ldp:contains <://shtrove.example/description/0000-00...> . + <://shtrove.example/description/0000-00...> foaf:primaryTopic <://osf.example/blarg> + ``` + (might consider renaming "index-card" for consistency/clarity) diff --git a/WHAT-IS-THIS-EVEN.md b/WHAT-IS-THIS-EVEN.md deleted file mode 100644 index 8dd64d7e1..000000000 --- a/WHAT-IS-THIS-EVEN.md +++ /dev/null @@ -1,42 +0,0 @@ -# "What is this, even?" - -Imagine a vast, public library full of the outputs and results of some scientific -research -- shelves full of articles, preprints, datasets, data analysis plans, -and so on. - -You can think of SHARE/Trove as that library's card catalog. - -## "...What is a card catalog?" - -A [card catalog](https://en.wikipedia.org/wiki/Card_catalog) is that weird, cool cabinet you might see at the front of a -library with a bunch of tiny drawers full of index cards -- each index card -contains information about some item on the library shelves. - -The card catalog is where you go when you want to: -- locate a specific item in the library -- discover items related to a specific topic, author, or other keywords -- make a new item easily discoverable by others - -## "OK but what 'library' is this?" -As of July 2021, SHARE/Trove contains metadata on over 4.5 million items originating from: -- [OSF](https://osf.io) (including OSF-hosted Registries and Preprint Providers) -- [REPEC](http://repec.org) -- [arXiv](https://arxiv.org) -- [ClinicalTrials.gov](https://clinicaltrials.gov) -- ...and more! - -Updates from OSF are reflected within seconds, while updates from third-party sources are -harvested once daily. - -## "How can I use it?" - -You can search the full SHARE/Trove catalog at -[share.osf.io/discover](https://share.osf.io/discover). - -Other search pages can also be built on SHARE/Trove, showing only a specific -collection of items. 
For example, [OSF Preprints](https://osf.io/preprints/discover) -and [OSF Registries](https://osf.io/registries/discover) show only registrations -and preprints, respectively, which are hosted on OSF infrastructure. - -To learn about using the API (instead of a user interface), see -[how-to/use-the-api.md](./how-to/use-the-api.md) diff --git a/how-to/add-a-source.rst b/how-to/add-a-source.rst deleted file mode 100644 index 8e31ea6ac..000000000 --- a/how-to/add-a-source.rst +++ /dev/null @@ -1,251 +0,0 @@ -.. _harvesters-and-transformers: - -Harvesters and Transformers -=========================== - -A `harvester` gathers raw data from a source using their API. - -A `transformer` takes the raw data gathered by a harvester and maps the fields to the defined :ref:`SHARE models `. - -Writing a Harvester and Transformer ------------------------------------ - -See the transformers and harvesters located in the ``share/transformers/`` and ``share/harvesters/`` directories for more examples of syntax and best practices. - -Adding a new source -""""""""""""""""""""" - -- Determine whether the source has an API to access their metadata -- Create a source folder at ``share/sources/{source name}`` - - Source names are typically the reversed domain name of the source, e.g. a source at ``http://example.com`` would have the name ``com.example`` -- Create a file named ``source.yaml`` in the source folder - - See :ref:`Writing a source.yaml file ` -- Determine whether the source makes their data available using the `OAI-PMH`_ protocol - - If the source is OAI see :ref:`Best practices for OAI sources ` -- Writing the harvester - - See :ref:`Best practices for writing a Harvester ` -- Writing the transformer - - See :ref:`Best practices for writing a Transformer ` -- Adding a sources's icon - - visit ``www.domain.com/favicon.ico`` and download the ``favicon.ico`` file - - place the favicon as ``icon.ico`` in the source folder -- Load the source - - To make the source available in your local SHARE, run ``./manage.py loadsources`` in the terminal - -.. _OAI-PMH: http://www.openarchives.org/OAI/openarchivesprotocol.html - - -.. _writing-yaml: - -Writing a source.yaml file -"""""""""""""""""""""""""" - -The ``source.yaml`` file contains information about the source itself, and one or more configs that describe how to harvest and transform data from that source. - -.. code-block:: yaml - - name: com.example - long_title: Example SHARE Source for Examples - home_page: http://example.com/ - user: sources.com.example - configs: - - label: com.example.oai - base_url: http://example.com/oai/ - harvester: oai - harvester_kwargs: - metadata_prefix: oai_datacite - rate_limit_allowance: 5 - rate_limit_period: 1 - transformer: org.datacite - transformer_kwargs: {} - -See the whitepaper_ for Source and SourceConfig tables for the available fields. - -.. _whitepaper: https://github.com/CenterForOpenScience/SHARE/blob/develop/whitepapers/Tables.md - -.. _oai-sources: - -Best practices for OAI sources -"""""""""""""""""""""""""""""" - -Sources that use OAI-PMH_ make it easy to harvest their metadata. - -- Set ``harvester: oai`` in the source config. -- Choose a metadata format to harvest. - - Use the ``ListMetadataFormats`` OAI verb to see what formats the source supports. - - Every OAI source supports ``oai_dc``, but they usually also support at least one other format that has richer, more structured data, like ``oai_datacite`` or ``mods``. 
- - Choose the format that seems to have the most useful data for SHARE, especially if a transformer for that format already exists. - - Choose ``oai_dc`` only as a last resort. -- Add ``metadata_prefix: {prefix}`` to the ``harvester_kwargs`` in the source config. -- If necessary, write a transformer for the chosen format. - - See :ref:`Best practices for writing a Transformer ` - - -.. _.gitignore: https://github.com/CenterForOpenScience/SHARE/blob/develop/.gitignore - - -.. _writing-harvesters: - -Best practices for writing a non-OAI Harvester -"""""""""""""""""""""""""""""""""""""""""""""" - -- The harvester should be defined in ``share/harvesters/{harvester name}.py``. -- When writing the harvester: - - Inherit from ``share.harvest.BaseHarvester`` - - Add the version of the harvester ``VERSION = 1`` - - Implement ``do_harvest(...)`` (and possibly additional helper functions) to make requests to the source and to yield the harvested records. - - Check to see if the data returned by the source is paginated. - - There will often be a resumption token to get the next page of results. - - Check to see if the source's API accepts a date range - - If the API does not then, if possible, check the date on each record returned and stop harvesting if the date on the record is older than the specified start date. -- Add the harvester to ``entry_points`` in ``setup.py`` - - e.g. ``'com.example = share.harvesters.com_example:ExampleHarvester',`` - - run ``python setup.py develop`` to make the harvester available in your local SHARE -- Test by :ref:`running the harvester ` - -.. _writing-transformers: - -Best practices for writing a non-OAI Transformer -"""""""""""""""""""""""""""""""""""""""""""""""" - -- The transformer should be defined in ``share/transformers/{transformer name}.py``. -- When writing the transformer: - - Determine what information from the source record should be stored as part of the ``CreativeWork`` :ref:`model ` (i.e. if the record clearly defines a title, description, contributors, etc.). - - Use the :ref:`chain transformer tools ` as necessary to correctly parse the raw data. - - Alternatively, implement ``share.transform.BaseTransformer`` to create a transformer from scratch. - - Utilize the ``Extra`` class - - Raw data that does not fit into a defined :ref:`share model ` should be stored here. - - Raw data that is otherwise altered in the transformer should also be stored here to ensure data integrity. -- Add the transformer to ``entry_points`` in ``setup.py`` - - e.g. ``'com.example = share.transformer.com_example:ExampleTransformer',`` - - run ``python setup.py develop`` to make the transformer available in your local SHARE -- Test by :ref:`running the transformer ` against raw data you have harvested. - -.. _chain-transformer: - -SHARE Chain Transformer -""""""""""""""""""""""" - -SHARE provides a set of tools for writing transformers, based on the idea of constructing chains for each field that lead from the root of the raw document to the data for that field. To write a chain transformer, add ``from share.transform.chain import links`` at the top of the file and make the transformer inherit ``share.transform.chain.ChainTransformer``. - - -.. 
code-block:: python - - from share.transform.chain import ctx, links, ChainTransformer, Parser - - - class CreativeWork(Parser): - title = ctx.title - - - class ExampleTransformer(ChainTransformer): - VERSION = 1 - root_parser = CreativeWork - - -- Concat - To combine list or singular elements into a flat list:: - - links.Concat(, ) - -.. _delegate-reference: - -- Delegate - To specify which class to use:: - - links.Delegate() - -- Join - To combine list elements into a single string:: - - links.Join(, joiner=' ') - - Elements are separated with the ``joiner``. - By default ``joiner`` is a newline. - -- Map - To designate the class used for each instance of a value found:: - - links.Map(links.Delegate(), ) - - See the :ref:`share models ` for what uses a through table (anything that sets ``through=``). - Uses the :ref:`Delegate ` tool. - -- Maybe - To transform data that is not consistently available:: - - links.Maybe(, '') - - Indexing further if the path exists:: - - links.Maybe(, '')[''] - - Nesting Maybe:: - - links.Maybe(links.Maybe(, '')[''], '') - - To avoid excessive nesting use the :ref:`Try link ` - -- OneOf - To specify two possible paths for a single value:: - - links.OneOf(, ) - -- ParseDate - To determine a date from a string:: - - links.ParseDate() - -- ParseLanguage - To determine the ISO language code (i.e. 'ENG') from a string (i.e. 'English'):: - - links.ParseLanguage() - - Uses pycountry_ package. - - .. _pycountry: https://pypi.python.org/pypi/pycountry - -- ParseName - To determine the parts of a name (i.e. first name) out of a string:: - - links.ParseName().first - - options:: - - first - last - middle - suffix - title - nickname - - Uses nameparser_ package. - - .. _nameparser: https://pypi.python.org/pypi/nameparser - -- RunPython - To run a defined python function:: - - links.RunPython('', , *args, **kwargs) - -- Static - To define a static field:: - - links.Static() - -- Subjects - To map a subject to the PLOS taxonomy based on defined mappings:: - - links.Subjects() - -.. _try-reference: - -- Try - To transform data that is not consistently available and may throw an exception:: - - links.Try() - -- XPath - To access data using xpath:: - - links.XPath(, "") diff --git a/how-to/run-locally.md b/how-to/run-locally.md index 99e4a523d..7d0e6eb05 100644 --- a/how-to/run-locally.md +++ b/how-to/run-locally.md @@ -1,14 +1,14 @@ # SHARE Quickstart or: How I Learned to Stop Worrying and Love the Dock -this guide guides you through setting up SHARE locally using Docker -for development and manual testing. +this guide guides you through setting up SHARE locally for development and manual testing +using the `docker-compose.yml` file included in this repository. this guide does NOT guide you to anything appropriate for the open Internet. 
## pre-requisites -- [git](https://git-scm.com/) -- [docker](https://www.docker.com/) (including `docker-compose`) +- [git](https://git-scm.com/) or equivalent +- [docker](https://www.docker.com/) (including `docker-compose`) or equivalent ## getting a local SHARE running @@ -48,11 +48,11 @@ docker-compose run --rm --no-deps worker bash this will open a bash prompt within a temporary `worker` container -- from here we can run commands within SHARE's environment, including django's `manage.py` -from within that worker shell, use django's `migrate` command to set up tables in postgres: +from within that worker shell, use django's `migrate` command to create tables in postgres: ``` python manage.py migrate ``` -...and use `sharectl` to set up indexes in elasticsearch: +...and the `shtrove_search_setup` command to create indexes in elasticsearch: ``` python manage.py shtrove_search_setup --initial ``` diff --git a/how-to/use-the-api.md b/how-to/use-the-api.md index 2a220615b..7a89650d6 100644 --- a/how-to/use-the-api.md +++ b/how-to/use-the-api.md @@ -1,25 +1,29 @@ -# How to use the API +# how to use the api -(see [openapi docs](/trove/docs/openapi.html) for detail) +## searching and browsing -## Sample and search for index-cards +`GET /trove/index-card-search`: search for cards that identify and describe things -`GET /trove/index-card-search`: search index-cards +`GET /trove/index-value-search`: search for values (like identifiers) used on cards, which you can use in card-searches -`GET /trove/index-value-search`: search values for specific properties on index-cards +`GET /trove/browse?iri=...`: inquire about a thing you have already identified -## Posting index-cards +(see [openapi docs](/trove/docs/openapi.html) for detail and available parameters) + + +### Posting index-cards > NOTE: currently used only by other COS projects, not yet for public use, authorization required -`POST /trove/ingest?focus_iri=...&record_identifier=...`: +`POST /trove/ingest?focus_iri=...`: currently supports only `Content-Type: text/turtle` query params: - `focus_iri` (required): full iri of the focus resource, exactly as used in the request body -- `record_identifier` (required): a source-specific identifier for the metadata record (no format restrictions) -- sending another record with the same `record_identifier` is considered a full update (only the most recent is used) +- `record_identifier`: a source-specific identifier for the metadata record (if omitted, uses `focus_iri`) -- sending another record with the same `record_identifier` is considered a full update (only the most recent is used) - `nonurgent`: if present (regardless of value), ingestion may be given a lower priority -- recommended for bulk or background operations - `is_supplementary`: if present (regardless of value), this record's metadata will be added to all pre-existing index-cards from the same user with the same `focus_iri` (if any), but will not get an index-card of its own nor affect the last-updated timestamp (e.g. 
in OAI-PMH) of the index-cards it supplements +  - note: supplementary records must have a different `record_identifier` from the primary records for the same focus - `expiration_date`: optional date (in format `YYYY-MM-DD`) when the record is no longer valid and should be removed ## Deleting index-cards @@ -32,4 +36,3 @@ query params: `/oaipmh` -- an implementation of the Open Archives Initiative's [Protocol for Metadata Harvesting](https://www.openarchives.org/OAI/openarchivesprotocol.html), an open standard for harvesting metadata from open repositories. You can use this to list metadata in bulk, or query by a few simple parameters (date range or source). - diff --git a/project/settings.py b/project/settings.py index a29abf4ef..96fa1d00d 100644 --- a/project/settings.py +++ b/project/settings.py @@ -326,6 +326,7 @@ def split(string, delim): RABBITMQ_HEARTBEAT_TIMEOUT = int(os.environ.get('RABBITMQ_HEARTBEAT_TIMEOUT', 60)) CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_URL', 'amqp://{}:{}@{}:{}/{}'.format(RABBITMQ_USERNAME, RABBITMQ_PASSWORD, RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_VHOST)) +CELERY_WORKER_HIJACK_ROOT_LOGGER = False CELERY_BEAT_SCHEDULER = 'django_celery_beat.schedulers:DatabaseScheduler' CELERY_BEAT_SCHEDULE = { @@ -336,6 +337,8 @@ def split(string, delim): } CELERY_RESULT_BACKEND = 'share.celery:CeleryDatabaseBackend' +CELERY_RESULT_BACKEND_ALWAYS_RETRY = True +CELERY_RESULT_BACKEND_MAX_RETRIES = int(os.environ.get('CELERY_RESULT_BACKEND_MAX_RETRIES', 17)) CELERY_RESULT_EXPIRES = int(os.environ.get( 'CELERY_RESULT_EXPIRES', 60 * 60 * 24 * 3,  # 3 days diff --git a/project/static/img/shtroverview.png b/project/static/img/shtroverview.png new file mode 100644 index 000000000..0c78c3ebc Binary files /dev/null and b/project/static/img/shtroverview.png differ diff --git a/pyproject.toml b/pyproject.toml index 3dd8fa038..064e3ecd4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "shtrove" -version = "25.5.0" +version = "25.5.1" description = "" authors = [ {name = "CenterForOpenScience", email = "share-support@cos.io"} diff --git a/share/celery.py b/share/celery.py index 663ddbba9..962e08c11 100644 --- a/share/celery.py +++ b/share/celery.py @@ -4,14 +4,16 @@ from celery import states from celery.app.task import Context -from celery.backends.base import BaseDictBackend +from celery.backends.base import BaseBackend from celery.utils.time import maybe_timedelta - from django.conf import settings -from django.db import transaction +from django.db import ( +    transaction, +    IntegrityError as DjIntegrityError, +    OperationalError as DjOperationalError, +) from django.db.models import Q from django.utils import timezone - import sentry_sdk from share.models import CeleryTaskResult @@ -40,7 +42,7 @@ def wrapped(*args, **kwargs): # Based on https://github.com/celery/django-celery-results/commit/f88c677d66ba1eaf1b7cb1f3b8c910012990984f -class CeleryDatabaseBackend(BaseDictBackend): +class CeleryDatabaseBackend(BaseBackend): """ Implemented from scratch rather than subclassed due to: @@ -53,8 +55,53 @@ class CeleryDatabaseBackend(BaseDictBackend): """ TaskModel = CeleryTaskResult +    ### +    # decorate some methods to fully stop/restart the worker on unhandled errors, +    # including safe-to-retry errors that have been maximally retried +    # (restarting may resolve some problems; others it will merely make more visible) + +    @die_on_unhandled +    def get_task_meta(self, *args, **kwargs): +        return super().get_task_meta(*args, **kwargs) + +    @die_on_unhandled +    def 
store_result(self, *args, **kwargs): +        return super().store_result(*args, **kwargs) + +    @die_on_unhandled +    def forget(self, *args, **kwargs): +        super().forget(*args, **kwargs) + +    @die_on_unhandled +    def cleanup(self, expires=None): +        # no super implementation +        TaskResultCleaner( +            success_ttl=(expires or self.expires), +            nonsuccess_ttl=settings.FAILED_CELERY_RESULT_EXPIRES, +        ).clean() + +    # END die_on_unhandled decorations +    ### + +    # override BaseBackend +    def exception_safe_to_retry(self, exc): +        return isinstance(exc, ( +            DjOperationalError,  # connection errors and whatnot +            DjIntegrityError,  # e.g. overlapping transactions with conflicting `get_or_create` +        )) + +    # implement for BaseBackend     def _store_result(self, task_id, result, status, traceback=None, request=None, **kwargs): +        _already_successful = ( +            self.TaskModel.objects +            .filter(task_id=task_id, status=states.SUCCESS) +            .exists() +        ) +        if _already_successful: +            # avoid clobbering prior successful result, which could be caused by network partition or lost worker, ostensibly: +            # https://github.com/celery/celery/blob/92514ac88afc4ccdff31f3a1018b04499607ca1e/celery/backends/base.py#L967-L972 +            return +         fields = {             'result': result,             'traceback': traceback, @@ -88,20 +135,14 @@ def _store_result(self, task_id, result, status, traceback=None, request=None, *             setattr(obj, key, value)         obj.save() -        return obj - -    @die_on_unhandled -    def cleanup(self, expires=None): -        TaskResultCleaner( -            success_ttl=(expires or self.expires), -            nonsuccess_ttl=settings.FAILED_CELERY_RESULT_EXPIRES, -        ).clean() - -    @die_on_unhandled +    # implement for BaseBackend     def _get_task_meta_for(self, task_id): -        return self.TaskModel.objects.get(task_id=task_id).as_dict() +        try: +            return self.TaskModel.objects.get(task_id=task_id).as_dict() +        except self.TaskModel.DoesNotExist: +            return {'status': states.PENDING, 'result': None} -    @die_on_unhandled +    # implement for BaseBackend     def _forget(self, task_id):         try:             self.TaskModel.objects.get(task_id=task_id).delete()
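To make the `/trove/ingest` documentation in how-to/use-the-api.md above more concrete, here is a hedged sketch of posting a metadata record with Python's `requests` library. The host, token, and record content are made-up example values; the query parameters and the `text/turtle` content-type follow the documented behavior above (only `focus_iri` is required, and `record_identifier` defaults to it when omitted):

```python
# hedged sketch of ingesting one metadata record -- host, token, and record
# content are made-up examples; only the documented query params are used
import requests

_focus_iri = 'https://osf.example/abcde'  # the resource this record describes
_turtle_record = f'''
@prefix dcterms: <http://purl.org/dc/terms/> .
<{_focus_iri}> dcterms:title "an example thing" .
'''

_response = requests.post(
    'https://shtrove.example/trove/ingest',    # hypothetical shtrove host
    params={
        'focus_iri': _focus_iri,               # required; must match the iri used in the body
        'record_identifier': 'example:abcde',  # optional; defaults to focus_iri
        'nonurgent': '',                       # optional; lower priority, for bulk/background loads
    },
    headers={
        'Content-Type': 'text/turtle',         # currently the only supported content-type
        'Authorization': 'Bearer ...',         # authorization required; exact scheme is an assumption
    },
    data=_turtle_record,
)
_response.raise_for_status()
```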