Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python_actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:

- name: Install dependencies
run: |
python -m pip install --upgrade setuptools pip
python -m pip install --upgrade setuptools "pip<24.1"
pip install -r requirements.txt
pip install -r dev-requirements.txt
- name: Test with pytest
Expand Down
2 changes: 1 addition & 1 deletion README.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ This pipeline is to process source reference files, if xml to parse them first a
python run.py RESOLVE -s <list of source filenames separated by spaces>
```

2. Specify a directory, and file extension, to recursively search all sub directories for this type of reference file, and queue them all for processing, use the command
2. To specify a directory and file extension (e.g. -e *.raw), recursively search all subdirectories for this type of reference file, and queue them all for processing, use the command
```
python run.py RESOLVE -p <source files path> -e <source files extension>
```
Expand Down
635 changes: 391 additions & 244 deletions adsrefpipe/app.py
100644 → 100755

Large diffs are not rendered by default.

153 changes: 90 additions & 63 deletions adsrefpipe/models.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,19 @@ class Action(Base):
__tablename__ = 'action'
status = Column(String, primary_key=True)

def get_status_new(self):
def get_status_new(self) -> str:
"""
returns the initial status

:return:
:return: string indicating the initial status
"""
return 'initial'

def get_status_retry(self):
def get_status_retry(self) -> str:
"""
returns the retry status

:return:
:return: string indicating the retry status
"""
return 'retry'

Expand All @@ -49,50 +51,57 @@ class Parser(Base):
reference_service_endpoint = Column(String)
matches = Column(JSONB, default=dict)

def __init__(self, name, extension_pattern, reference_service_endpoint, matches=[]):
def __init__(self, name: str, extension_pattern: str, reference_service_endpoint: str, matches: list = []):
"""
initializes a parser object

:param name:
:param extension_pattern:
:param reference_service_endpoint:
:param matches:
:param name: name of the parser
:param extension_pattern: reference file extension pattern used by the parser
:param reference_service_endpoint: endpoint for the reference service
:param matches: list of matches for the parser-reference file mapping
"""
self.name = name
self.extension_pattern = extension_pattern
self.reference_service_endpoint = reference_service_endpoint
self.matches = matches

def get_name(self):
def get_name(self) -> str:
"""
returns the name of the parser

:return:
:return: string indicating the name of the parser
"""
return self.name

def get_extension_pattern(self):
def get_extension_pattern(self) -> str:
"""
returns the extension pattern of the reference files processed by the parser

:return:
:return: string indicating the file extension pattern
"""
return self.extension_pattern

def get_endpoint(self):
def get_endpoint(self) -> str:
"""
returns the reference service endpoint to resolve references

:return:
:return: string indicating the reference service endpoint
"""
return self.reference_service_endpoint

def get_matches(self):
def get_matches(self) -> list:
"""
returns the list of mappings for the parser

:return:
:return: list of matches
"""
return self.matches

def toJSON(self):
def toJSON(self) -> dict:
"""
:return: values formatted as python dict
converts the parser object to a JSON dictionary

:return: dictionary containing the parser details
"""
return {
'name': self.name,
Expand All @@ -103,28 +112,37 @@ def toJSON(self):


class ReferenceSource(Base):
"""
This class represents the source of a reference in the database.
Each entry links a source file with its resolved version and
the parser used to process the reference.
It serves as the initial record for the reference processing pipeline.
"""
__tablename__ = 'reference_source'
bibcode = Column(String, primary_key=True)
source_filename = Column(String, primary_key=True)
resolved_filename = Column(String)
parser_name = Column(String, ForeignKey('parser.name'))

def __init__(self, bibcode, source_filename, resolved_filename, parser_name):
def __init__(self, bibcode: str, source_filename: str, resolved_filename: str, parser_name: str):
"""
initializes a reference source object

:param bibcode:
:param source_filename:
:param resolved_filename:
:param parser_name:
:param bibcode: unique bibcode for the reference source
:param source_filename: name of the reference file
:param resolved_filename: name of the resolved file for future use
:param parser_name: name of the parser used
"""
self.bibcode = bibcode
self.source_filename = source_filename
self.resolved_filename = resolved_filename
self.parser_name = parser_name

def toJSON(self):
def toJSON(self) -> dict:
"""
:return: values formatted as python dict, if no values found returns empty structure, not None
converts the reference source object to a JSON dictionary

:return: dictionary containing reference source details
"""
return {
'bibcode': self.bibcode,
Expand All @@ -135,6 +153,10 @@ def toJSON(self):


class ProcessedHistory(Base):
"""
This class tracks the processing history of a resolved reference, recording details about the processing status,
reference file timestamp, and the total number of references parsed.
"""
__tablename__ = 'processed_history'
__table_args__ = (ForeignKeyConstraint( ['bibcode', 'source_filename'], ['reference_source.bibcode', 'reference_source.source_filename']),)
id = Column(Integer, primary_key=True)
Expand All @@ -145,15 +167,16 @@ class ProcessedHistory(Base):
date = Column(DateTime, default=func.now())
total_ref = Column(Integer)

def __init__(self, bibcode, source_filename, source_modified, status, date, total_ref):
def __init__(self, bibcode: str, source_filename: str, source_modified: DateTime, status: str, date: DateTime, total_ref: int):
"""
initializes a processed history object

:param bibcode:
:param source_filename:
:param source_modified:
:param status:
:param date:
:param total_ref:
:param bibcode: bibcode for the reference source
:param source_filename: name of the source reference file
:param source_modified: timestamp of the reference file at the time it was read
:param status: first time processing, or reprocessing this list of references
:param date: date of processing
:param total_ref: total number of references parsed
"""
self.bibcode = bibcode
self.source_filename = source_filename
Expand All @@ -162,9 +185,11 @@ def __init__(self, bibcode, source_filename, source_modified, status, date, tota
self.date = date
self.total_ref = total_ref

def toJSON(self):
def toJSON(self) -> dict:
"""
:return: values formatted as python dict, if no values found returns empty structure, not None
converts the processed history object to a JSON dictionary

:return: dictionary containing processed history details
"""
return {
'bibcode': self.bibcode,
Expand All @@ -177,6 +202,10 @@ def toJSON(self):


class ResolvedReference(Base):
"""
This class stores information about references that have been resolved, including the reference string, score,
and its associated history entry.
"""
__tablename__ = 'resolved_reference'
history_id = Column(Integer, ForeignKey('processed_history.id'), primary_key=True)
item_num = Column(Integer, primary_key=True)
Expand All @@ -185,14 +214,16 @@ class ResolvedReference(Base):
score = Column(Numeric)
reference_raw = Column(String)

def __init__(self, history_id, item_num, reference_str, bibcode, score, reference_raw):
def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str):
"""
initializes a resolved reference object

:param history_id:
:param item_num
:param reference_str:
:param bibcode:
:param score:
:param history_id: ID of the related processed history entry
:param item_num: order of the reference within the source
:param reference_str: reference string
:param bibcode: resolved bibcode
:param score: confidence score of the resolved reference
:param reference_raw: raw reference string
"""
self.history_id = history_id
self.item_num = item_num
Expand All @@ -201,35 +232,28 @@ def __init__(self, history_id, item_num, reference_str, bibcode, score, referenc
self.score = score
self.reference_raw = reference_raw

def toJSON(self):
def toJSON(self) -> dict:
"""
:return: values formatted as python dict, if no values found returns empty structure, not None
converts the resolved reference object to a JSON dictionary

:return: dictionary containing resolved reference details
"""
if self.reference_raw:
return {
'history_id': self.history_id,
'reference_str': self.reference_str,
'bibcode': self.bibcode,
'score': self.score,
'item_num': self.item_num,
'reference_raw': self.reference_raw
}
# do not include reference_raw if it is None
return {
'history_id': self.history_id,
'reference_str': self.reference_str,
'bibcode': self.bibcode,
'score': self.score,
'item_num': self.item_num,
**({'reference_raw': self.reference_raw} if self.reference_raw else {})
}


class CompareClassic(Base):
"""
This table is for comparing classic resolver with service reference,
keeps track of service reference that matched classic reference
bibcode and score here is for classic

bibcode and score here are for classic; this should be a temporary class
only used during development/testing and verification
"""
__tablename__ = 'compare_classic'
history_id = Column(Integer, ForeignKey('processed_history.id'), primary_key=True)
Expand All @@ -238,24 +262,27 @@ class CompareClassic(Base):
score = Column(Numeric)
state = Column(String)

def __init__(self, history_id, item_num, bibcode, score, state):
def __init__(self, history_id: int, item_num: int, bibcode: str, score: Numeric, state: str):
"""
initializes a compare classic object

:param history_id:
:param item_num:
:param bibcode:
:param classic_score:
:param state:
:param history_id: ID of the related processed history entry
:param item_num: order of the reference within the source
:param bibcode: resolved bibcode
:param score: confidence score of the resolved reference
:param state: comparison state (e.g. matched, unmatched)
"""
self.history_id = history_id
self.item_num = item_num
self.bibcode = bibcode
self.score = score
self.state = state

def toJSON(self):
def toJSON(self) -> dict:
"""
:return: values formatted as python dict, if no values found returns empty structure, not None
converts the compare classic object to a JSON dictionary

:return: dictionary containing compare classic details
"""
return {
'history_id': self.history_id,
Expand Down
30 changes: 21 additions & 9 deletions adsrefpipe/refparsers/AASxml.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@

import sys, os
import regex as re
import argparse
from typing import List, Dict

from adsputils import setup_logging, load_config

logger = setup_logging('refparsers')
config = {}
config.update(load_config())
Expand All @@ -15,9 +14,14 @@


class AASreference(XMLreference):
"""
This class handles parsing AAS references in XML format. It extracts citation information such as authors,
year, journal, title, volume, pages, DOI, and eprint, and stores the parsed details.
"""

def parse(self):
"""
parse the AAS reference and extract citation information such as authors, year, title, and DOI

:return:
"""
Expand Down Expand Up @@ -50,22 +54,26 @@ def parse(self):


class AAStoREFs(XMLtoREFs):
"""
This class converts AAS XML references to a standardized reference format. It processes raw AAS references from
either a file or a buffer and outputs parsed references, including bibcodes, authors, volume, pages, and DOI.
"""

def __init__(self, filename, buffer):
def __init__(self, filename: str, buffer: str):
"""
initialize the AAStoREFs object to process AAS references

:param filename:
:param buffer:
:param unicode:
:param tag:
:param filename: the path to the source file
:param buffer: the XML references as a buffer
"""
XMLtoREFs.__init__(self, filename, buffer, parsername=AAStoREFs, tag='CITATION')


def process_and_dispatch(self):
def process_and_dispatch(self) -> List[Dict[str, List[Dict[str, str]]]]:
"""
perform reference cleaning and parsing, then dispatch the parsed references

:return:
:return: a list of dictionaries containing bibcodes and parsed references
"""
references = []
for raw_block_references in self.raw_references:
Expand All @@ -90,6 +98,10 @@ def process_and_dispatch(self):
return references


# This is the main program used for manual testing and verification of AASxml references.
# It allows parsing references from either a file or a buffer, and if no input is provided,
# it runs a source test file to verify the functionality against expected parsed results.
# The test results are printed to indicate whether the parsing is successful or not.
from adsrefpipe.tests.unittests.stubdata import parsed_references
if __name__ == '__main__': # pragma: no cover
parser = argparse.ArgumentParser(description='Parse AAS references')
Expand Down
Loading