From 124cad10c670a88254e4856b1dc293a83381e612 Mon Sep 17 00:00:00 2001 From: Oscar Arbelaez Date: Sun, 31 Aug 2025 14:13:47 +0100 Subject: [PATCH 1/3] fix: conform to the new scopus format Scopus changed the format by not providing the Abbreviated Source Title and changing the format of the authors. We'll use the Source title field now, and the authors seem to come in `initials, last name, first name` format. So we're going to re-format the authors as `last name, initials` as it was before. --- src/bibx/cli.py | 8 ++++++++ src/bibx/sources/scopus_csv.py | 19 +++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/bibx/cli.py b/src/bibx/cli.py index a3a094d..e34f0af 100644 --- a/src/bibx/cli.py +++ b/src/bibx/cli.py @@ -109,6 +109,14 @@ def openalex( rprint(graph) +@app.command() +def csv(filename: str) -> None: + """Parse a scopus CSV file and print the collection.""" + with open(filename) as f: + c = read_scopus_csv(f) + rprint(list(c.citation_pairs)) + + def main() -> None: """Entry point for the CLI.""" app() diff --git a/src/bibx/sources/scopus_csv.py b/src/bibx/sources/scopus_csv.py index 9b35ad0..8d0877e 100644 --- a/src/bibx/sources/scopus_csv.py +++ b/src/bibx/sources/scopus_csv.py @@ -13,6 +13,8 @@ from .base import Source +_NUM_AHTOR_PARTS = 3 + logger = logging.getLogger(__name__) @@ -24,6 +26,19 @@ def _split_str(value: str | None) -> list[str]: return value.strip().split("; ") if value else [] +def _rotate_authors(authors: list[str]) -> list[str]: + result = [] + for author in authors: + parts = author.split(", ") + if len(parts) != _NUM_AHTOR_PARTS: + logger.debug("unexpected author format: %s", author) + result.append(author) + continue + initials, last, _ = parts + result.append(f"{last}, {initials}") + return result + + class Row(BaseModel): """Row model for Scopus CSV data.""" @@ -34,7 +49,7 @@ class Row(BaseModel): ] year: Annotated[int, Field(validation_alias="Year")] title: Annotated[str, 
Field(validation_alias="Title")] - journal: Annotated[str, Field(validation_alias="Abbreviated Source Title")] + journal: Annotated[str, Field(validation_alias="Source title")] volume: Annotated[ str | None, Field(validation_alias="Volume"), @@ -110,7 +125,7 @@ def _parse_file(self, file: TextIO) -> Generator[Article, None, None]: label="", ids=set(), title=datum.title, - authors=datum.authors, + authors=_rotate_authors(datum.authors), year=datum.year, journal=datum.journal, volume=datum.volume, From 7e0058b80016cec27c99d3c18b5fd08557717b14 Mon Sep 17 00:00:00 2001 From: Oscar Arbelaez Date: Sun, 31 Aug 2025 14:36:25 +0100 Subject: [PATCH 2/3] prepare release --- src/bibx/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bibx/__init__.py b/src/bibx/__init__.py index 9584b61..b352e88 100644 --- a/src/bibx/__init__.py +++ b/src/bibx/__init__.py @@ -28,7 +28,7 @@ "read_wos", ] -__version__ = "0.8.0" +__version__ = "0.9.0" def query_openalex( From 0d3a791b975c4318680682a356a358814cb4f536 Mon Sep 17 00:00:00 2001 From: Oscar Arbelaez Date: Sun, 31 Aug 2025 14:41:27 +0100 Subject: [PATCH 3/3] fix typo --- src/bibx/sources/scopus_csv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bibx/sources/scopus_csv.py b/src/bibx/sources/scopus_csv.py index 8d0877e..07b7a01 100644 --- a/src/bibx/sources/scopus_csv.py +++ b/src/bibx/sources/scopus_csv.py @@ -13,7 +13,7 @@ from .base import Source -_NUM_AHTOR_PARTS = 3 +_NUM_AUTHOR_PARTS = 3 logger = logging.getLogger(__name__) @@ -30,7 +30,7 @@ def _rotate_authors(authors: list[str]) -> list[str]: result = [] for author in authors: parts = author.split(", ") - if len(parts) != _NUM_AHTOR_PARTS: + if len(parts) != _NUM_AUTHOR_PARTS: logger.debug("unexpected author format: %s", author) result.append(author) continue