From 2e88aa5bf003125603079c1df26ce4ab351e340d Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Wed, 10 Dec 2025 15:49:12 -0500 Subject: [PATCH 1/3] Default exclusion_list_path to None --- transmogrifier/sources/transformer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transmogrifier/sources/transformer.py b/transmogrifier/sources/transformer.py index 9a76cf5..9a7f281 100644 --- a/transmogrifier/sources/transformer.py +++ b/transmogrifier/sources/transformer.py @@ -190,7 +190,7 @@ def load( cls, source: str, source_file: str, - exclusion_list_path: str = "", + exclusion_list_path: str | None = None, run_id: str | None = None, run_timestamp: str | None = None, ) -> Transformer: @@ -200,6 +200,7 @@ def load( Args: source: Source repository label. Must match a source key from config.SOURCES. source_file: A file containing source records to be transformed. + exclusion_list_path: CSV filepath to use for explicitly skipping records. run_id: A unique identifier associated with this ETL run. run_timestamp: A timestamp associated with this ETL run. """ From fb64fa620784f94b4ce41f3e17825c91a6c2ceba Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Wed, 10 Dec 2025 15:53:31 -0500 Subject: [PATCH 2/3] Parse full HTML from mitlibwebsite source records Why these changes are being introduced: Now that browsertrix-harvester is including full HTML + response headers in the source record available to Transmogrifier, we can do two things: 1. Parse metadata for mitlibwebsite TIMDEX records from the original, full HTML in a more opinionated fashion than we could in browsertrix-harvester. 2. Extract good, meaningful full-text from the full HTML to use for the new `fulltext` field. How this addresses that need: Expects a new `html_base64` field in the browsertrix-harvester source records. Uses this to extract metadata and full-text for the record. Side effects of this change: * Full-text is now available in the TIMDEX record for the mitlibwebsite source. 
* If needed, this HTML parsing could be utilized to extract more granular, source-specific metadata in the future. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-259 --- pyproject.toml | 4 ++ .../mitlibwebsite/mitlibwebsite_records.jsonl | 6 +-- tests/fixtures/mitlibwebsite/website.html | 18 ++++++++ .../website_missing_og_description.html | 16 +++++++ tests/sources/json/test_mitlibwebsite.py | 19 ++++++-- transmogrifier/sources/json/mitlibwebsite.py | 46 +++++++++++++++++-- 6 files changed, 98 insertions(+), 11 deletions(-) create mode 100644 tests/fixtures/mitlibwebsite/website.html create mode 100644 tests/fixtures/mitlibwebsite/website_missing_og_description.html diff --git a/pyproject.toml b/pyproject.toml index f5aab2a..5081712 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,10 @@ disallow_untyped_calls = true disallow_untyped_defs = true exclude = ["tests/", "output/"] +[[tool.mypy.overrides]] +module = ["bs4", "bs4.*"] +ignore_missing_imports = true + [tool.pytest.ini_options] log_level = "INFO" diff --git a/tests/fixtures/mitlibwebsite/mitlibwebsite_records.jsonl b/tests/fixtures/mitlibwebsite/mitlibwebsite_records.jsonl index da23a20..ea8f0c8 100644 --- a/tests/fixtures/mitlibwebsite/mitlibwebsite_records.jsonl +++ b/tests/fixtures/mitlibwebsite/mitlibwebsite_records.jsonl @@ -1,3 +1,3 @@ -{"url": "https://libraries.mit.edu/search/", "cdx_warc_filename": "rec-ae2a62f6dc33-gh-test-2025-08-19-lib-website-full-no-depth-20250819202645354-7.warc.gz", "cdx_title": "Search | MIT Libraries", "cdx_offset": "303", "cdx_length": "40258", "og_title": "Search | MIT Libraries", "og_type": "website", "og_image": "https://libraries.mit.edu/app/themes/mitlib-parent/images/mit-libraries-logo-black-yellow-1200-1200.png", "og_url": "https://libraries.mit.edu/search/", "og_image_type": "image/png", "og_image_width": "1200", "og_image_height": "1200", "og_image_alt": "MIT Libraries logo", "og_description": "Use this page to learn about
different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. Go straight to our […]", "fulltext": null, "fulltext_keywords": "Search,Web of Science,Search Start,Account Search Account,Borrow Direct Request,locations Locations Hours,Citation search Books,Search Hours,Books,articles,Search tools,Search Account Contact,Account Contact Search,Research,Search Search,Collections,Borrow,request,Site search Hours,Locations Hours", "og_site_name": null, "fulltext_50_words": NaN} -{"url": "https://libraries.mit.edu/about7/", "cdx_warc_filename": "rec-ae2a62f6dc33-gh-test-2025-08-19-lib-website-full-no-depth-20250819202645342-4.warc.gz", "cdx_title": "About | MIT Libraries", "cdx_offset": "303", "cdx_length": "38310", "og_title": "About | MIT Libraries", "og_type": "website", "og_image": "https://libraries.mit.edu/app/themes/mitlib-parent/images/mit-libraries-logo-black-yellow-1200-1200.png", "og_url": "https://libraries.mit.edu/about7/", "og_image_type": "image/png", "og_image_width": "1200", "og_image_height": "1200", "og_image_alt": "MIT Libraries logo", "og_description": "Saving the world, bit by bit. Our vision? A world where more people have access to knowledge and more voices are heard. Where tools and solutions can be networked, shared, and “hacked.” Where knowledge not only provides answers, but empowers and inspires. We think libraries can help build this world. Let’s get started. 
Discover our mission, vision, and values Read the Future of Libraries Task Force Report Envisioning Hayden Library The Libraries are pursuing a renovation of Hayden that reflects our vision for the future of libraries. Learn more Furthering MIT’s mission The final recommendations from the Open Access Task […]", "fulltext": null, "fulltext_keywords": "Web of Science,Collections Books,locations Hours Map,locations Locations Hours,Search Start,Account Search Account,Borrow Direct Request,Search Hours,Search,Site search Hours,Search Account Contact,Distinctive Collections Lewis,Collections Lewis Music,Locations Hours,Research,reserves Borrow Direct,WorldCat Books,Borrow,Books,Databases A-Z JSTOR", "og_site_name": null, "fulltext_50_words": NaN} -{"url": "https://libraries.mit.edu/research-support/", "cdx_warc_filename": "rec-ae2a62f6dc33-gh-test-2025-08-19-lib-website-full-no-depth-20250819202645511-6.warc.gz", "cdx_title": "Research support | MIT Libraries", "cdx_offset": "303", "cdx_length": "37208", "og_title": "Research support | MIT Libraries", "og_type": "website", "og_image": "https://libraries.mit.edu/app/themes/mitlib-parent/images/mit-libraries-logo-black-yellow-1200-1200.png", "og_url": "https://libraries.mit.edu/research-support/", "og_image_type": "image/png", "og_image_width": "1200", "og_image_height": "1200", "og_image_alt": "MIT Libraries logo", "og_description": "Find out where and how to get help for your research or coursework. Expert help Make an appointment for a consult with an expert librarian Research guides: specialized guides for every research interest Not sure where to begin or who to contact? Complete our Ask us email form. 
Quick help Connect from on & off-campus E-resource troubleshooting Guides for specific classes & programs Writing & publishing help Citing sources & avoiding plagiarism Dissertations/theses: prepare & submit your MIT thesis, or borrow others’  MIT thesis specifications DSpace@MIT: deposit your work to MIT’s repository Scholarly Communication Author identifiers: connect your name with […]", "fulltext": null, "fulltext_keywords": "Web of Science,Collections Books,Search Start,Research,Databases A-Z JSTOR,Distinctive Collections Lewis,Collections Lewis Music,locations Hours Map,Research support,Search,Borrow,Books,Search Hours,Site search Hours,WorldCat Books,locations Locations Hours,Account Search Account,Search Account Contact,request,Borrow Direct Request", "og_site_name": null, "fulltext_50_words": NaN} \ No newline at end of file +{"url": "https://libraries.mit.edu/search/", "status": "active", "cdx_warc_filename": "rec-595f2ddb3156-mitlibwebsite-20251209142416123-6.warc.gz", "cdx_title": "Search | MIT Libraries", "cdx_offset": "3524959", "cdx_length": "40691", "html_base64": "", "response_headers": {"accept-ranges": ["bytes"], "age": ["427428"], "cache-control": ["public, max-age=604800"], "content-length": ["129095"], "content-type": ["text/html; charset=UTF-8"], "date": ["Tue, 09 Dec 2025 14:33:41 GMT"], "link": ["; rel=\"https://api.w.org/\", ; rel=\"alternate\"; title=\"JSON\"; type=\"application/json\", ; rel=shortlink"], "permissions-policy": ["geolocation=(), microphone=(), camera=()"], "referrer-policy": ["no-referrer-when-downgrade"], "server": ["nginx"], "strict-transport-security": ["max-age=300"], "vary": ["Accept-Encoding, Cookie"], "via": ["1.1 varnish"], "x-cache": ["HIT"], "x-cache-hits": ["1"], "x-content-type-options": ["nosniff"], "x-frame-options": ["SAMEORIGIN"], "x-pantheon-styx-hostname": ["styx-fe3-b-5df8569779-zckhf"], "x-served-by": ["cache-chi-kigq8000105-CHI"], "x-styx-req-id": ["dfbd42aa-d128-11f0-85c8-4ea4934838e2"], "x-timer": 
["S1765290822.680357,VS0,VE3"], "x-orig-content-encoding": ["gzip"]}} +{"url": "https://libraries.mit.edu/about7/", "status": "active", "cdx_warc_filename": "rec-595f2ddb3156-mitlibwebsite-20251209142416123-6.warc.gz", "cdx_title": "About | MIT Libraries", "cdx_offset": "249278", "cdx_length": "38741", "html_base64": "", "response_headers": {"accept-ranges": ["bytes"], "age": ["406550"], "cache-control": ["public, max-age=604800"], "content-length": ["121192"], "content-type": ["text/html; charset=UTF-8"], "date": ["Tue, 09 Dec 2025 14:24:53 GMT"], "link": ["; rel=\"https://api.w.org/\", ; rel=\"alternate\"; title=\"JSON\"; type=\"application/json\", ; rel=shortlink"], "permissions-policy": ["geolocation=(), microphone=(), camera=()"], "referrer-policy": ["no-referrer-when-downgrade"], "server": ["nginx"], "strict-transport-security": ["max-age=300"], "vary": ["Accept-Encoding, Cookie"], "via": ["1.1 varnish"], "x-cache": ["HIT"], "x-cache-hits": ["1"], "x-content-type-options": ["nosniff"], "x-frame-options": ["SAMEORIGIN"], "x-pantheon-styx-hostname": ["styx-fe3-b-5df8569779-lfsfb"], "x-served-by": ["cache-chi-kigq8000172-CHI"], "x-styx-req-id": ["416c60bc-d158-11f0-bd91-c6a1c2f971b5"], "x-timer": ["S1765290293.170547,VS0,VE4"], "x-orig-content-encoding": ["gzip"]}} +{"url": "https://libraries.mit.edu/research-support/", "status": "active", "cdx_warc_filename": "rec-595f2ddb3156-mitlibwebsite-20251209142415718-4.warc.gz", "cdx_title": "Research support | MIT Libraries", "cdx_offset": "1972108", "cdx_length": "37641", "html_base64": "", "response_headers": {"accept-ranges": ["bytes"], "age": ["421990"], "cache-control": ["public, max-age=604800"], "content-length": ["117657"], "content-type": ["text/html; charset=UTF-8"], "date": ["Tue, 09 Dec 2025 14:30:06 GMT"], "link": ["; rel=\"https://api.w.org/\", ; rel=\"alternate\"; title=\"JSON\"; type=\"application/json\", ; rel=shortlink"], "permissions-policy": ["geolocation=(), microphone=(), camera=()"], 
"referrer-policy": ["no-referrer-when-downgrade"], "server": ["nginx"], "strict-transport-security": ["max-age=300"], "vary": ["Accept-Encoding, Cookie"], "via": ["1.1 varnish"], "x-cache": ["HIT"], "x-cache-hits": ["1"], "x-content-type-options": ["nosniff"], "x-frame-options": ["SAMEORIGIN"], "x-pantheon-styx-hostname": ["styx-fe3-b-5df8569779-pqmgb"], "x-served-by": ["cache-chi-kigq8000105-CHI"], "x-styx-req-id": ["08be92e8-d135-11f0-a108-12562a46f4ee"], "x-timer": ["S1765290606.393752,VS0,VE3"], "x-orig-content-encoding": ["gzip"]}} diff --git a/tests/fixtures/mitlibwebsite/website.html b/tests/fixtures/mitlibwebsite/website.html new file mode 100644 index 0000000..23e7d61 --- /dev/null +++ b/tests/fixtures/mitlibwebsite/website.html @@ -0,0 +1,18 @@ + + + + + + +
+

Not Helpful

+
+ +

Hello World!

+
+
+

Also Not Helpful

+
+ + \ No newline at end of file diff --git a/tests/fixtures/mitlibwebsite/website_missing_og_description.html b/tests/fixtures/mitlibwebsite/website_missing_og_description.html new file mode 100644 index 0000000..047cee4 --- /dev/null +++ b/tests/fixtures/mitlibwebsite/website_missing_og_description.html @@ -0,0 +1,16 @@ + + + + + +
+

Not Helpful

+
+ +

Hello World!

+
+
+

Also Not Helpful

+
+ + \ No newline at end of file diff --git a/tests/sources/json/test_mitlibwebsite.py b/tests/sources/json/test_mitlibwebsite.py index 752dca2..033fe2f 100644 --- a/tests/sources/json/test_mitlibwebsite.py +++ b/tests/sources/json/test_mitlibwebsite.py @@ -1,15 +1,22 @@ # ruff: noqa: RUF001 +import base64 from unittest.mock import MagicMock, patch import transmogrifier.models as timdex from transmogrifier.sources.json.mitlibwebsite import MITLibWebsite -def create_mitlibwebsite_source_record_stub() -> dict: +def create_mitlibwebsite_source_record_stub( + html_filepath="tests/fixtures/mitlibwebsite/website.html", +) -> dict: + with open(html_filepath) as f: + html_content = f.read() + return { "url": "https://libraries.mit.edu/search/", "cdx_title": "Search | MIT Libraries", - "og_description": "Use this page to learn about different ways you can search the MIT Libraries' offerings.", # noqa: E501 + "html_base64": base64.b64encode(html_content.encode()).decode(), + "response_headers": {}, } @@ -45,6 +52,7 @@ def test_mitlibwebsite_transform_returns_timdex_record(mitlibwebsite_records): summary=[ "Use this page to learn about different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. 
Go straight to our […]" # noqa: E501 ], + fulltext=timdex_record.fulltext, ) @@ -119,13 +127,14 @@ def test_mitlibwebsite_get_links_success(): def test_mitlibwebsite_get_summary_success(): source_record = create_mitlibwebsite_source_record_stub() assert MITLibWebsite.get_summary(source_record) == [ - "Use this page to learn about different ways you can search the MIT Libraries' offerings." # noqa: E501 + "Use this page to learn about different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. Go straight to our […]" # noqa: E501 ] def test_mitlibwebsite_get_summary_returns_none_if_og_description_is_none(): - source_record = create_mitlibwebsite_source_record_stub() - source_record["og_description"] = None + source_record = create_mitlibwebsite_source_record_stub( + html_filepath="tests/fixtures/mitlibwebsite/website_missing_og_description.html" + ) assert MITLibWebsite.get_summary(source_record) is None diff --git a/transmogrifier/sources/json/mitlibwebsite.py b/transmogrifier/sources/json/mitlibwebsite.py index c933e56..442fcd9 100644 --- a/transmogrifier/sources/json/mitlibwebsite.py +++ b/transmogrifier/sources/json/mitlibwebsite.py @@ -1,5 +1,9 @@ +import base64 import hashlib import logging +from functools import lru_cache + +from bs4 import BeautifulSoup, Tag import transmogrifier.models as timdex from transmogrifier.sources.jsontransformer import JSONTransformer @@ -10,6 +14,30 @@ class MITLibWebsite(JSONTransformer): + @classmethod + @lru_cache(maxsize=8) + def parse_html(cls, 
html_base64: str) -> Tag: + """Parse HTML from base64 encoded ASCII string. + + For this mitlibwebsite source, also remove the
and