Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
547 changes: 276 additions & 271 deletions Pipfile.lock

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ disallow_untyped_calls = true
disallow_untyped_defs = true
exclude = ["tests/", "output/"]

[[tool.mypy.overrides]]
module = ["bs4", "bs4.*"]
ignore_missing_imports = true

[tool.pytest.ini_options]
log_level = "INFO"

Expand Down
6 changes: 3 additions & 3 deletions tests/fixtures/mitlibwebsite/mitlibwebsite_records.jsonl

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions tests/fixtures/mitlibwebsite/website.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<html>
<head>
<meta property="og:title" content="Search | MIT Libraries">
<meta property="og:description"
content="Use this page to learn about different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. Go straight to our […]">
</head>
<body>
<header>
<h1>Not Helpful</h1>
</header>
<content>
<p>Hello World!</p>
</content>
<footer>
<h1>Also Not Helpful</h1>
</footer>
</body>
</html>
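16 changes: 16 additions & 0 deletions tests/fixtures/mitlibwebsite/website_missing_og_description.html
Original file line number Diff line number Diff line change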
@@ -0,0 +1,16 @@
<html>
<head>
<meta property="og:title" content="Search | MIT Libraries">
</head>
<body>
<header>
<h1>Not Helpful</h1>
</header>
<content>
<p>Hello World!</p>
</content>
<footer>
<h1>Also Not Helpful</h1>
</footer>
</body>
</html>
19 changes: 14 additions & 5 deletions tests/sources/json/test_mitlibwebsite.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
# ruff: noqa: RUF001
import base64
from unittest.mock import MagicMock, patch

import transmogrifier.models as timdex
from transmogrifier.sources.json.mitlibwebsite import MITLibWebsite


def create_mitlibwebsite_source_record_stub(
    html_filepath: str = "tests/fixtures/mitlibwebsite/website.html",
) -> dict:
    """Build a minimal mitlibwebsite source record from an HTML fixture file.

    Args:
        html_filepath: Path of the HTML file whose contents are base64 encoded
            into the record's "html_base64" field.

    Returns:
        A dict with the "url", "cdx_title", "html_base64", and
        "response_headers" keys.
    """
    # Read explicitly as UTF-8: the fixture contains non-ASCII characters
    # (e.g. "’" and "…"), so relying on the platform default encoding is not
    # portable.
    with open(html_filepath, encoding="utf-8") as html_file:
        html_content = html_file.read()

    return {
        "url": "https://libraries.mit.edu/search/",
        "cdx_title": "Search | MIT Libraries",
        "html_base64": base64.b64encode(html_content.encode()).decode(),
        "response_headers": {},
    }


Expand Down Expand Up @@ -45,6 +52,7 @@ def test_mitlibwebsite_transform_returns_timdex_record(mitlibwebsite_records):
summary=[
"Use this page to learn about different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. Go straight to our […]" # noqa: E501
],
fulltext=timdex_record.fulltext,
)


Expand Down Expand Up @@ -119,13 +127,14 @@ def test_mitlibwebsite_get_links_success():
def test_mitlibwebsite_get_summary_success():
    # The expected text must match the og:description content of the default
    # website.html fixture character for character, including the "’"
    # apostrophes.
    source_record = create_mitlibwebsite_source_record_stub()
    assert MITLibWebsite.get_summary(source_record) == [
        "Use this page to learn about different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. Go straight to our […]"  # noqa: E501
    ]


def test_mitlibwebsite_get_summary_returns_none_if_og_description_is_none():
    # Build the record from the fixture named for its missing og:description
    # tag; the summary is read from the record's HTML, so the fixture (not a
    # record field) controls this case.
    source_record = create_mitlibwebsite_source_record_stub(
        html_filepath="tests/fixtures/mitlibwebsite/website_missing_og_description.html"
    )
    assert MITLibWebsite.get_summary(source_record) is None


Expand Down
46 changes: 43 additions & 3 deletions transmogrifier/sources/json/mitlibwebsite.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import base64
import hashlib
import logging
from functools import lru_cache

from bs4 import BeautifulSoup, Tag

import transmogrifier.models as timdex
from transmogrifier.sources.jsontransformer import JSONTransformer
Expand All @@ -10,6 +14,30 @@

class MITLibWebsite(JSONTransformer):

@classmethod
@lru_cache(maxsize=8)
def parse_html(cls, html_base64: str) -> Tag:
    """Decode a base64 string of HTML and parse it with BeautifulSoup.

    The first <header> and <footer> elements that are direct children of
    <body> are removed from the parsed tree before it is returned, since for
    this mitlibwebsite source they are not useful for metadata or fulltext.

    Results are memoized with an LRU cache, so a given base64 string is only
    parsed once while it remains cached. The cache holds up to 8 entries,
    sized with 8 concurrent transformations in mind (raise it if more threads
    are used).
    """
    decoded_html = base64.b64decode(html_base64)
    soup = BeautifulSoup(decoded_html, "html.parser")

    # strip the page header and footer, in that order
    for selector in ("body > header", "body > footer"):
        element = soup.select_one(selector)
        if element:
            element.decompose()

    return soup

@classmethod
def get_main_titles(cls, source_record: dict) -> list[str]:
"""
Expand Down Expand Up @@ -81,12 +109,24 @@ def get_dates(self, _source_record: dict) -> list[timdex.Date]:
def get_format(self, _source_record: dict) -> str:
    """Return the fixed format label; the source record is not consulted."""
    return "electronic resource"

def get_fulltext(self, source_record: dict) -> str:
    """Return the text of the record's parsed HTML.

    The HTML is taken from the record's "html_base64" value via parse_html;
    text fragments are stripped and joined with single spaces.
    """
    return self.parse_html(source_record["html_base64"]).get_text(
        separator=" ", strip=True
    )

@classmethod
def get_links(cls, source_record: dict) -> list[timdex.Link]:
    """Return a single "Website" link built from the record's "url" value."""
    website_link = timdex.Link(url=source_record["url"], kind="Website")
    return [website_link]

@classmethod
def get_summary(cls, source_record: dict) -> list[str] | None:
    """Return the page's og:description as a one-item list, if present.

    The description is read from the <meta property="og:description"> tag of
    the HTML parsed from the record's "html_base64" value.

    Returns:
        A list holding the stripped "content" attribute of the tag, or None
        when the tag is absent or its content is missing or blank.
    """
    html_soup = cls.parse_html(source_record["html_base64"])

    og_tag = html_soup.find("meta", attrs={"property": "og:description"})
    if not og_tag:
        return None

    # a tag without a "content" attribute is treated like an empty description
    content = og_tag.get("content", "").strip()
    return [content] if content else None
3 changes: 2 additions & 1 deletion transmogrifier/sources/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def load(
cls,
source: str,
source_file: str,
exclusion_list_path: str = "",
exclusion_list_path: str | None = None,
run_id: str | None = None,
run_timestamp: str | None = None,
) -> Transformer:
Expand All @@ -200,6 +200,7 @@ def load(
Args:
source: Source repository label. Must match a source key from config.SOURCES.
source_file: A file containing source records to be transformed.
exclusion_list_path: CSV filepath to use for explicitly skipping records.
run_id: A unique identifier associated with this ETL run.
run_timestamp: A timestamp associated with this ETL run.
"""
Expand Down