Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# CODEOWNERS file (from GitHub template at
# https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners)
# Each line is a file pattern followed by one or more owners.

##############################################################################
# These owners will be the default owners for everything in the repo. Unless a
# later match takes precedence, @ghukill will be requested for review
# when someone opens a pull request. This is commented out in favor of using a
# team as the default (see below). It is left here as a comment to indicate
# the primary expert for this code.
# * @ghukill

# Teams can be specified as code owners as well. Teams should be identified in
# the format @org/team-name. Teams must have explicit write access to the
# repository.
* @mitlibraries/dataeng

# We set the senior engineer on the team as the owner of the CODEOWNERS file as
# a layer of protection against unauthorized changes.
/.github/CODEOWNERS @ghukill
1,586 changes: 820 additions & 766 deletions Pipfile.lock

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions tests/fixtures/mitlibwebsite/fulltext_libguides.html
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very helpfully formatted fixture!

Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<html>
<head>
<meta property="og:description" content="Test description">
</head>
<body>
<header>
<h1>Header content to ignore</h1>
</header>
<div class="s-lib-header">Staff Directory Header</div>
<div class="s-lib-main">
<p>Libguides directory content here.</p>
</div>
<footer>
<h1>Footer content to ignore</h1>
</footer>
</body>
</html>
16 changes: 16 additions & 0 deletions tests/fixtures/mitlibwebsite/fulltext_wordpress.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<html>
<head>
<meta property="og:description" content="Test description">
</head>
<body>
<header>
<h1>Header content to ignore</h1>
</header>
<div class="content-main">
<p>WordPress main content here.</p>
</div>
<footer>
<h1>Footer content to ignore</h1>
</footer>
</body>
</html>
16 changes: 16 additions & 0 deletions tests/fixtures/mitlibwebsite/fulltext_wordpress_no_content.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<html>
<head>
<meta property="og:description" content="Test description">
</head>
<body>
<header>
<h1>Header content</h1>
</header>
<div class="some-other-class">
<p>Content without expected class.</p>
</div>
<footer>
<h1>Footer content</h1>
</footer>
</body>
</html>
32 changes: 32 additions & 0 deletions tests/sources/json/test_mitlibwebsite.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,35 @@ def test_mitlibwebsite_record_is_deleted_returns_false_when_status_is_not_delete
def test_mitlibwebsite_record_is_deleted_returns_false_when_status_field_is_missing():
    """record_is_deleted returns False for a stub record built with defaults.

    Per the test name, the default stub is expected to carry no status field.
    """
    stub_record = create_mitlibwebsite_source_record_stub()
    deleted = MITLibWebsite.record_is_deleted(stub_record)
    assert deleted is False


def test_mitlibwebsite_get_fulltext_extracts_wordpress_content_main():
    """get_fulltext keeps the WordPress main content and omits header/footer text."""
    record = create_mitlibwebsite_source_record_stub(
        html_filepath="tests/fixtures/mitlibwebsite/fulltext_wordpress.html"
    )
    transformer = MITLibWebsite("mitlibwebsite", iter([record]))

    extracted = transformer.get_fulltext(record)

    # Main content must survive extraction.
    assert "WordPress main content here." in extracted
    # Page chrome must not leak into the full text.
    for ignored in ("Header content to ignore", "Footer content to ignore"):
        assert ignored not in extracted


def test_mitlibwebsite_get_fulltext_extracts_libguides_directory():
    """get_fulltext keeps LibGuides header and main text for a libguides URL.

    The record's URL is overridden to a libguides.mit.edu address before
    extraction; header and footer text must be absent from the result.
    """
    record = create_mitlibwebsite_source_record_stub(
        html_filepath="tests/fixtures/mitlibwebsite/fulltext_libguides.html"
    )
    record["url"] = "https://libguides.mit.edu/prf.php?id=12345"
    transformer = MITLibWebsite("mitlibwebsite", iter([record]))

    extracted = transformer.get_fulltext(record)

    # Both LibGuides content regions must be present.
    for expected in ("Staff Directory Header", "Libguides directory content here."):
        assert expected in extracted
    # Page chrome must not leak into the full text.
    for ignored in ("Header content to ignore", "Footer content to ignore"):
        assert ignored not in extracted


def test_mitlibwebsite_get_fulltext_returns_none_if_wordpress_selectors_not_found():
    """get_fulltext returns None for the fixture without the expected content class."""
    record = create_mitlibwebsite_source_record_stub(
        html_filepath="tests/fixtures/mitlibwebsite/fulltext_wordpress_no_content.html"
    )
    transformer = MITLibWebsite("mitlibwebsite", iter([record]))

    extracted = transformer.get_fulltext(record)

    assert extracted is None
90 changes: 30 additions & 60 deletions tests/sources/xml/test_datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,15 +381,13 @@ def test_datacite_with_attribute_and_subfield_variations_transforms_correctly():


def test_get_alternate_titles_success():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<titles>
<title>The Impact of Maternal Literacy and Participation Programs</title>
<title titleType="AlternativeTitle">An Alternative Title</title>
<title titleType="Subtitle">Baseline Data</title>
</titles>
"""
)
""")
assert Datacite.get_alternate_titles(source_record) == [
timdex.AlternateTitle(value="An Alternative Title", kind="AlternativeTitle"),
timdex.AlternateTitle(value="Baseline Data", kind="Subtitle"),
Expand All @@ -409,20 +407,16 @@ def test_get_alternate_titles_transforms_correctly_if_fields_missing():


def test_get_content_type_success():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<resourceType resourceTypeGeneral="Dataset">Survey Data</resourceType>
"""
)
""")
assert Datacite.get_content_type(source_record) == ["Dataset"]


def test_get_content_type_transforms_correctly_if_fields_blank():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<resourceType resourceTypeGeneral=""></resourceType>
"""
)
""")
assert Datacite.get_content_type(source_record) is None


Expand All @@ -432,8 +426,7 @@ def test_get_content_type_transforms_correctly_if_fields_missing():


def test_get_contributors_success():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<creators>
<creator>
<creatorName nameType="Personal">Banerji, Rukmini</creatorName>
Expand Down Expand Up @@ -466,8 +459,7 @@ def test_get_contributors_success():
<affiliation>Pratham and ASER Centre</affiliation>
</contributor>
</contributors>
"""
)
""")
assert Datacite.get_contributors(source_record) == [
timdex.Contributor(
value="Banerji, Rukmini",
Expand Down Expand Up @@ -497,15 +489,13 @@ def test_get_contributors_success():


def test_get_contributors_transforms_correctly_if_fields_blank():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<creators>
<creator />
<contributors>
<contributor />
</contributors>
"""
)
""")
assert Datacite.get_contributors(source_record) is None


Expand All @@ -515,17 +505,15 @@ def test_get_contributors_transforms_correctly_if_fields_missing():


def test_get_dates_success():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<publicationYear>2017</publicationYear>
<dates>
<date dateType="Submitted">2017-02-27</date>
<date dateType="Updated"
dateInformation="This was updated on this date">2019-06-24</date>
<date dateType="Collected">2007-01-01/2007-02-28</date>
</dates>
"""
)
""")
assert Datacite.get_dates(source_record) == [
timdex.Date(kind="Publication date", value="2017"),
timdex.Date(kind="Submitted", value="2017-02-27"),
Expand All @@ -540,14 +528,12 @@ def test_get_dates_success():


def test_get_dates_transforms_correctly_if_fields_blank():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<publicationYear />
<dates>
<date />
</dates>
"""
)
""")
assert Datacite.get_dates(source_record) is None


Expand All @@ -572,8 +558,7 @@ def test_get_edition_transforms_correctly_if_fields_missing():


def test_get_file_formats_success():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<formats>
<format>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</format>
<format>application/pdf</format>
Expand All @@ -587,8 +572,7 @@ def test_get_file_formats_success():
<format>application/pdf</format>
<format>application/pdf</format>
</formats>
"""
)
""")
assert Datacite.get_file_formats(source_record) == [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/pdf",
Expand Down Expand Up @@ -619,8 +603,7 @@ def test_get_format_success():


def test_get_funding_information_success():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<fundingReferences>
<fundingReference>
<funderName>3ie, Nike Foundation</funderName>
Expand All @@ -629,8 +612,7 @@ def test_get_funding_information_success():
<awardNumber awardURI="http://awards.example/7689">OW1/1012 (3ie)</awardNumber>
</fundingReference>
</fundingReferences>
"""
)
""")
assert Datacite.get_funding_information(source_record) == [
timdex.Funder(
funder_name="3ie, Nike Foundation",
Expand All @@ -655,8 +637,7 @@ def test_get_funding_information_transforms_correctly_if_fields_missing():


def test_get_identifiers_success():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<identifier identifierType="DOI">10.7910/DVN/19PPE7</identifier>
<alternateIdentifiers>
<alternateIdentifier alternateIdentifierType="url">https://zenodo.org/record/5524465</alternateIdentifier>
Expand All @@ -673,8 +654,7 @@ def test_get_identifiers_success():
<relatedIdentifier relatedIdentifierType="URL" relationType="IsPartOf">
https://zenodo.org/communities/astronomy-general</relatedIdentifier>
</relatedIdentifiers>
"""
)
""")
assert Datacite.get_identifiers(source_record) == [
timdex.Identifier(value="10.7910/DVN/19PPE7", kind="DOI"),
timdex.Identifier(value="https://zenodo.org/record/5524465", kind="url"),
Expand All @@ -683,17 +663,15 @@ def test_get_identifiers_success():


def test_get_identifiers_transforms_correctly_if_fields_blank():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<identifier />
<alternateIdentifiers>
<alternateIdentifier />
</alternateIdentifiers>
<relatedIdentifiers>
<relatedIdentifier />
</relatedIdentifiers>
"""
)
""")
assert Datacite.get_identifiers(source_record) is None


Expand Down Expand Up @@ -730,15 +708,13 @@ def test_get_links_success(datacite_record_all_fields):


def test_get_locations_success():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<geoLocations>
<geoLocation>
<geoLocationPlace>A point on the globe</geoLocationPlace>
</geoLocation>
</geoLocations>
"""
)
""")
assert Datacite.get_locations(source_record) == [
timdex.Location(value="A point on the globe")
]
Expand All @@ -757,14 +733,12 @@ def test_get_locations_transforms_correctly_if_fields_missing():


def test_get_notes_success():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<resourceType resourceTypeGeneral="Dataset">Survey Data</resourceType>
<descriptions>
<description descriptionType="TechnicalInfo">Stata, 13</description>
</descriptions>
"""
)
""")
assert Datacite.get_notes(source_record) == [
timdex.Note(value=["Survey Data"], kind="Datacite resource type"),
timdex.Note(value=["Stata, 13"], kind="TechnicalInfo"),
Expand Down Expand Up @@ -840,15 +814,13 @@ def test_get_related_items_transforms_correctly_if_fields_missing():


def test_get_rights_success():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<rightsList>
<rights rightsURI="info:eu-repo/semantics/openAccess" />
<rights
rightsURI="http://creativecommons.org/publicdomain/zero/1.0">CC0 1.0</rights>
</rightsList>
"""
)
""")
assert Datacite.get_rights(source_record) == [
timdex.Rights(
description=None, kind=None, uri="info:eu-repo/semantics/openAccess"
Expand All @@ -874,17 +846,15 @@ def test_get_rights_transforms_correctly_if_fields_missing():


def test_get_subjects_success():
source_record = create_datacite_source_record_stub(
"""
source_record = create_datacite_source_record_stub("""
<subjects>
<subject>Social Sciences</subject>
<subject>Educational materials</subject>
<subject subjectScheme="LCSH"
>Adult education, education inputs, field experiments</subject>
<subject subjectScheme="LCSH">Education</subject>
</subjects>
"""
)
""")
assert Datacite.get_subjects(source_record) == [
timdex.Subject(
value=["Social Sciences", "Educational materials"],
Expand Down
Loading