MITLibraries · ghukill · Jan 30, 2026 · Jan 29, 2026 · Jan 29, 2026 · Jan 29, 2026
@@ -0,0 +1,20 @@
+# CODEOWNERS file (from GitHub template at
+# https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners)
+# Each line is a file pattern followed by one or more owners.
+
+##############################################################################
+# These owners will be the default owners for everything in the repo. Unless a
+# later match takes precedence, @ghukill will be requested for review
+# when someone opens a pull request. This is commented out in favor of using a
+# team as the default (see below). It is left here as a comment to indicate
+# the primary expert for this code.
+# * @ghukill
+
+# Teams can be specified as code owners as well. Teams should be identified in
+# the format @org/team-name. Teams must have explicit write access to the
+# repository.
+* @mitlibraries/dataeng
+
+# We set the senior engineer in the team as the owner of the CODEOWNERS file as
+# a layer of protection against unauthorized changes.
+/.github/CODEOWNERS @ghukill
@@ -0,0 +1,17 @@
+<html>
+<head>
+    <meta property="og:description" content="Test description">
+</head>
+<body>
+<header>
+    <h1>Header content to ignore</h1>
+</header>
+<div class="s-lib-header">Staff Directory Header</div>
+<div class="s-lib-main">
+    <p>Libguides directory content here.</p>
+</div>
+<footer>
+    <h1>Footer content to ignore</h1>
+</footer>
+</body>
+</html>
@@ -0,0 +1,16 @@
+<html>
+<head>
+    <meta property="og:description" content="Test description">
+</head>
+<body>
+<header>
+    <h1>Header content to ignore</h1>
+</header>
+<div class="content-main">
+    <p>WordPress main content here.</p>
+</div>
+<footer>
+    <h1>Footer content to ignore</h1>
+</footer>
+</body>
+</html>
@@ -0,0 +1,16 @@
+<html>
+<head>
+    <meta property="og:description" content="Test description">
+</head>
+<body>
+<header>
+    <h1>Header content</h1>
+</header>
+<div class="some-other-class">
+    <p>Content without expected class.</p>
+</div>
+<footer>
+    <h1>Footer content</h1>
+</footer>
+</body>
+</html>
@@ -153,3 +153,35 @@ def test_mitlibwebsite_record_is_deleted_returns_false_when_status_is_not_delete
 def test_mitlibwebsite_record_is_deleted_returns_false_when_status_field_is_missing():
+    """A stub record built with default arguments is not reported as deleted."""
     source_record = create_mitlibwebsite_source_record_stub()
     assert MITLibWebsite.record_is_deleted(source_record) is False
+
+
+def test_mitlibwebsite_get_fulltext_extracts_wordpress_content_main():
+    """Fulltext contains the main content text but no header/footer text."""
+    source_record = create_mitlibwebsite_source_record_stub(
+        html_filepath="tests/fixtures/mitlibwebsite/fulltext_wordpress.html"
+    )
+    mitlibwebsite = MITLibWebsite("mitlibwebsite", iter([source_record]))
+    fulltext = mitlibwebsite.get_fulltext(source_record)
+    # The expected paragraph must survive extraction.
+    assert "WordPress main content here." in fulltext
+    # Header and footer headings must be excluded from the result.
+    assert "Header content to ignore" not in fulltext
+    assert "Footer content to ignore" not in fulltext
+
+
+def test_mitlibwebsite_get_fulltext_extracts_libguides_directory():
+    """Fulltext contains both libguides sections but no header/footer text."""
+    source_record = create_mitlibwebsite_source_record_stub(
+        html_filepath="tests/fixtures/mitlibwebsite/fulltext_libguides.html"
+    )
+    # NOTE(review): presumably this URL is what makes get_fulltext treat the
+    # record as a libguides page rather than a WordPress page -- confirm.
+    source_record["url"] = "https://libguides.mit.edu/prf.php?id=12345"
+    mitlibwebsite = MITLibWebsite("mitlibwebsite", iter([source_record]))
+    fulltext = mitlibwebsite.get_fulltext(source_record)
+    # Both the directory header text and the main body text must be present.
+    assert "Staff Directory Header" in fulltext
+    assert "Libguides directory content here." in fulltext
+    # Header and footer headings must be excluded from the result.
+    assert "Header content to ignore" not in fulltext
+    assert "Footer content to ignore" not in fulltext
+
+
+def test_mitlibwebsite_get_fulltext_returns_none_if_wordpress_selectors_not_found():
+    """get_fulltext returns None for the "no content" WordPress fixture."""
+    source_record = create_mitlibwebsite_source_record_stub(
+        html_filepath="tests/fixtures/mitlibwebsite/fulltext_wordpress_no_content.html"
+    )
+    mitlibwebsite = MITLibWebsite("mitlibwebsite", iter([source_record]))
+    assert mitlibwebsite.get_fulltext(source_record) is None
@@ -381,15 +381,13 @@ def test_datacite_with_attribute_and_subfield_variations_transforms_correctly():
 
 
 def test_get_alternate_titles_success():
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <titles>
          <title>The Impact of Maternal Literacy and Participation Programs</title>
          <title titleType="AlternativeTitle">An Alternative Title</title>
          <title titleType="Subtitle">Baseline Data</title>
         </titles>
-        """
-    )
+        """)
     assert Datacite.get_alternate_titles(source_record) == [
         timdex.AlternateTitle(value="An Alternative Title", kind="AlternativeTitle"),
         timdex.AlternateTitle(value="Baseline Data", kind="Subtitle"),
@@ -409,20 +407,16 @@ def test_get_alternate_titles_transforms_correctly_if_fields_missing():
 
 
 def test_get_content_type_success():
+    """The resourceTypeGeneral attribute value is returned as a one-item list."""
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <resourceType resourceTypeGeneral="Dataset">Survey Data</resourceType>
-        """
-    )
+        """)
     assert Datacite.get_content_type(source_record) == ["Dataset"]
 
 
 def test_get_content_type_transforms_correctly_if_fields_blank():
+    """An empty resourceTypeGeneral attribute on an empty element yields None."""
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <resourceType resourceTypeGeneral=""></resourceType>
-        """
-    )
+        """)
     assert Datacite.get_content_type(source_record) is None
 
 
@@ -432,8 +426,7 @@ def test_get_content_type_transforms_correctly_if_fields_missing():
 
 
 def test_get_contributors_success():
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <creators>
          <creator>
           <creatorName nameType="Personal">Banerji, Rukmini</creatorName>
@@ -466,8 +459,7 @@ def test_get_contributors_success():
           <affiliation>Pratham and ASER Centre</affiliation>
          </contributor>
         </contributors>
-        """
-    )
+        """)
     assert Datacite.get_contributors(source_record) == [
         timdex.Contributor(
             value="Banerji, Rukmini",
@@ -497,15 +489,13 @@ def test_get_contributors_success():
 
 
 def test_get_contributors_transforms_correctly_if_fields_blank():
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <creators>
          <creator />
         <contributors>
          <contributor />
         </contributors>
-        """
-    )
+        """)
     assert Datacite.get_contributors(source_record) is None
 
 
@@ -515,17 +505,15 @@ def test_get_contributors_transforms_correctly_if_fields_missing():
 
 
 def test_get_dates_success():
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <publicationYear>2017</publicationYear>
         <dates>
          <date dateType="Submitted">2017-02-27</date>
          <date dateType="Updated"
          dateInformation="This was updated on this date">2019-06-24</date>
          <date dateType="Collected">2007-01-01/2007-02-28</date>
         </dates>
-        """
-    )
+        """)
     assert Datacite.get_dates(source_record) == [
         timdex.Date(kind="Publication date", value="2017"),
         timdex.Date(kind="Submitted", value="2017-02-27"),
@@ -540,14 +528,12 @@ def test_get_dates_success():
 
 
 def test_get_dates_transforms_correctly_if_fields_blank():
+    """Empty <publicationYear> and <date> elements yield None."""
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <publicationYear />
         <dates>
          <date />
         </dates>
-        """
-    )
+        """)
     assert Datacite.get_dates(source_record) is None
 
 
@@ -572,8 +558,7 @@ def test_get_edition_transforms_correctly_if_fields_missing():
 
 
 def test_get_file_formats_success():
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <formats>
          <format>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</format>
          <format>application/pdf</format>
@@ -587,8 +572,7 @@ def test_get_file_formats_success():
          <format>application/pdf</format>
          <format>application/pdf</format>
         </formats>
-        """
-    )
+        """)
     assert Datacite.get_file_formats(source_record) == [
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
         "application/pdf",
@@ -619,8 +603,7 @@ def test_get_format_success():
 
 
 def test_get_funding_information_success():
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <fundingReferences>
          <fundingReference>
           <funderName>3ie, Nike Foundation</funderName>
@@ -629,8 +612,7 @@ def test_get_funding_information_success():
           <awardNumber awardURI="http://awards.example/7689">OW1/1012 (3ie)</awardNumber>
          </fundingReference>
         </fundingReferences>
-        """
-    )
+        """)
     assert Datacite.get_funding_information(source_record) == [
         timdex.Funder(
             funder_name="3ie, Nike Foundation",
@@ -655,8 +637,7 @@ def test_get_funding_information_transforms_correctly_if_fields_missing():
 
 
 def test_get_identifiers_success():
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <identifier identifierType="DOI">10.7910/DVN/19PPE7</identifier>
         <alternateIdentifiers>
          <alternateIdentifier alternateIdentifierType="url">https://zenodo.org/record/5524465</alternateIdentifier>
@@ -673,8 +654,7 @@ def test_get_identifiers_success():
          <relatedIdentifier relatedIdentifierType="URL" relationType="IsPartOf">
          https://zenodo.org/communities/astronomy-general</relatedIdentifier>
         </relatedIdentifiers>
-        """
-    )
+        """)
     assert Datacite.get_identifiers(source_record) == [
         timdex.Identifier(value="10.7910/DVN/19PPE7", kind="DOI"),
         timdex.Identifier(value="https://zenodo.org/record/5524465", kind="url"),
@@ -683,17 +663,15 @@ def test_get_identifiers_success():
 
 
 def test_get_identifiers_transforms_correctly_if_fields_blank():
+    """Empty identifier, alternate and related identifier elements yield None."""
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <identifier />
         <alternateIdentifiers>
          <alternateIdentifier />
         </alternateIdentifiers>
         <relatedIdentifiers>
          <relatedIdentifier />
         </relatedIdentifiers>
-        """
-    )
+        """)
     assert Datacite.get_identifiers(source_record) is None
 
 
@@ -730,15 +708,13 @@ def test_get_links_success(datacite_record_all_fields):
 
 
 def test_get_locations_success():
+    """The <geoLocationPlace> text is returned as a single Location value."""
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <geoLocations>
          <geoLocation>
           <geoLocationPlace>A point on the globe</geoLocationPlace>
          </geoLocation>
         </geoLocations>
-        """
-    )
+        """)
     assert Datacite.get_locations(source_record) == [
         timdex.Location(value="A point on the globe")
     ]
@@ -757,14 +733,12 @@ def test_get_locations_transforms_correctly_if_fields_missing():
 
 
 def test_get_notes_success():
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <resourceType resourceTypeGeneral="Dataset">Survey Data</resourceType>
         <descriptions>
          <description descriptionType="TechnicalInfo">Stata, 13</description>
         </descriptions>
-        """
-    )
+        """)
     assert Datacite.get_notes(source_record) == [
         timdex.Note(value=["Survey Data"], kind="Datacite resource type"),
         timdex.Note(value=["Stata, 13"], kind="TechnicalInfo"),
@@ -840,15 +814,13 @@ def test_get_related_items_transforms_correctly_if_fields_missing():
 
 
 def test_get_rights_success():
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <rightsList>
           <rights rightsURI="info:eu-repo/semantics/openAccess" />
           <rights
           rightsURI="http://creativecommons.org/publicdomain/zero/1.0">CC0 1.0</rights>
         </rightsList>
-        """
-    )
+        """)
     assert Datacite.get_rights(source_record) == [
         timdex.Rights(
             description=None, kind=None, uri="info:eu-repo/semantics/openAccess"
@@ -874,17 +846,15 @@ def test_get_rights_transforms_correctly_if_fields_missing():
 
 
 def test_get_subjects_success():
-    source_record = create_datacite_source_record_stub(
-        """
+    source_record = create_datacite_source_record_stub("""
         <subjects>
          <subject>Social Sciences</subject>
          <subject>Educational materials</subject>
          <subject subjectScheme="LCSH"
          >Adult education, education inputs, field experiments</subject>
          <subject subjectScheme="LCSH">Education</subject>
         </subjects>
-        """
-    )
+        """)
     assert Datacite.get_subjects(source_record) == [
         timdex.Subject(
             value=["Social Sciences", "Educational materials"],