Skip to content

Commit 7b3c37d

Browse files
committed
🩹 (support/html) Get better metadata
1 parent 2f1ae31 commit 7b3c37d

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

ingestors/support/html.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,12 @@
1-
import re
21
import logging
2+
import re
3+
34
from lxml import html
45
from lxml.etree import ParseError, ParserError
56
from normality import collapse_spaces
67

7-
from ingestors.support.timestamp import TimestampSupport
88
from ingestors.exc import ProcessingException
9+
from ingestors.support.timestamp import TimestampSupport
910

1011
log = logging.getLogger(__name__)
1112

@@ -33,9 +34,9 @@ def extract_html_header(self, entity, doc):
3334
entity.add("summary", self.get_meta(doc, "description"))
3435
entity.add("author", self.get_meta(doc, "author"))
3536
entity.add("author", self.get_meta(doc, "og:site_name"))
36-
published_at = self.get_meta(doc, "artcile:published_time")
37+
published_at = self.get_meta(doc, "article:published_time")
3738
entity.add("publishedAt", self.parse_timestamp(published_at))
38-
modified_at = self.get_meta(doc, "artcile:modified_time")
39+
modified_at = self.get_meta(doc, "article:modified_time")
3940
entity.add("modifiedAt", self.parse_timestamp(modified_at))
4041

4142
for field in ["keywords", "news_keywords"]:
@@ -47,7 +48,7 @@ def extract_html_header(self, entity, doc):
4748
def extract_html_text(self, doc):
4849
"""Get all text from a DOM, also used by the XML parser."""
4950
text = " ".join(self.extract_html_elements(doc))
50-
text = collapse_spaces(text)
51+
text = collapse_spaces(text) or ""
5152
if len(text):
5253
return text
5354

0 commit comments

Comments
 (0)