1- import re
21import logging
2+ import re
3+
34from lxml import html
45from lxml .etree import ParseError , ParserError
56from normality import collapse_spaces
67
7- from ingestors .support .timestamp import TimestampSupport
88from ingestors .exc import ProcessingException
9+ from ingestors .support .timestamp import TimestampSupport
910
1011log = logging .getLogger (__name__ )
1112
@@ -33,9 +34,9 @@ def extract_html_header(self, entity, doc):
3334 entity .add ("summary" , self .get_meta (doc , "description" ))
3435 entity .add ("author" , self .get_meta (doc , "author" ))
3536 entity .add ("author" , self .get_meta (doc , "og:site_name" ))
36- published_at = self .get_meta (doc , "artcile :published_time" )
37+ published_at = self .get_meta (doc , "article :published_time" )
3738 entity .add ("publishedAt" , self .parse_timestamp (published_at ))
38- modified_at = self .get_meta (doc , "artcile :modified_time" )
39+ modified_at = self .get_meta (doc , "article :modified_time" )
3940 entity .add ("modifiedAt" , self .parse_timestamp (modified_at ))
4041
4142 for field in ["keywords" , "news_keywords" ]:
@@ -47,7 +48,7 @@ def extract_html_header(self, entity, doc):
4748 def extract_html_text (self , doc ):
4849 """Get all text from a DOM, also used by the XML parser."""
4950 text = " " .join (self .extract_html_elements (doc ))
50- text = collapse_spaces (text )
51+ text = collapse_spaces (text ) or ""
5152 if len (text ):
5253 return text
5354
0 commit comments