From c3283e93c5555b51dc96931cc6c58c8e2f9e432b Mon Sep 17 00:00:00 2001 From: Erik Quaeghebeur Date: Mon, 15 Jun 2020 23:16:26 +0200 Subject: [PATCH 1/9] Expand use of dict get method to simplify code In the code, there was often both a check for whether a certain key was in a dict and a conditional access of the corresponding value. The two can be elegantly combined in a single call to the dict get method, whose default value if the key is not in the dict is None. Using the get method instead of the check and conditional access makes it possible to simplify the code in various places. This patch does so. --- html2text/__init__.py | 73 +++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 41 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 07a3dab..a7c6b95 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -201,24 +201,16 @@ def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]: self.a list. If the set of attributes is not found, returns None :rtype: int """ - if "href" not in attrs: + attrs_href = attrs.get("href") + if attrs_href is None: return None - match = False + attrs_title = attrs.get("title") for i, a in enumerate(self.a): - if "href" in a.attrs and a.attrs["href"] == attrs["href"]: - if "title" in a.attrs or "title" in attrs: - if ( - "title" in a.attrs - and "title" in attrs - and a.attrs["title"] == attrs["title"] - ): - match = True - else: - match = True - - if match: + if ((attrs_href == a.attrs.get("href")) and + (attrs_title == a.attrs.get("title"))): return i + return None def handle_emphasis( @@ -435,10 +427,8 @@ def no_preceding_space(self: HTML2Text) -> bool: if tag == "abbr": if start: - self.abbr_title = None + self.abbr_title = attrs.get("title") self.abbr_data = "" - if "title" in attrs: - self.abbr_title = attrs["title"] else: if self.abbr_title is not None: assert self.abbr_data is not None @@ -460,18 +450,18 @@ def link_url(self: HTML2Text, 
link: str, title: str = "") -> None: if tag == "a" and not self.ignore_links: if start: + attrs_href = attrs.get("href") if ( - "href" in attrs - and attrs["href"] is not None - and not (self.skip_internal_links and attrs["href"].startswith("#")) + attrs_href is None + or (self.skip_internal_links and attrs_href.startswith("#")) ): + self.astack.append(None) + else: self.astack.append(attrs) - self.maybe_automatic_link = attrs["href"] + self.maybe_automatic_link = attrs_href self.empty_link = True if self.protect_links: - attrs["href"] = "<" + attrs["href"] + ">" - else: - self.astack.append(None) + attrs["href"] = "<" + attrs_href + ">" else: if self.astack: a = self.astack.pop() @@ -484,7 +474,7 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: self.empty_link = False self.maybe_automatic_link = None if self.inline_links: - title = a.get("title") or "" + title = a.get("title", "") title = escape_md(title) link_url(self, a["href"], title) else: @@ -498,24 +488,25 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: self.o("][" + str(a_props.count) + "]") if tag == "img" and start and not self.ignore_images: - if "src" in attrs: - assert attrs["src"] is not None + img_attrs_src = attrs.get("src") + if img_attrs_src is not None: if not self.images_to_alt: - attrs["href"] = attrs["src"] - alt = attrs.get("alt") or self.default_image_alt + attrs["href"] = img_attrs_src + alt = attrs.get("alt", self.default_image_alt) # If we have images_with_size, write raw html including width, # height, and alt attributes + img_attrs_width = attrs.get("width") + img_attrs_height = attrs.get("height") if self.images_as_html or ( - self.images_with_size and ("width" in attrs or "height" in attrs) + self.images_with_size and + not (img_attrs_width is img_attrs_height is None) ): - self.o("") @@ -544,7 +535,7 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: else: self.o("![" + escape_md(alt) + "]") if self.inline_links: - href = 
attrs.get("href") or "" + href = attrs.get("href", "") self.o( "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")" ) @@ -777,9 +768,9 @@ def o( + "]: " + urlparse.urljoin(self.baseurl, link.attrs["href"]) ) - if "title" in link.attrs: - assert link.attrs["title"] is not None - self.out(" (" + link.attrs["title"] + ")") + link_attrs_title = link.attrs.get("title") + if link_attrs_title is not None: + self.out(" (" + link_attrs_title + ")") self.out("\n") else: newa.append(link) From 83c769df00ee97b2ee404e82065eeaa33f01183f Mon Sep 17 00:00:00 2001 From: Erik Quaeghebeur Date: Tue, 16 Jun 2020 13:55:56 +0200 Subject: [PATCH 2/9] fix a few style issues --- html2text/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index a7c6b95..84dc91e 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -201,14 +201,16 @@ def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]: self.a list. 
If the set of attributes is not found, returns None :rtype: int """ - attrs_href = attrs.get("href") + attrs_href = attrs.get("href") if attrs_href is None: return None attrs_title = attrs.get("title") for i, a in enumerate(self.a): - if ((attrs_href == a.attrs.get("href")) and - (attrs_title == a.attrs.get("title"))): + if ( + (attrs_href == a.attrs.get("href")) + and (attrs_title == a.attrs.get("title")) + ): return i return None @@ -499,8 +501,8 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: img_attrs_width = attrs.get("width") img_attrs_height = attrs.get("height") if self.images_as_html or ( - self.images_with_size and - not (img_attrs_width is img_attrs_height is None) + self.images_with_size + and not (img_attrs_width is img_attrs_height is None) ): self.o(" Date: Fri, 11 Sep 2020 08:36:00 +0200 Subject: [PATCH 3/9] try to appease black --- html2text/__init__.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 84dc91e..f573758 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -207,9 +207,8 @@ def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]: attrs_title = attrs.get("title") for i, a in enumerate(self.a): - if ( - (attrs_href == a.attrs.get("href")) - and (attrs_title == a.attrs.get("title")) + if (attrs_href == a.attrs.get("href")) and ( + attrs_title == a.attrs.get("title") ): return i @@ -453,9 +452,8 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: if tag == "a" and not self.ignore_links: if start: attrs_href = attrs.get("href") - if ( - attrs_href is None - or (self.skip_internal_links and attrs_href.startswith("#")) + if attrs_href is None or ( + self.skip_internal_links and attrs_href.startswith("#") ): self.astack.append(None) else: From b3e33193d4fec51b687734f6cdf5a8da977078da Mon Sep 17 00:00:00 2001 From: Erik Quaeghebeur Date: Fri, 11 Sep 2020 09:02:33 +0200 Subject: [PATCH 4/9] try to 
appease MyPy --- html2text/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/html2text/__init__.py b/html2text/__init__.py index f573758..81df6fe 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -475,6 +475,7 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: self.maybe_automatic_link = None if self.inline_links: title = a.get("title", "") + assert title is not None title = escape_md(title) link_url(self, a["href"], title) else: @@ -493,6 +494,7 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: if not self.images_to_alt: attrs["href"] = img_attrs_src alt = attrs.get("alt", self.default_image_alt) + assert alt is not None # If we have images_with_size, write raw html including width, # height, and alt attributes From d23d2598438952548e4a0106a8af5c8f30f4a90f Mon Sep 17 00:00:00 2001 From: Erik Quaeghebeur Date: Fri, 11 Sep 2020 13:50:20 +0200 Subject: [PATCH 5/9] deal properly with Falsy values --- html2text/__init__.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 81df6fe..1339d2e 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -474,9 +474,8 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: self.empty_link = False self.maybe_automatic_link = None if self.inline_links: - title = a.get("title", "") - assert title is not None - title = escape_md(title) + title = a.get("title") + title = "" if title is None else escape_md(title) link_url(self, a["href"], title) else: i = self.previousIndex(a) @@ -494,35 +493,33 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: if not self.images_to_alt: attrs["href"] = img_attrs_src alt = attrs.get("alt", self.default_image_alt) - assert alt is not None # If we have images_with_size, write raw html including width, # height, and alt attributes img_attrs_width = attrs.get("width") img_attrs_height = 
attrs.get("height") - if self.images_as_html or ( - self.images_with_size - and not (img_attrs_width is img_attrs_height is None) - ): + if self.images_as_html or self.images_with_size: self.o("") return + alt = "" if alt is None else escape_md(alt) + # If we have a link to create, output the start if self.maybe_automatic_link is not None: href = self.maybe_automatic_link if ( self.images_to_alt - and escape_md(alt) == href + and alt == href and self.absolute_url_matcher.match(href) ): - self.o("<" + escape_md(alt) + ">") + self.o("<" + alt + ">") self.empty_link = False return else: @@ -533,9 +530,9 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: # If we have images_to_alt, we discard the image itself, # considering only the alt text. if self.images_to_alt: - self.o(escape_md(alt)) + self.o(alt) else: - self.o("![" + escape_md(alt) + "]") + self.o("![" + alt + "]") if self.inline_links: href = attrs.get("href", "") self.o( From 846effda67544378e79f20f0d55412c9f674e6e9 Mon Sep 17 00:00:00 2001 From: Erik Quaeghebeur Date: Fri, 11 Sep 2020 14:16:05 +0200 Subject: [PATCH 6/9] deal properly with Falsy values (second try) --- html2text/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 1339d2e..b9e24d7 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -498,7 +498,10 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: # height, and alt attributes img_attrs_width = attrs.get("width") img_attrs_height = attrs.get("height") - if self.images_as_html or self.images_with_size: + if self.images_as_html or ( + self.images_with_size + and not (not img_attrs_width and not img_attrs_height) + ): self.o(" Date: Fri, 11 Sep 2020 14:46:01 +0200 Subject: [PATCH 7/9] deal properly with Falsy values (third try) --- html2text/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py 
index b9e24d7..29f7161 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -537,7 +537,8 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: else: self.o("![" + alt + "]") if self.inline_links: - href = attrs.get("href", "") + href = attrs.get("href") + href = "" if href is None else href self.o( "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")" ) From 1a4c007d5dccc610d91572eb6c8aeba1184f493e Mon Sep 17 00:00:00 2001 From: Erik Quaeghebeur Date: Mon, 19 Oct 2020 16:23:01 +0200 Subject: [PATCH 8/9] make sure href is always a str --- html2text/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 29f7161..b9e24d7 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -537,8 +537,7 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: else: self.o("![" + alt + "]") if self.inline_links: - href = attrs.get("href") - href = "" if href is None else href + href = attrs.get("href", "") self.o( "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")" ) From bafb96e3e8daf92796e72ba624097815e16fe2cc Mon Sep 17 00:00:00 2001 From: Erik Quaeghebeur Date: Mon, 19 Oct 2020 17:26:49 +0200 Subject: [PATCH 9/9] convince mypy that href is always a str --- html2text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index b9e24d7..79f4311 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -537,7 +537,7 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: else: self.o("![" + alt + "]") if self.inline_links: - href = attrs.get("href", "") + href = attrs.get("href") or "" self.o( "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")" )