diff --git a/html2text/__init__.py b/html2text/__init__.py index 2ee872f..48d4454 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -3,6 +3,7 @@ """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division import re +import cgi try: from textwrap import wrap @@ -160,10 +161,16 @@ def close(self): return outtext def handle_charref(self, c): - self.o(self.charref(c), 1) + charref = self.charref(c) + if not self.code and not self.pre: + charref = cgi.escape(charref) + self.o(charref, 1) def handle_entityref(self, c): - self.o(self.entityref(c), 1) + entityref = self.entityref(c) + if not self.code and not self.pre and entityref != ' _place_holder;': + entityref = cgi.escape(entityref) + self.o(entityref, 1) def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) @@ -351,6 +358,7 @@ def handle_tag(self, tag, attrs, start): if tag in ["code", "tt"] and not self.pre: self.o('`') # TODO: `` `this` `` + self.code = not self.code if tag == "abbr": if start: self.abbr_title = None @@ -416,7 +424,7 @@ def handle_tag(self, tag, attrs, start): else: self.o("[") self.maybe_automatic_link = None - self.empty_link = False + self.empty_link = False # If we have images_to_alt, we discard the image itself, # considering only the alt text. diff --git a/test/html-escaping.html b/test/html-escaping.html new file mode 100644 index 0000000..b6f1da7 --- /dev/null +++ b/test/html-escaping.html @@ -0,0 +1,3 @@ +
Escaped HTML like &lt;div&gt; or &amp; should remain escaped on output
+<pre>...unless that escaped HTML is in a &lt;pre&gt; tag</pre>
+<code>...or a &lt;code&gt; tag</code>
\ No newline at end of file
diff --git a/test/html-escaping.md b/test/html-escaping.md
new file mode 100644
index 0000000..19e91ee
--- /dev/null
+++ b/test/html-escaping.md
@@ -0,0 +1,8 @@
+Escaped HTML like &lt;div&gt; or &amp; should remain escaped on output
+
+
+
+    ...unless that escaped HTML is in a <pre> tag
+
+`...or a <code> tag`
+