From 497c580b9fa8e6544fc2a799238d685e9b9938ba Mon Sep 17 00:00:00 2001 From: thomasallen Date: Thu, 16 Oct 2025 11:33:24 -0700 Subject: [PATCH 1/3] issue_21_encoding --- adsrefpipe/refparsers/unicode.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/adsrefpipe/refparsers/unicode.py b/adsrefpipe/refparsers/unicode.py index d5ca792..e4efd8a 100755 --- a/adsrefpipe/refparsers/unicode.py +++ b/adsrefpipe/refparsers/unicode.py @@ -182,8 +182,11 @@ def ent2asc(self, text: str) -> str: :return: text with entities replaced by ASCII equivalents """ text = self.re_replace_amp.sub('&', text) + # import pdb;pdb.set_trace() result = self.re_entity.sub(self.__sub_asc_entity, text) + # import pdb;pdb.set_trace() result = self.re_numentity.sub(self.__sub_numasc_entity, result) + # import pdb;pdb.set_trace() result = self.re_hexnumentity.sub(self.__sub_hexnumasc_entity, result) return result @@ -244,7 +247,8 @@ def __sub_hexnumasc_entity(self, match: re.Match) -> str: elif entno < 255: return self.u2asc(chr(entno)) except IndexError: - raise UnicodeHandlerError('Unknown hexadecimal entity: %s' % match.group(0)) + logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s' % match.group(0))) + return "" def __sub_hexnum_toent(self, match: re.Match) -> str: """ From 1d26e357a78ea48f92cb31e25df6feb556c1956a Mon Sep 17 00:00:00 2001 From: thomasallen Date: Thu, 16 Oct 2025 11:35:35 -0700 Subject: [PATCH 2/3] cleanup comments --- adsrefpipe/refparsers/unicode.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/adsrefpipe/refparsers/unicode.py b/adsrefpipe/refparsers/unicode.py index e4efd8a..39453d2 100755 --- a/adsrefpipe/refparsers/unicode.py +++ b/adsrefpipe/refparsers/unicode.py @@ -182,11 +182,8 @@ def ent2asc(self, text: str) -> str: :return: text with entities replaced by ASCII equivalents """ text = self.re_replace_amp.sub('&', text) - # import pdb;pdb.set_trace() result = self.re_entity.sub(self.__sub_asc_entity, text) - # import 
pdb;pdb.set_trace() result = self.re_numentity.sub(self.__sub_numasc_entity, result) - # import pdb;pdb.set_trace() result = self.re_hexnumentity.sub(self.__sub_hexnumasc_entity, result) return result From 03711e880b000ea9a30c07cf12c3e7c13557d50c Mon Sep 17 00:00:00 2001 From: thomasallen Date: Thu, 16 Oct 2025 13:10:41 -0700 Subject: [PATCH 3/3] update unit test --- adsrefpipe/tests/unittests/test_ref_parsers.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/adsrefpipe/tests/unittests/test_ref_parsers.py b/adsrefpipe/tests/unittests/test_ref_parsers.py index 539d7b8..d0a3434 100755 --- a/adsrefpipe/tests/unittests/test_ref_parsers.py +++ b/adsrefpipe/tests/unittests/test_ref_parsers.py @@ -304,14 +304,11 @@ def test_sub_hexnumasc_entity(self): handler.unicode = MagicMock() handler.unicode.__getitem__.side_effect = IndexError - # large invalid hex value to trigger the IndexError exception + # large invalid hex value to trigger returning an empty string "" match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "򙦙") if match: - # check that the correct exception is raised - with self.assertRaises(UnicodeHandlerError) as context: - handler._UnicodeHandler__sub_hexnumasc_entity(match) - # ensure the exception message is correct - self.assertEqual(str(context.exception), "Unknown hexadecimal entity: 򙦙") + result = handler._UnicodeHandler__sub_hexnumasc_entity(match) + self.assertEqual(result, "") def test_sub_hexnum_toent(self): """ test __sub_hexnum_toent method """