From 7b1cd2ede259594eb7767970f28729ba47de530a Mon Sep 17 00:00:00 2001 From: Nathan Stender Date: Wed, 11 Mar 2026 17:26:55 -0400 Subject: [PATCH 1/4] feat: Add compatibility with chardet 6.0.0+ and fix encoding issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update pyproject.toml to require chardet >= 6.0.0 - Improve encoding detection logic to handle chardet 7.x behavior changes: - Add fallback to windows-1252 for very low confidence detections (<0.3) - Better handling of single-byte special characters (en dash, ®, µ) - Add BOM (Byte Order Mark) stripping for UTF-16 and UTF-8 files - Fix test data that contained mojibake from incorrect encoding detection - Corrected "®" to "®" in expected JSON files - Fixed "�" (replacement character) to proper "µ" symbol These changes ensure proper handling of various file encodings including UTF-16 LE (used by SoftMax Pro), Windows-1252, and UTF-8 with BOM. Co-Authored-By: Claude Opus 4.1 --- pyproject.toml | 2 +- src/allotropy/parsers/utils/encoding.py | 35 ++++++++++++++----- .../luminex_intelliflex_example_01.json | 24 ++++++------- .../testdata/luminex_xPONENT_example02.json | 10 +++--- .../luminex_xPONENT_example02_saved.json | 10 +++--- ...minex_xPONENT_missing_optional_fields.json | 26 +++++++------- 6 files changed, 63 insertions(+), 44 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9290a294da..b64c1fa8f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ classifiers = [ ] dependencies = [ "cattrs >= 23.2.0", - "chardet >= 5.2.0, < 6.0.0", + "chardet >= 6.0.0", "defusedxml >= 0.7.1", # NOTE: jsonschema 4.18.0 introduces a serious performance regression, seemingly due to use of new # referencing library. 
diff --git a/src/allotropy/parsers/utils/encoding.py b/src/allotropy/parsers/utils/encoding.py index b1ee571fd9..852200d26e 100644 --- a/src/allotropy/parsers/utils/encoding.py +++ b/src/allotropy/parsers/utils/encoding.py @@ -35,13 +35,26 @@ def determine_encoding( f"Unable to detect text encoding for file with content: {actual_contents!r}" ) raise AllotropeParsingError(msg) - # chardet can report the wrong encoding when there are strange characters in the contents (e.g. emojis) - # To address this, we take the following approach - if the confidence of the detection is < 70%, report - # DEFAULT_ENCODING first, and the detected encoding second. If we return multiple encodings, the caller - # should try all. - if detect_result["confidence"] < 0.7: - return [DEFAULT_ENCODING, detect_result["encoding"]] - return [detect_result["encoding"]] + + detected_encoding = detect_result["encoding"] + confidence = detect_result["confidence"] + + # chardet 7.x has different behavior for Windows-1252 detection + # For very low confidence detections, try Windows-1252 as a fallback + # This handles cases like single byte \x96 (en dash) which chardet 7.x + # may misdetect as iso-8859-16 or other encodings + if confidence < 0.3: + # For very low confidence, try multiple encodings + encodings = [DEFAULT_ENCODING, "windows-1252"] + if detected_encoding and detected_encoding not in encodings: + encodings.append(detected_encoding) + return encodings + elif confidence < 0.7: + # For medium confidence, try UTF-8 first, then the detected encoding + return [DEFAULT_ENCODING, detected_encoding] + else: + # High confidence, use the detected encoding + return [detected_encoding] def decode(contents: IO[bytes] | IO[str], encoding: str | None) -> str: @@ -56,7 +69,13 @@ def decode(contents: IO[bytes] | IO[str], encoding: str | None) -> str: msg = f"Could not determine encoding of contents: {actual_contents!r}" raise AssertionError(msg) try: - return actual_contents.decode(encoding) + decoded = 
actual_contents.decode(encoding) + # Strip BOM (Byte Order Mark) if present + # UTF-16 and UTF-8 files may include BOM character U+FEFF + # which should be removed from the content + if decoded and decoded[0] == '\ufeff': + decoded = decoded[1:] + return decoded except UnicodeDecodeError as e: if encoding != possible_encodings[-1]: continue diff --git a/tests/parsers/luminex_intelliflex/testdata/luminex_intelliflex_example_01.json b/tests/parsers/luminex_intelliflex/testdata/luminex_intelliflex_example_01.json index be95d4d9c8..4bc107a963 100644 --- a/tests/parsers/luminex_intelliflex/testdata/luminex_intelliflex_example_01.json +++ b/tests/parsers/luminex_intelliflex/testdata/luminex_intelliflex_example_01.json @@ -10,7 +10,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -418,7 +418,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -826,7 +826,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -1234,7 +1234,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -1642,7 +1642,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain 
setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2050,7 +2050,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2458,7 +2458,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2866,7 +2866,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -3274,7 +3274,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -3682,7 +3682,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -4098,7 +4098,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -4514,7 +4514,7 @@ "device control document": [ { "device type": "multi analyte profiling 
analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" diff --git a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02.json b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02.json index 89965ddbfa..507ec0ea76 100644 --- a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02.json +++ b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02.json @@ -123,7 +123,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50µl sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -242,7 +242,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50µl sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -361,7 +361,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50µl sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -480,7 +480,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50µl sample 25 count cutoff. 
No plate def" } } "analyst": "nguymaip" @@ -624,7 +624,7 @@ "file name": "luminex_xPONENT_example02.csv", "UNC path": "tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02.csv", "ASM converter name": "allotropy_luminex_xponent", - "ASM converter version": "0.1.103", + "ASM converter version": "0.1.113", "software name": "xPONENT", "software version": "4.3.229.0" }, diff --git a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02_saved.json b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02_saved.json index ee9fe2c561..07335b6184 100644 --- a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02_saved.json +++ b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02_saved.json @@ -123,7 +123,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50µl sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -242,7 +242,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50µl sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -361,7 +361,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50µl sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -480,7 +480,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50µl sample 25 count cutoff. 
No plate def" } }, "analyst": "nguymaip" @@ -624,7 +624,7 @@ "file name": "luminex_xPONENT_example02_saved.csv", "UNC path": "tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02_saved.csv", "ASM converter name": "allotropy_luminex_xponent", - "ASM converter version": "0.1.103", + "ASM converter version": "0.1.113", "software name": "xPONENT", "software version": "4.3.229.0" }, diff --git a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_missing_optional_fields.json b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_missing_optional_fields.json index a5d3794fce..d1bcd53868 100644 --- a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_missing_optional_fields.json +++ b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_missing_optional_fields.json @@ -10,7 +10,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -418,7 +418,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -826,7 +826,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -1234,7 +1234,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -1642,7 +1642,7 @@ "device control document": [ { 
"device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2050,7 +2050,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2458,7 +2458,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2866,7 +2866,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -3274,7 +3274,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -3682,7 +3682,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -4098,7 +4098,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -4514,7 +4514,7 @@ "device 
control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -6853,7 +6853,7 @@ "file name": "luminex_xPONENT_missing_optional_fields.csv", "UNC path": "tests/parsers/luminex_xponent/testdata/luminex_xPONENT_missing_optional_fields.csv", "ASM converter name": "allotropy_luminex_xponent", - "ASM converter version": "0.1.103", + "ASM converter version": "0.1.113", "software name": "xPONENT", "software version": "2.1.1015" }, From 4d7f872a6adb5cc8076341b30335384845e4181b Mon Sep 17 00:00:00 2001 From: Nathan Stender Date: Wed, 11 Mar 2026 17:36:27 -0400 Subject: [PATCH 2/4] chore: Add .bak.py files to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index cb70611590..9a4a128c1a 100644 --- a/.gitignore +++ b/.gitignore @@ -224,3 +224,4 @@ fabric.properties .vscode/launch.json .vscode/settings.json CHROM/ +.bak.py From 5fb336439e8c3723abe8d7d2c1bfdacfbb2196e4 Mon Sep 17 00:00:00 2001 From: Nathan Stender Date: Wed, 11 Mar 2026 17:50:54 -0400 Subject: [PATCH 3/4] fix: Make chardet constraint flexible to support 5.x, 6.x, and 7.x - Changed chardet requirement from >= 6.0.0 to >= 5.2.0 to allow consumers flexibility - Enhanced encoding detection to handle differences between chardet versions: - Always try UTF-8 first when Latin-1 family encodings are detected - This prevents mojibake when chardet 5.x misdetects UTF-8 as ISO-8859-1 - Tests now pass with both chardet 5.2.0 and 7.0.1 This allows consumers to use any chardet version >= 5.2.0 without being forced to upgrade. 
Co-Authored-By: Claude Opus 4.1 --- pyproject.toml | 2 +- src/allotropy/parsers/utils/encoding.py | 33 ++++++++++++++++++------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b64c1fa8f3..75f33ac502 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ classifiers = [ ] dependencies = [ "cattrs >= 23.2.0", - "chardet >= 6.0.0", + "chardet >= 5.2.0", "defusedxml >= 0.7.1", # NOTE: jsonschema 4.18.0 introduces a serious performance regression, seemingly due to use of new # referencing library. diff --git a/src/allotropy/parsers/utils/encoding.py b/src/allotropy/parsers/utils/encoding.py index 852200d26e..2889314cc5 100644 --- a/src/allotropy/parsers/utils/encoding.py +++ b/src/allotropy/parsers/utils/encoding.py @@ -39,21 +39,36 @@ def determine_encoding( detected_encoding = detect_result["encoding"] confidence = detect_result["confidence"] - # chardet 7.x has different behavior for Windows-1252 detection - # For very low confidence detections, try Windows-1252 as a fallback - # This handles cases like single byte \x96 (en dash) which chardet 7.x - # may misdetect as iso-8859-16 or other encodings + # chardet can misdetect UTF-8 multi-byte sequences as ISO-8859-1 or similar + # Latin-1 encodings, especially in older versions (5.x). Always try UTF-8 + # first when Latin-1 family encodings are detected to avoid mojibake. 
+ latin1_encodings = { + "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4", "ISO-8859-5", + "ISO-8859-6", "ISO-8859-7", "ISO-8859-8", "ISO-8859-9", "ISO-8859-10", + "ISO-8859-13", "ISO-8859-14", "ISO-8859-15", "ISO-8859-16", + "cp1250", "cp1251", "cp1252", "cp1253", "cp1254", "cp1255", + "cp1256", "cp1257", "cp1258", "windows-1250", "windows-1251", + "windows-1252", "windows-1253", "windows-1254", "windows-1255", + "windows-1256", "windows-1257", "windows-1258" + } + + # Normalize encoding name for comparison + normalized_encoding = detected_encoding.upper() if detected_encoding else "" + if confidence < 0.3: - # For very low confidence, try multiple encodings + # Very low confidence, try multiple encodings encodings = [DEFAULT_ENCODING, "windows-1252"] if detected_encoding and detected_encoding not in encodings: encodings.append(detected_encoding) return encodings - elif confidence < 0.7: - # For medium confidence, try UTF-8 first, then the detected encoding - return [DEFAULT_ENCODING, detected_encoding] + elif normalized_encoding in latin1_encodings or confidence < 0.7: + # For Latin-1 family or medium confidence, try UTF-8 first + # This handles chardet 5.x incorrectly detecting UTF-8 as ISO-8859-1 + if detected_encoding != DEFAULT_ENCODING: + return [DEFAULT_ENCODING, detected_encoding] + return [DEFAULT_ENCODING] else: - # High confidence, use the detected encoding + # High confidence non-Latin encoding, use as detected return [detected_encoding] From 3657e5b3bf1a2f50e5f1b7f1b614feda386dc2c2 Mon Sep 17 00:00:00 2001 From: Nathan Stender Date: Wed, 11 Mar 2026 18:03:50 -0400 Subject: [PATCH 4/4] fix: Fix linting issues (quotes and type annotation) --- src/allotropy/parsers/utils/encoding.py | 43 +++++++++++++++++++------ 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/src/allotropy/parsers/utils/encoding.py b/src/allotropy/parsers/utils/encoding.py index 2889314cc5..43775f295e 100644 --- a/src/allotropy/parsers/utils/encoding.py +++ 
b/src/allotropy/parsers/utils/encoding.py @@ -43,13 +43,38 @@ def determine_encoding( # Latin-1 encodings, especially in older versions (5.x). Always try UTF-8 # first when Latin-1 family encodings are detected to avoid mojibake. latin1_encodings = { - "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4", "ISO-8859-5", - "ISO-8859-6", "ISO-8859-7", "ISO-8859-8", "ISO-8859-9", "ISO-8859-10", - "ISO-8859-13", "ISO-8859-14", "ISO-8859-15", "ISO-8859-16", - "cp1250", "cp1251", "cp1252", "cp1253", "cp1254", "cp1255", - "cp1256", "cp1257", "cp1258", "windows-1250", "windows-1251", - "windows-1252", "windows-1253", "windows-1254", "windows-1255", - "windows-1256", "windows-1257", "windows-1258" + "ISO-8859-1", + "ISO-8859-2", + "ISO-8859-3", + "ISO-8859-4", + "ISO-8859-5", + "ISO-8859-6", + "ISO-8859-7", + "ISO-8859-8", + "ISO-8859-9", + "ISO-8859-10", + "ISO-8859-13", + "ISO-8859-14", + "ISO-8859-15", + "ISO-8859-16", + "cp1250", + "cp1251", + "cp1252", + "cp1253", + "cp1254", + "cp1255", + "cp1256", + "cp1257", + "cp1258", + "windows-1250", + "windows-1251", + "windows-1252", + "windows-1253", + "windows-1254", + "windows-1255", + "windows-1256", + "windows-1257", + "windows-1258", } # Normalize encoding name for comparison @@ -57,7 +82,7 @@ def determine_encoding( if confidence < 0.3: # Very low confidence, try multiple encodings - encodings = [DEFAULT_ENCODING, "windows-1252"] + encodings: list[str | None] = [DEFAULT_ENCODING, "windows-1252"] if detected_encoding and detected_encoding not in encodings: encodings.append(detected_encoding) return encodings @@ -88,7 +113,7 @@ def decode(contents: IO[bytes] | IO[str], encoding: str | None) -> str: # Strip BOM (Byte Order Mark) if present # UTF-16 and UTF-8 files may include BOM character U+FEFF # which should be removed from the content - if decoded and decoded[0] == '\ufeff': + if decoded and decoded[0] == "\ufeff": decoded = decoded[1:] return decoded except UnicodeDecodeError as e: