diff --git a/.gitignore b/.gitignore index cb7061159..9a4a128c1 100644 --- a/.gitignore +++ b/.gitignore @@ -224,3 +224,4 @@ fabric.properties .vscode/launch.json .vscode/settings.json CHROM/ +.bak.py diff --git a/pyproject.toml b/pyproject.toml index 9290a294d..75f33ac50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ classifiers = [ ] dependencies = [ "cattrs >= 23.2.0", - "chardet >= 5.2.0, < 6.0.0", + "chardet >= 5.2.0", "defusedxml >= 0.7.1", # NOTE: jsonschema 4.18.0 introduces a serious performance regression, seemingly due to use of new # referencing library. diff --git a/src/allotropy/parsers/utils/encoding.py b/src/allotropy/parsers/utils/encoding.py index b1ee571fd..43775f295 100644 --- a/src/allotropy/parsers/utils/encoding.py +++ b/src/allotropy/parsers/utils/encoding.py @@ -35,13 +35,66 @@ def determine_encoding( f"Unable to detect text encoding for file with content: {actual_contents!r}" ) raise AllotropeParsingError(msg) - # chardet can report the wrong encoding when there are strange characters in the contents (e.g. emojis) - # To address this, we take the following approach - if the confidence of the detection is < 70%, report - # DEFAULT_ENCODING first, and the detected encoding second. If we return multiple encodings, the caller - # should try all. - if detect_result["confidence"] < 0.7: - return [DEFAULT_ENCODING, detect_result["encoding"]] - return [detect_result["encoding"]] + + detected_encoding = detect_result["encoding"] + confidence = detect_result["confidence"] + + # chardet can misdetect UTF-8 multi-byte sequences as ISO-8859-1 or similar + # Latin-1 encodings, especially in older versions (5.x). Always try UTF-8 + # first when Latin-1 family encodings are detected to avoid mojibake. 
+ latin1_encodings = { + "ISO-8859-1", + "ISO-8859-2", + "ISO-8859-3", + "ISO-8859-4", + "ISO-8859-5", + "ISO-8859-6", + "ISO-8859-7", + "ISO-8859-8", + "ISO-8859-9", + "ISO-8859-10", + "ISO-8859-13", + "ISO-8859-14", + "ISO-8859-15", + "ISO-8859-16", + "CP1250", + "CP1251", + "CP1252", + "CP1253", + "CP1254", + "CP1255", + "CP1256", + "CP1257", + "CP1258", + "WINDOWS-1250", + "WINDOWS-1251", + "WINDOWS-1252", + "WINDOWS-1253", + "WINDOWS-1254", + "WINDOWS-1255", + "WINDOWS-1256", + "WINDOWS-1257", + "WINDOWS-1258", + } + + # Upper-case the detected name; every member of latin1_encodings is upper case + normalized_encoding = detected_encoding.upper() if detected_encoding else "" + + if confidence < 0.3: + # Very low confidence, try multiple encodings + encodings: list[str | None] = [DEFAULT_ENCODING, "windows-1252"] + if detected_encoding and detected_encoding not in encodings: + encodings.append(detected_encoding) + return encodings + elif normalized_encoding in latin1_encodings or confidence < 0.7: + # For Latin-1 family or medium confidence, try UTF-8 first + # This handles chardet 5.x incorrectly detecting UTF-8 as ISO-8859-1 + if detected_encoding != DEFAULT_ENCODING: + return [DEFAULT_ENCODING, detected_encoding] + return [DEFAULT_ENCODING] + else: + # High confidence non-Latin encoding, use as detected + return [detected_encoding] def decode(contents: IO[bytes] | IO[str], encoding: str | None) -> str: @@ -56,7 +109,13 @@ def decode(contents: IO[bytes] | IO[str], encoding: str | None) -> str: msg = f"Could not determine encoding of contents: {actual_contents!r}" raise AssertionError(msg) try: - return actual_contents.decode(encoding) + decoded = actual_contents.decode(encoding) + # Strip BOM (Byte Order Mark) if present + # UTF-16 and UTF-8 files may include BOM character U+FEFF + # which should be removed from the content + if decoded and decoded[0] == "\ufeff": + decoded = decoded[1:] + return decoded except UnicodeDecodeError as e: if encoding != possible_encodings[-1]: continue diff --git 
a/tests/parsers/luminex_intelliflex/testdata/luminex_intelliflex_example_01.json b/tests/parsers/luminex_intelliflex/testdata/luminex_intelliflex_example_01.json index be95d4d9c..4bc107a96 100644 --- a/tests/parsers/luminex_intelliflex/testdata/luminex_intelliflex_example_01.json +++ b/tests/parsers/luminex_intelliflex/testdata/luminex_intelliflex_example_01.json @@ -10,7 +10,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -418,7 +418,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -826,7 +826,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -1234,7 +1234,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -1642,7 +1642,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2050,7 +2050,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": 
"xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2458,7 +2458,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2866,7 +2866,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -3274,7 +3274,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -3682,7 +3682,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -4098,7 +4098,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -4514,7 +4514,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" diff --git a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02.json b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02.json index 
89965ddbf..507ec0ea7 100644 --- a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02.json +++ b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02.json @@ -123,7 +123,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -242,7 +242,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -361,7 +361,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -480,7 +480,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. 
No plate def" } }, "analyst": "nguymaip" @@ -624,7 +624,7 @@ "file name": "luminex_xPONENT_example02.csv", "UNC path": "tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02.csv", "ASM converter name": "allotropy_luminex_xponent", - "ASM converter version": "0.1.103", + "ASM converter version": "0.1.113", "software name": "xPONENT", "software version": "4.3.229.0" }, diff --git a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02_saved.json b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02_saved.json index ee9fe2c56..07335b618 100644 --- a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02_saved.json +++ b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02_saved.json @@ -123,7 +123,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -242,7 +242,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -361,7 +361,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" } }, "analyst": "nguymaip" @@ -480,7 +480,7 @@ }, "custom information document": { "BatchStopTime": "5/17/2023 7:06:59 PM", - "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def" + "ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. 
No plate def" } }, "analyst": "nguymaip" @@ -624,7 +624,7 @@ "file name": "luminex_xPONENT_example02_saved.csv", "UNC path": "tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02_saved.csv", "ASM converter name": "allotropy_luminex_xponent", - "ASM converter version": "0.1.103", + "ASM converter version": "0.1.113", "software name": "xPONENT", "software version": "4.3.229.0" }, diff --git a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_missing_optional_fields.json b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_missing_optional_fields.json index a5d3794fc..d1bcd5386 100644 --- a/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_missing_optional_fields.json +++ b/tests/parsers/luminex_xponent/testdata/luminex_xPONENT_missing_optional_fields.json @@ -10,7 +10,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -418,7 +418,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -826,7 +826,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -1234,7 +1234,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -1642,7 +1642,7 @@ "device control document": [ { "device 
type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2050,7 +2050,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2458,7 +2458,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -2866,7 +2866,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -3274,7 +3274,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -3682,7 +3682,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -4098,7 +4098,7 @@ "device control document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -4514,7 +4514,7 @@ "device control 
document": [ { "device type": "multi analyte profiling analyzer", - "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", + "detector gain setting": "xMAP INTELLIFLEX® High Sensitivity", "sample volume setting": { "value": 50.0, "unit": "μL" @@ -6853,7 +6853,7 @@ "file name": "luminex_xPONENT_missing_optional_fields.csv", "UNC path": "tests/parsers/luminex_xponent/testdata/luminex_xPONENT_missing_optional_fields.csv", "ASM converter name": "allotropy_luminex_xponent", - "ASM converter version": "0.1.103", + "ASM converter version": "0.1.113", "software name": "xPONENT", "software version": "2.1.1015" },