Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -224,3 +224,4 @@ fabric.properties
.vscode/launch.json
.vscode/settings.json
CHROM/
.bak.py
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ classifiers = [
]
dependencies = [
"cattrs >= 23.2.0",
"chardet >= 5.2.0, < 6.0.0",
"chardet >= 5.2.0",
"defusedxml >= 0.7.1",
# NOTE: jsonschema 4.18.0 introduces a serious performance regression, seemingly due to use of new
# referencing library.
Expand Down
75 changes: 67 additions & 8 deletions src/allotropy/parsers/utils/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,66 @@ def determine_encoding(
f"Unable to detect text encoding for file with content: {actual_contents!r}"
)
raise AllotropeParsingError(msg)
# chardet can report the wrong encoding when there are strange characters in the contents (e.g. emojis)
# To address this, we take the following approach - if the confidence of the detection is < 70%, report
# DEFAULT_ENCODING first, and the detected encoding second. If we return multiple encodings, the caller
# should try all.
if detect_result["confidence"] < 0.7:
return [DEFAULT_ENCODING, detect_result["encoding"]]
return [detect_result["encoding"]]

detected_encoding = detect_result["encoding"]
confidence = detect_result["confidence"]

# chardet can misdetect UTF-8 multi-byte sequences as ISO-8859-1 or similar
# Latin-1 encodings, especially in older versions (5.x). Always try UTF-8
# first when Latin-1 family encodings are detected to avoid mojibake.
latin1_encodings = {
"ISO-8859-1",
"ISO-8859-2",
"ISO-8859-3",
"ISO-8859-4",
"ISO-8859-5",
"ISO-8859-6",
"ISO-8859-7",
"ISO-8859-8",
"ISO-8859-9",
"ISO-8859-10",
"ISO-8859-13",
"ISO-8859-14",
"ISO-8859-15",
"ISO-8859-16",
"cp1250",
"cp1251",
"cp1252",
"cp1253",
"cp1254",
"cp1255",
"cp1256",
"cp1257",
"cp1258",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
}

# Normalize encoding name for comparison
normalized_encoding = detected_encoding.upper() if detected_encoding else ""

if confidence < 0.3:
# Very low confidence, try multiple encodings
encodings: list[str | None] = [DEFAULT_ENCODING, "windows-1252"]
if detected_encoding and detected_encoding not in encodings:
encodings.append(detected_encoding)
return encodings
elif normalized_encoding in latin1_encodings or confidence < 0.7:
# For Latin-1 family or medium confidence, try UTF-8 first
# This handles chardet 5.x incorrectly detecting UTF-8 as ISO-8859-1
if detected_encoding != DEFAULT_ENCODING:
return [DEFAULT_ENCODING, detected_encoding]
return [DEFAULT_ENCODING]
else:
# High confidence non-Latin encoding, use as detected
return [detected_encoding]


def decode(contents: IO[bytes] | IO[str], encoding: str | None) -> str:
Expand All @@ -56,7 +109,13 @@ def decode(contents: IO[bytes] | IO[str], encoding: str | None) -> str:
msg = f"Could not determine encoding of contents: {actual_contents!r}"
raise AssertionError(msg)
try:
return actual_contents.decode(encoding)
decoded = actual_contents.decode(encoding)
# Strip BOM (Byte Order Mark) if present
# UTF-16 and UTF-8 files may include BOM character U+FEFF
# which should be removed from the content
if decoded and decoded[0] == "\ufeff":
decoded = decoded[1:]
return decoded
except UnicodeDecodeError as e:
if encoding != possible_encodings[-1]:
continue
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down Expand Up @@ -418,7 +418,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down Expand Up @@ -826,7 +826,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down Expand Up @@ -1234,7 +1234,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down Expand Up @@ -1642,7 +1642,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down Expand Up @@ -2050,7 +2050,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down Expand Up @@ -2458,7 +2458,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down Expand Up @@ -2866,7 +2866,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down Expand Up @@ -3274,7 +3274,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down Expand Up @@ -3682,7 +3682,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down Expand Up @@ -4098,7 +4098,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down Expand Up @@ -4514,7 +4514,7 @@
"device control document": [
{
"device type": "multi analyte profiling analyzer",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"detector gain setting": "xMAP INTELLIFLEX® High Sensitivity",
"sample volume setting": {
"value": 50.0,
"unit": "μL"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@
},
"custom information document": {
"BatchStopTime": "5/17/2023 7:06:59 PM",
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
}
},
"analyst": "nguymaip"
Expand Down Expand Up @@ -242,7 +242,7 @@
},
"custom information document": {
"BatchStopTime": "5/17/2023 7:06:59 PM",
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
}
},
"analyst": "nguymaip"
Expand Down Expand Up @@ -361,7 +361,7 @@
},
"custom information document": {
"BatchStopTime": "5/17/2023 7:06:59 PM",
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
}
},
"analyst": "nguymaip"
Expand Down Expand Up @@ -480,7 +480,7 @@
},
"custom information document": {
"BatchStopTime": "5/17/2023 7:06:59 PM",
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
}
},
"analyst": "nguymaip"
Expand Down Expand Up @@ -624,7 +624,7 @@
"file name": "luminex_xPONENT_example02.csv",
"UNC path": "tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02.csv",
"ASM converter name": "allotropy_luminex_xponent",
"ASM converter version": "0.1.103",
"ASM converter version": "0.1.113",
"software name": "xPONENT",
"software version": "4.3.229.0"
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@
},
"custom information document": {
"BatchStopTime": "5/17/2023 7:06:59 PM",
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
}
},
"analyst": "nguymaip"
Expand Down Expand Up @@ -242,7 +242,7 @@
},
"custom information document": {
"BatchStopTime": "5/17/2023 7:06:59 PM",
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
}
},
"analyst": "nguymaip"
Expand Down Expand Up @@ -361,7 +361,7 @@
},
"custom information document": {
"BatchStopTime": "5/17/2023 7:06:59 PM",
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
}
},
"analyst": "nguymaip"
Expand Down Expand Up @@ -480,7 +480,7 @@
},
"custom information document": {
"BatchStopTime": "5/17/2023 7:06:59 PM",
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
"ProtocolDescription": "V116 plates 50�l sample 25 count cutoff. No plate def"
}
},
"analyst": "nguymaip"
Expand Down Expand Up @@ -624,7 +624,7 @@
"file name": "luminex_xPONENT_example02_saved.csv",
"UNC path": "tests/parsers/luminex_xponent/testdata/luminex_xPONENT_example02_saved.csv",
"ASM converter name": "allotropy_luminex_xponent",
"ASM converter version": "0.1.103",
"ASM converter version": "0.1.113",
"software name": "xPONENT",
"software version": "4.3.229.0"
},
Expand Down
Loading
Loading