Skip to content

Commit 2ae5316

Browse files
committed
Merge remote-tracking branch 'ArrayBolt3/arraybolt3/trixie'
2 parents f6876ea + 3d12d9b commit 2ae5316

31 files changed

+920
-329
lines changed

.github/workflows/lint.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,15 @@ on:
66
- usr/lib/python3/dist-packages/stdisplay/**
77
- usr/lib/python3/dist-packages/sanitize_string/**
88
- usr/lib/python3/dist-packages/strip_markup/**
9+
- usr/lib/python3/dist-packages/unicode_show/**
910
- .github/workflows/lint.yml
1011
- run-tests
1112
pull_request:
1213
paths:
1314
- usr/lib/python3/dist-packages/stdisplay/**
1415
- usr/lib/python3/dist-packages/sanitize_string/**
1516
- usr/lib/python3/dist-packages/strip_markup/**
17+
- usr/lib/python3/dist-packages/unicode_show/**
1618
- .github/workflows/lint.yml
1719
- run-tests
1820

@@ -27,7 +29,10 @@ jobs:
2729
- image: debian:stable
2830
- image: debian:testing
2931
- image: debian:unstable
30-
- image: ubuntu:latest
32+
## Disabled because ubuntu:latest is currently 24.04, which uses a
33+
## version of Python that does not support the `newline` argument of
34+
## Path.read_text()
35+
# - image: ubuntu:latest
3136
- image: ubuntu:rolling
3237

3338
container:

man/unicode-show.1.ronn

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ unicode-show(1) -- Scan and annotate suspicious Unicode characters
1919

2020
### What is considered suspicious:
2121
- Characters outside the printable ASCII range (`0x20`-`0x7E`)
22-
- Control characters (excluding `\n`, `\r`, `\t`)
22+
- Control characters (excluding `\n` and `\t`)
23+
- Carriage returns (`\r`), even when used in CRLF pairs
2324
- Any character not in the standard set of ASCII letters, digits, punctuation, and trailing whitespace
2425

2526
### Output formatting:
@@ -29,6 +30,7 @@ unicode-show(1) -- Scan and annotate suspicious Unicode characters
2930

3031
## Color output is disabled if:
3132
- The environment variable `$NOCOLOR` is set
33+
- The environment variable `$NO_COLOR` is set to `1`
3234
- `$TERM` is set to `dumb`
3335
- Output is redirected (non-interactive terminal)
3436

@@ -71,6 +73,7 @@ NOCOLOR=1 unicode-show example.txt
7173
## ENVIRONMENT
7274

7375
- **NOCOLOR** - disables color output if set
76+
- **NO_COLOR** - disables color output if set to `1`
7477
- **TERM** - if set to `dumb`, disables color output
7578

7679
## AUTHOR

run-tests

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ black=(black --config="${pyrc}" --color --diff --check)
1010
pylint=(pylint --rcfile="${pyrc}")
1111
mypy=(mypy --config-file="${pyrc}")
1212

13-
py_lib_to_test_list=(stdisplay sanitize_string strip_markup)
13+
py_lib_to_test_list=(stdisplay sanitize_string strip_markup unicode_show)
1414

1515
for py_lib_to_test in "${py_lib_to_test_list[@]}"; do
1616
cd "${pythonpath}/${py_lib_to_test}/"
@@ -23,7 +23,7 @@ for py_lib_to_test in "${py_lib_to_test_list[@]}"; do
2323
done
2424

2525
stdin_file_read_utils=(stcat stcatn)
26-
stdin_implicit_read_utils=(sttee stsponge strip-markup)
26+
stdin_implicit_read_utils=(sttee stsponge strip-markup unicode-show)
2727
stdin_utils=("${stdin_file_read_utils[@]}" "${stdin_implicit_read_utils[@]}")
2828
utils=(stprint stecho "${stdin_utils[@]}")
2929
cd "${git_toplevel}/usr/bin"

usr/bin/grep-find-unicode-wrapper

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,10 @@ set -o errexit
7171

7272
output_message="$(
7373
{
74-
[ -n "$one" ] && printf '%s\n' "$one"
75-
[ -n "$two" ] && printf '%s\n' "$two"
76-
[ -n "$three" ] && printf '%s\n' "$three"
77-
[ -n "$four" ] && printf '%s\n' "$four"
74+
[ -n "${one:-}" ] && printf '%s\n' "$one" || true
75+
[ -n "${two:-}" ] && printf '%s\n' "$two" || true
76+
[ -n "${three:-}" ] && printf '%s\n' "$three" || true
77+
[ -n "${four:-}" ] && printf '%s\n' "$four" || true
7878
} | sort --unique
7979
)"
8080

usr/bin/unicode-show

Lines changed: 3 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -3,155 +3,10 @@
33
## Copyright (C) 2025 - 2025 ENCRYPTED SUPPORT LLC <adrelanos@whonix.org>
44
## See the file COPYING for copying conditions.
55

6-
"""
7-
This script scans input text or files for non-ASCII and suspicious Unicode characters.
8-
It prints lines with suspicious characters annotated inline (e.g., [U+XXXX]).
9-
For each such character, it prints the Unicode codepoint, name, and category.
10-
11-
Exit codes:
12-
0 - No suspicious Unicode found
13-
1 - Suspicious Unicode found
14-
2 - Error (e.g., file I/O or decoding error)
15-
"""
6+
# pylint: disable=missing-module-docstring,invalid-name
167

178
import sys
18-
import unicodedata
19-
import string
20-
import io
21-
import os
22-
23-
USE_COLOR = (
24-
not os.getenv("NOCOLOR") and
25-
os.getenv("TERM") != "dumb" and
26-
sys.stdout.isatty()
27-
)
28-
29-
RED = "\033[91m"
30-
CYAN = "\033[96m"
31-
RESET = "\033[0m"
32-
33-
VISIBLE_ASCII_RANGE = range(0x20, 0x7F)
34-
ALLOWED_WHITESPACE = {'\n', '\r', '\t'}
35-
SAFE_ASCII_SEMANTIC = set(string.ascii_letters + string.digits + string.punctuation + " \n\r\t")
36-
37-
def colorize(text, color):
38-
return f"{color}{text}{RESET}" if USE_COLOR else text
39-
40-
def is_suspicious(c):
41-
codepoint_allowed = ord(c) in VISIBLE_ASCII_RANGE or c in ALLOWED_WHITESPACE
42-
semantically_allowed = c in SAFE_ASCII_SEMANTIC
43-
## Purposeful redundancy for extra safety.
44-
return not (codepoint_allowed and semantically_allowed)
45-
46-
def describe_char(c):
47-
"""Return a description of a Unicode character including codepoint, name, and category."""
48-
code = ord(c)
49-
name = unicodedata.name(c, "<unnamed>")
50-
cat = unicodedata.category(c)
51-
52-
codepoint_allowed = code in VISIBLE_ASCII_RANGE or c in ALLOWED_WHITESPACE
53-
semantically_allowed = c in SAFE_ASCII_SEMANTIC
54-
55-
## Purposeful redundancy for extra safety in character display.
56-
if codepoint_allowed and semantically_allowed:
57-
display = c
58-
else:
59-
display = repr(c)
60-
61-
desc = f"{display} (U+{code:04X}, {name}, {cat})"
62-
return colorize(desc, CYAN)
63-
64-
def scan_line(line, lineno=None, filename=None):
65-
"""Scan a single line for suspicious characters, print annotated line and character info."""
66-
67-
annotated = ""
68-
has_suspicious = False
69-
prefix = f"{filename or '<stdin>'}:{lineno}: "
70-
suspicious_descrs = []
71-
72-
for c in line:
73-
if is_suspicious(c):
74-
has_suspicious = True
75-
code = f"[U+{ord(c):04X}]"
76-
annotated += colorize(code, RED)
77-
suspicious_descrs.append(f" -> {describe_char(c)}")
78-
else:
79-
annotated += c
80-
81-
if annotated and annotated[-1] == "\n":
82-
annotated = annotated[:-1]
83-
84-
annotated_stripped = annotated.rstrip()
85-
## Trailing whitespaces are suspicious.
86-
## https://forums.whonix.org/t/detecting-malicious-unicode-in-source-code-and-pull-requests/13754/28
87-
if len(annotated) != len(annotated_stripped):
88-
annotated_new = annotated_stripped
89-
has_suspicious = True
90-
for c in annotated[len(annotated_stripped):]:
91-
code = f"[U+{ord(c):04X}]"
92-
annotated_new += colorize(code, RED)
93-
suspicious_descrs.append(f" -> {describe_char(c)}")
94-
annotated = annotated_new
95-
96-
if not has_suspicious:
97-
return False
98-
99-
print(prefix + annotated)
100-
for suspicious_descr in suspicious_descrs:
101-
print(suspicious_descr)
102-
103-
return True
104-
105-
def scan_file(f, filename=None):
106-
"""Scan an entire file-like object for suspicious characters."""
107-
found = False
108-
last_lineno = 0
109-
## Empty files should not report "missing newline at end".
110-
#last_line = ""
111-
last_line = None
112-
113-
for lineno, line in enumerate(f, 1):
114-
last_lineno = lineno
115-
last_line = line
116-
if scan_line(line, lineno=lineno, filename=filename):
117-
found = True
118-
119-
if last_line is not None and not last_line.endswith('\n'):
120-
found = True
121-
## Missing newline at the end is suspicious.
122-
msg = f"{filename or '<stdin>'}:{last_lineno}: " + colorize("[missing newline at end]", RED)
123-
print(msg)
124-
125-
return found
9+
from unicode_show.unicode_show import main
12610

12711
if __name__ == "__main__":
128-
clean = True
129-
try:
130-
if len(sys.argv) > 1:
131-
for fname in sys.argv[1:]:
132-
try:
133-
## Must not use errors='replace' because otherwise suspicious unicode might slip.
134-
## Fail closed for non-UTF-8.
135-
with open(fname, 'r', encoding='utf-8', errors='strict') as f:
136-
if scan_file(f, filename=fname):
137-
clean = False
138-
except UnicodeDecodeError as e:
139-
print(f"[ERROR] Unicode decode error [{fname}]: {e}", file=sys.stderr)
140-
clean = False
141-
except Exception as e:
142-
print(f"[ERROR] File read error [{fname}]: {e}", file=sys.stderr)
143-
sys.exit(2)
144-
else:
145-
try:
146-
stdin_buffer = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='strict')
147-
if scan_file(stdin_buffer):
148-
clean = False
149-
except UnicodeDecodeError as e:
150-
print(f"[ERROR] Unicode decode error [stdin]: {e}", file=sys.stderr)
151-
clean = False
152-
except Exception as e:
153-
print(f"[ERROR] Unexpected error [main]: {e}", file=sys.stderr)
154-
sys.exit(2)
155-
156-
if not clean:
157-
sys.exit(1)
12+
sys.exit(main())
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-
#!/usr/bin/python3 -su

usr/lib/python3/dist-packages/sanitize_string/sanitize_string.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,3 @@ def main() -> int:
8989
else:
9090
sys.stdout.write(sanitized_string)
9191
return 0
92-
93-
94-
if __name__ == "__main__":
95-
main()

usr/lib/python3/dist-packages/stdisplay/__init__.py

100755 → 100644
File mode changed.

usr/lib/python3/dist-packages/stdisplay/py.typed

100755 → 100644
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-
#!/usr/bin/python3 -su

usr/lib/python3/dist-packages/stdisplay/stcat.py

100755 → 100644
Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
11
#!/usr/bin/python3 -su
22

3-
## SPDX-FileCopyrightText: 2025 Benjamin Grande M. S. <ben.grande.b@gmail.com>
4-
## SPDX-FileCopyrightText: 2025 ENCRYPTED SUPPORT LLC <adrelanos@whonix.org>
5-
##
6-
## SPDX-License-Identifier: AGPL-3.0-or-later
3+
## Copyright (C) 2025 - 2025 Benjamin Grande M. S. <ben.grande.b@gmail.com>
4+
## Copyright (C) 2025 - 2025 ENCRYPTED SUPPORT LLC <adrelanos@whonix.org>
5+
## See the file COPYING for copying conditions.
76

87
"""Safely print stdin or file to stdout."""
98

109
from pathlib import Path
11-
from sys import argv, stdin, stdout, modules
10+
from sys import argv, stdin, stdout
1211
from stdisplay.stdisplay import stdisplay
1312

1413

1514
def main() -> None:
1615
"""Safely print stdin or file to stdout."""
17-
# https://github.com/pytest-dev/pytest/issues/4843
18-
if "pytest" not in modules and stdin is not None:
19-
stdin.reconfigure(errors="replace") # type: ignore
16+
stdout.reconfigure( # type: ignore
17+
encoding="ascii", errors="replace", newline="\n"
18+
)
19+
if stdin is not None:
20+
stdin.reconfigure( # type: ignore
21+
encoding="utf-8", errors="replace", newline="\n"
22+
)
2023
if len(argv) == 1:
2124
if stdin is not None:
2225
for untrusted_line in stdin:
@@ -30,10 +33,8 @@ def main() -> None:
3033
stdout.write(stdisplay(untrusted_line))
3134
else:
3235
path = Path(untrusted_arg)
33-
untrusted_text = path.read_text(encoding="utf-8", errors="replace")
36+
untrusted_text = path.read_text(
37+
encoding="utf-8", errors="replace", newline="\n"
38+
)
3439
stdout.write(stdisplay(untrusted_text))
3540
stdout.flush()
36-
37-
38-
if __name__ == "__main__":
39-
main()

0 commit comments

Comments
 (0)