Skip to content

Commit a3ce2f7

Browse files
gh-55531: Implement normalize_encoding in C (#136643)
Closes gh-55531
1 parent 6826166 commit a3ce2f7

File tree

6 files changed

+123
-22
lines changed

6 files changed

+123
-22
lines changed

Lib/encodings/__init__.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
import codecs
3232
import sys
33+
from _codecs import _normalize_encoding
3334
from . import aliases
3435

3536
_cache = {}
@@ -55,18 +56,7 @@ def normalize_encoding(encoding):
5556
if isinstance(encoding, bytes):
5657
encoding = str(encoding, "ascii")
5758

58-
chars = []
59-
punct = False
60-
for c in encoding:
61-
if c.isalnum() or c == '.':
62-
if punct and chars:
63-
chars.append('_')
64-
if c.isascii():
65-
chars.append(c)
66-
punct = False
67-
else:
68-
punct = True
69-
return ''.join(chars)
59+
return _normalize_encoding(encoding)
7060

7161
def search_function(encoding):
7262

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
2+
by implementing the function in C using the private
3+
``_Py_normalize_encoding`` which has been modified to make lowercase
4+
conversion optional.

Modules/_codecsmodule.c

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,6 +1018,47 @@ _codecs_lookup_error_impl(PyObject *module, const char *name)
10181018
return PyCodec_LookupError(name);
10191019
}
10201020

1021+
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
1022+
1023+
/*[clinic input]
1024+
_codecs._normalize_encoding
1025+
encoding: unicode
1026+
1027+
Normalize an encoding name *encoding*.
1028+
1029+
Used for encodings.normalize_encoding. Does not convert to lower case.
1030+
[clinic start generated code]*/
1031+
1032+
static PyObject *
_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding)
/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/
{
    /* Normalize the encoding name *encoding* without lowercasing it
       (to_lower == 0) and return the result as a new str object.
       Return NULL with an exception set on failure. */
    Py_ssize_t len;
    const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len);
    if (cstr == NULL) {
        return NULL;
    }
    /* NOTE(review): the UTF-8 buffer may contain embedded NUL characters and
       _Py_normalize_encoding() takes no input length, so it presumably stops
       at the first NUL -- confirm whether such names should be rejected. */

    /* The output buffer needs len + 1 bytes.  A Py_ssize_t can never exceed
       PY_SSIZE_T_MAX, so compare against PY_SSIZE_T_MAX - 1 to actually
       guard the addition below. */
    if (len > PY_SSIZE_T_MAX - 1) {
        PyErr_SetString(PyExc_OverflowError, "encoding is too large");
        return NULL;
    }
    size_t bufsize = (size_t)len + 1;

    char *normalized = PyMem_Malloc(bufsize);
    if (normalized == NULL) {
        return PyErr_NoMemory();
    }

    if (!_Py_normalize_encoding(cstr, normalized, bufsize, 0)) {
        /* A zero return means the output buffer was too small.  That is not
           expected with a buffer of len + 1 bytes, but never return NULL
           without an exception set. */
        PyMem_Free(normalized);
        PyErr_SetString(PyExc_SystemError,
                        "failed to normalize encoding name");
        return NULL;
    }

    /* PyUnicode_FromString() copies the buffer, so it can be released
       whether or not the conversion succeeded. */
    PyObject *result = PyUnicode_FromString(normalized);
    PyMem_Free(normalized);
    return result;
}
1061+
10211062
/* --- Module API --------------------------------------------------------- */
10221063

10231064
static PyMethodDef _codecs_functions[] = {
@@ -1067,6 +1108,7 @@ static PyMethodDef _codecs_functions[] = {
10671108
_CODECS_REGISTER_ERROR_METHODDEF
10681109
_CODECS__UNREGISTER_ERROR_METHODDEF
10691110
_CODECS_LOOKUP_ERROR_METHODDEF
1111+
_CODECS__NORMALIZE_ENCODING_METHODDEF
10701112
{NULL, NULL} /* sentinel */
10711113
};
10721114

Modules/clinic/_codecsmodule.c.h

Lines changed: 65 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Objects/unicodeobject.c

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3449,13 +3449,14 @@ PyUnicode_FromEncodedObject(PyObject *obj,
34493449
return v;
34503450
}
34513451

3452-
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
3453-
also convert to lowercase. Return 1 on success, or 0 on error (encoding is
3454-
longer than lower_len-1). */
3452+
/* Normalize an encoding name like encodings.normalize_encoding()
3453+
and additionally convert to lowercase if *to_lower* is true.
3454+
Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
34553455
int
34563456
_Py_normalize_encoding(const char *encoding,
34573457
char *lower,
3458-
size_t lower_len)
3458+
size_t lower_len,
3459+
int to_lower)
34593460
{
34603461
const char *e;
34613462
char *l;
@@ -3486,7 +3487,7 @@ _Py_normalize_encoding(const char *encoding,
34863487
if (l == l_end) {
34873488
return 0;
34883489
}
3489-
*l++ = Py_TOLOWER(c);
3490+
*l++ = to_lower ? Py_TOLOWER(c) : c;
34903491
}
34913492
else {
34923493
punct = 1;
@@ -3521,7 +3522,7 @@ PyUnicode_Decode(const char *s,
35213522
}
35223523

35233524
/* Shortcuts for common default encodings */
3524-
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3525+
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
35253526
char *lower = buflower;
35263527

35273528
/* Fast paths */
@@ -3778,7 +3779,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
37783779
}
37793780

37803781
/* Shortcuts for common default encodings */
3781-
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3782+
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
37823783
char *lower = buflower;
37833784

37843785
/* Fast paths */

Python/fileutils.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
178178

179179
#define USE_FORCE_ASCII
180180

181-
extern int _Py_normalize_encoding(const char *, char *, size_t);
181+
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
182182

183183
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
184184
and POSIX locale. nl_langinfo(CODESET) announces an alias of the
@@ -229,7 +229,7 @@ check_force_ascii(void)
229229
}
230230

231231
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
232-
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
232+
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {
233233
goto error;
234234
}
235235

0 commit comments

Comments
 (0)