From a71f68712288c7c3c2a2d9e82e50fd45588fdcc7 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Sun, 18 May 2025 23:52:24 -0400 Subject: [PATCH 1/3] Convert from fast_double_parser to fast_float The only oddity in here is handling around `std::errc` because `0` is not part of the named enum values. As a workaround, treat the enum type as an int, and explicitly cast it when comparing it against `0`. --- .gitmodules | 6 +++--- CHANGELOG.md | 2 ++ MANIFEST.in | 2 +- src/_decoder.pyx | 10 +++++----- src/_imports.pyx | 7 +++++-- third-party/fast_double_parser | 1 - third-party/fast_float | 1 + 7 files changed, 17 insertions(+), 12 deletions(-) delete mode 160000 third-party/fast_double_parser create mode 160000 third-party/fast_float diff --git a/.gitmodules b/.gitmodules index eb19f3c..2e5c5fc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,6 +4,6 @@ [submodule "third-party/JSONTestSuite"] path = third-party/JSONTestSuite url = https://github.com/nst/JSONTestSuite.git -[submodule "third-party/fast_double_parser"] - path = third-party/fast_double_parser - url = https://github.com/lemire/fast_double_parser.git +[submodule "third-party/fast_float"] + path = third-party/fast_float + url = https://github.com/fastfloat/fast_float diff --git a/CHANGELOG.md b/CHANGELOG.md index 04e182b..2be7aaf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ # Changelog +* Switch from ``fast_double_parser`` to ``fast_float`` + **1.6.9 (2025-05-12)** * Remove unused import to fix installation on Termux (by veka0, [#105](https://github.com/Kijewski/pyjson5/pull/105)) diff --git a/MANIFEST.in b/MANIFEST.in index 494767a..0fbfc40 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,10 +4,10 @@ include Makefile include pyjson5.cpp include pyjson5.pyx include pyproject.toml -include third-party/fast_double_parser/include/fast_double_parser.h include requirements*.txt recursive-include docs ** recursive-include scripts ** recursive-include src ** recursive-include third-party/json5-tests 
** recursive-include third-party/JSONTestSuite/test_parsing ** +recursive-include third-party/fast_float/include ** diff --git a/src/_decoder.pyx b/src/_decoder.pyx index 6038863..85ff96e 100644 --- a/src/_decoder.pyx +++ b/src/_decoder.pyx @@ -243,14 +243,14 @@ cdef object _decode_string(ReaderRef reader, int32_t *c_in_out): cdef object _decode_double(StackHeapString[char] &buf, Py_ssize_t start): cdef double d0 - cdef const char *end_of_double + cdef from_chars_result result d0 = 0.0 # silence warning - end_of_double = parse_number(buf.data(), &d0) - if end_of_double != NULL and end_of_double[0] == b'\0': - return PyFloat_FromDouble(d0) + result = from_chars(buf.data(), buf.data() + buf.size(), d0) + if (result.ec): + _raise_unclosed('NumericLiteral', start) - _raise_unclosed('NumericLiteral', start) + return PyFloat_FromDouble(d0) cdef object _decode_number_leading_zero(ReaderRef reader, StackHeapString[char] &buf, diff --git a/src/_imports.pyx b/src/_imports.pyx index 834600d..e868936 100644 --- a/src/_imports.pyx +++ b/src/_imports.pyx @@ -124,9 +124,12 @@ cdef extern from 'src/_decoder_recursive_select.hpp' namespace 'JSON5EncoderCpp' DrsKind drs_lookup[128] +cdef extern from 'third-party/fast_float/include/fast_float/fast_float.h' namespace 'fast_float' nogil: + ctypedef struct from_chars_result: + char *ptr + int ec -cdef extern from 'third-party/fast_double_parser/include/fast_double_parser.h' namespace 'fast_double_parser' nogil: - const char *parse_number(const char *p, double *outDouble) + cdef from_chars_result from_chars(char *first, char *last, double &value); cdef extern from 'src/dragonbox.cc' namespace 'dragonbox' nogil: diff --git a/third-party/fast_double_parser b/third-party/fast_double_parser deleted file mode 160000 index bc93aee..0000000 --- a/third-party/fast_double_parser +++ /dev/null @@ -1 +0,0 @@ -Subproject commit bc93aee338615e46faac4140dd60eef761ba5b12 diff --git a/third-party/fast_float b/third-party/fast_float new file mode 160000 
index 0000000..c5a3ca3 --- /dev/null +++ b/third-party/fast_float @@ -0,0 +1 @@ +Subproject commit c5a3ca37c459050f367a4cb0b23c862c29242d30 From fe9236dff1379444f6873633b6dd165b6a22e7b3 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Mon, 19 May 2025 01:32:51 -0400 Subject: [PATCH 2/3] Update to set float parse format to json_or_infnan This more accurately matches the supported types for the JSON5 spec. --- src/_decoder.pyx | 2 +- src/_fast_float_compat.hpp | 10 ++++++++++ src/_imports.pyx | 7 ++++++- 3 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 src/_fast_float_compat.hpp diff --git a/src/_decoder.pyx b/src/_decoder.pyx index 85ff96e..5bfce23 100644 --- a/src/_decoder.pyx +++ b/src/_decoder.pyx @@ -246,7 +246,7 @@ cdef object _decode_double(StackHeapString[char] &buf, Py_ssize_t start): cdef from_chars_result result d0 = 0.0 # silence warning - result = from_chars(buf.data(), buf.data() + buf.size(), d0) + result = from_chars(buf.data(), buf.data() + buf.size(), d0, fmt_json_or_infnan) if (result.ec): _raise_unclosed('NumericLiteral', start) diff --git a/src/_fast_float_compat.hpp b/src/_fast_float_compat.hpp new file mode 100644 index 0000000..6d27900 --- /dev/null +++ b/src/_fast_float_compat.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "../third-party/fast_float/include/fast_float/float_common.h" + +/* This header file is a shim to handle 'enum class' in Cython, which doesn't + * namespace properly. 
*/ +namespace chars_format { + using chars_format = fast_float::chars_format; + + constexpr chars_format fmt_json_or_infnan = fast_float::chars_format::json_or_infnan; +} diff --git a/src/_imports.pyx b/src/_imports.pyx index e868936..f378422 100644 --- a/src/_imports.pyx +++ b/src/_imports.pyx @@ -124,12 +124,17 @@ cdef extern from 'src/_decoder_recursive_select.hpp' namespace 'JSON5EncoderCpp' DrsKind drs_lookup[128] +cdef extern from 'src/_fast_float_compat.hpp' namespace 'chars_format' nogil: + cdef cppclass chars_format: + pass + cdef const chars_format fmt_json_or_infnan + cdef extern from 'third-party/fast_float/include/fast_float/fast_float.h' namespace 'fast_float' nogil: ctypedef struct from_chars_result: char *ptr int ec - cdef from_chars_result from_chars(char *first, char *last, double &value); + cdef from_chars_result from_chars(char *first, char *last, double &value, chars_format fmt); cdef extern from 'src/dragonbox.cc' namespace 'dragonbox' nogil: From 469a89028e212dce1e7eefad62a8139aeaa5d97e Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Mon, 19 May 2025 18:25:27 -0400 Subject: [PATCH 3/3] Add an extra check for invalid exponents `fast_float` allows for exponents containing floats, which are not allowed per the JSON5 spec. Constraining `fast_float` to a format which does not allow for these exponents results in other supported usages being rejected. To handle this, explicitly check by adding a new guard. `has_invalid_exponent` is a string scan which returns true if it finds a suffix on the string which looks like an invalid exponent. 
---
 src/_decoder.pyx           |  8 +++++++-
 src/_fast_float_compat.hpp | 44 ++++++++++++++++++++++++++++++++++++++
 src/_imports.pyx           |  4 ++++
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/src/_decoder.pyx b/src/_decoder.pyx
index 5bfce23..af0d114 100644
--- a/src/_decoder.pyx
+++ b/src/_decoder.pyx
@@ -245,9 +245,15 @@ cdef object _decode_double(StackHeapString[char] &buf, Py_ssize_t start):
     cdef double d0
     cdef from_chars_result result
 
+    if has_invalid_exponent(buf.data()):
+        _raise_unclosed('NumericLiteral', start)
+
     d0 = 0.0  # silence warning
     result = from_chars(buf.data(), buf.data() + buf.size(), d0, fmt_json_or_infnan)
-    if (result.ec):
+    # from_chars() reports success after parsing a valid prefix, so also
+    # require that parsing stopped on the buffer's NUL terminator; otherwise
+    # trailing characters would be silently ignored.
+    if result.ec or result.ptr[0] != b'\0':
         _raise_unclosed('NumericLiteral', start)
 
     return PyFloat_FromDouble(d0)
diff --git a/src/_fast_float_compat.hpp b/src/_fast_float_compat.hpp
index 6d27900..203a2c6 100644
--- a/src/_fast_float_compat.hpp
+++ b/src/_fast_float_compat.hpp
@@ -8,3 +8,47 @@ namespace chars_format {
 
     constexpr chars_format fmt_json_or_infnan = fast_float::chars_format::json_or_infnan;
 }
+
+namespace check_floats {
+    /*
+     * Check for invalid exponents on strings which represent floats.
+     * Does not guarantee that the float is valid -- only that *if* it has an
+     * exponent, the exponent is valid.
+     *
+     * The exponent marker may be 'e' or 'E'; the first one found is used.
+     *
+     * Checks in this order:
+     *
+     * - no exponent                 OK (false)
+     *
+     * - nothing after exponent      FAIL (true)
+     *
+     * - a sign (+/-) at the end     FAIL (true)
+     *
+     * - anything after exponent     FAIL (true)
+     *   and optional sign
+     *   which is not a digit
+     *
+     * - nothing failed?             OK (false)
+     */
+    inline bool has_invalid_exponent(const std::string &s) {
+        auto pos = s.find_first_of("eE");
+        if (pos == std::string::npos) return false;
+
+        if (++pos >= s.size()) return true;
+
+        if (s[pos] == '+' || s[pos] == '-') {
+            if (++pos >= s.size()) return true;
+        }
+
+        // Everything after the optional sign must be an ASCII digit; this
+        // rejects a dot ('.') or any other character. Compare against the
+        // digit range directly: isdigit() is locale-dependent and undefined
+        // for negative char values.
+        for (; pos < s.size(); ++pos) {
+            if (s[pos] < '0' || s[pos] > '9') return true;
+        }
+
+        return false;
+    }
+}
diff --git a/src/_imports.pyx b/src/_imports.pyx
index f378422..ca94ef1 100644
--- a/src/_imports.pyx
+++ b/src/_imports.pyx
@@ -13,6 +13,7 @@ from cpython.object cimport PyObject, PyObject_GetIter
 from cpython.type cimport PyType_Check
 from cpython.unicode cimport PyUnicode_Check, PyUnicode_FromEncodedObject, PyUnicode_Format
 from libcpp cimport bool as boolean
+from libcpp.string cimport string
 
 
 cdef extern from '' namespace 'std' nogil:
@@ -129,6 +130,9 @@ cdef extern from 'src/_fast_float_compat.hpp' namespace 'chars_format' nogil:
         pass
     cdef const chars_format fmt_json_or_infnan
 
+cdef extern from 'src/_fast_float_compat.hpp' namespace 'check_floats' nogil:
+    cdef boolean has_invalid_exponent(string &s)
+
 cdef extern from 'third-party/fast_float/include/fast_float/fast_float.h' namespace 'fast_float' nogil:
     ctypedef struct from_chars_result:
         char *ptr