From 41e7e5e8b78d2ca017b252366a546a8368841fed Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 13 Feb 2026 20:07:49 +0000 Subject: [PATCH] Add RFC 6901-style tilde escaping for special characters in keys Keys containing `.`, `$`, `[`, or `~` previously caused crashes (ValueError, AttributeError) or silent data corruption on round-trip through flatten/unflatten. This adds escape sequences (~0 for ~, ~1 for ., ~2 for $, ~3 for [) so all valid JSON keys round-trip correctly. Also fixes: rsplit("$", 2) -> rsplit("$", 1), _int_key_re.match -> fullmatch, and guards against AttributeError when root becomes list. https://claude.ai/code/session_01SwKhkAmPwW1qrMHpJmYT3x --- README.md | 47 ++++++++++++++ json_flatten.py | 59 ++++++++++++++--- test_json_flatten.py | 150 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 244 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 52ac6a2..983dcb8 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ The top-level object passed to `flatten()` must be a dictionary. 2. Type information is preserved using `$type` suffixes. 3. List indices are represented using `[index]` notation. 4. Empty objects and lists have special representations. +5. Special characters in keys are escaped using `~` tilde escaping (inspired by [RFC 6901 JSON Pointer](https://www.rfc-editor.org/rfc/rfc6901)). ### Nested objects @@ -137,6 +138,52 @@ matrix.[1].[1]$int=4 ``` +### Key escaping + +Dictionary keys containing characters that are special to the flattening format (`.`, `$`, `[`, `~`) are escaped using tilde sequences, inspired by [RFC 6901 JSON Pointer](https://www.rfc-editor.org/rfc/rfc6901): + +| Sequence | Literal character | +|----------|-------------------| +| `~0` | `~` | +| `~1` | `.` | +| `~2` | `$` | +| `~3` | `[` | + +This ensures that keys with special characters round-trip correctly through `flatten()` and `unflatten()`. + + +**Key `a.b`**: +``` +a~1b=dot in key +``` +**Key `price$int`**: +``` +price~2int=dollar in key +``` +**Key `[0]`**: +``` +~30]=bracket in key +``` +**Key `a~b`**: +``` +a~0b=tilde in key +``` + + ## Type preservation Types are preserved using `$type` suffixes: diff --git a/json_flatten.py b/json_flatten.py index c8acaa5..d59cdb4 100644 --- a/json_flatten.py +++ b/json_flatten.py @@ -2,7 +2,7 @@ flatten() and unflatten() A pair of functions that can convert an arbitrary JSON object into a -flat name/value pair dictionary and back again, preserving type +flat name/value pair dictionary and back again, preserving type information and handling both nested lists and nested dictionaries. For example: @@ -33,11 +33,34 @@ "this.other_types.false$bool": "False", "this.other_types.none$none": "None", } + +Keys containing special characters (., $, [, ~) are escaped using +RFC 6901-style tilde escaping: + + ~0 = literal ~ + ~1 = literal . + ~2 = literal $ + ~3 = literal [ """ import re +def _escape_key(key): + """Escape special characters in a dictionary key. + + Order matters: ~ must be escaped first to avoid double-escaping.""" + return key.replace("~", "~0").replace(".", "~1").replace("$", "~2").replace("[", "~3") + + +def _unescape_key(key): + """Unescape a previously escaped key segment. + + Order matters: ~0 must be decoded last so ~03 doesn't prematurely + become ~ + 3 then [.""" + return key.replace("~3", "[").replace("~2", "$").replace("~1", ".").replace("~0", "~") + + def _object_to_rows(obj, prefix=None): rows = [] dot_prefix = prefix and (prefix + ".") or "" @@ -46,7 +69,9 @@ def _object_to_rows(obj, prefix=None): rows.append(((prefix or "") + "$empty", "{}")) else: for key, item in obj.items(): - rows.extend(_object_to_rows(item, prefix=dot_prefix + key)) + rows.extend( + _object_to_rows(item, prefix=dot_prefix + _escape_key(key)) + ) elif isinstance(obj, (list, tuple)): if len(obj) == 0: rows.append(((prefix or "") + "$emptylist", "[]")) @@ -72,7 +97,7 @@ def flatten(obj): _types_re = re.compile(r".*\$(none|bool|int|float|empty|emptylist)$") -_int_key_re = re.compile(r"\[(\d+)\]") +_int_key_re = re.compile(r"\[(\d+)\]$") def unflatten(data): @@ -86,7 +111,7 @@ def unflatten(data): current = current[bit] # Now deal with $type suffixes: if _types_re.match(lastkey): - lastkey, lasttype = lastkey.rsplit("$", 2) + lastkey, lasttype = lastkey.rsplit("$", 1) value = { "int": int, "float": float, @@ -95,20 +120,22 @@ def unflatten(data): "bool": lambda v: v.lower() == "true", "none": lambda v: None, }.get(lasttype, lambda v: v)(value) + # Keep lastkey in escaped form here -- unescaping happens in third pass + # so that [N] detection in second pass isn't confused by literal bracket keys current[lastkey] = value - # We handle foo.[0].one, foo.[1].two syntax in a second pass, - # by iterating through our structure looking for dictionaries - # where all of the keys are stringified integers + # Second pass: convert dicts where all keys are [N] into lists. + # This works on escaped keys, so real array indices [0] match but + # escaped bracket keys like ~30] do not. def replace_integer_keyed_dicts_with_lists(obj): if isinstance(obj, dict): - if obj and all(_int_key_re.match(k) for k in obj): + if obj and all(_int_key_re.fullmatch(k) for k in obj): return [ i[1] for i in sorted( [ ( - int(_int_key_re.match(k).group(1)), + int(_int_key_re.fullmatch(k).group(1)), replace_integer_keyed_dicts_with_lists(v), ) for k, v in obj.items() @@ -126,7 +153,19 @@ def replace_integer_keyed_dicts_with_lists(obj): return obj obj = replace_integer_keyed_dicts_with_lists(obj) + + # Third pass: unescape all remaining dict keys + def unescape_keys(obj): + if isinstance(obj, dict): + return {_unescape_key(k): unescape_keys(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [unescape_keys(v) for v in obj] + else: + return obj + + obj = unescape_keys(obj) + # Handle root units only, e.g. {'$empty': '{}'} - if list(obj.keys()) == [""]: + if isinstance(obj, dict) and list(obj.keys()) == [""]: return list(obj.values())[0] return obj diff --git a/test_json_flatten.py b/test_json_flatten.py index f4efb44..2eb5a69 100644 --- a/test_json_flatten.py +++ b/test_json_flatten.py @@ -39,7 +39,7 @@ }, ), ( - "dollar_signs_that_are_not_type_indicators", + "dollar_signs_escaped", { "foo": [ { @@ -50,7 +50,7 @@ }, { "foo.[0].emails.[0]": "bar@example.com", - "foo.[0].phones._$!!$_": "555-555-5555", + "foo.[0].phones._~2!!~2_": "555-555-5555", }, ), ("empty_object", {}, {"$empty": "{}"}), @@ -88,3 +88,149 @@ def test_integers_with_gaps_does_not_create_sparse_array(): def test_list_as_base_level_object_rejected_with_error(): with pytest.raises(TypeError): flatten([{"name": "john"}]) + + +# --- RED phase: tests for tilde escaping (RFC 6901-style) --- + + +class TestDotInKeys: + """Issue #1: Keys containing dots must round-trip correctly.""" + + def test_simple_dot_in_key(self): + obj = {"a.b": "value"} + assert unflatten(flatten(obj)) == obj + + def test_dot_key_distinct_from_nested(self): + """Dotted key and nested key must produce different flattened forms.""" + dotted = flatten({"a.b": "value"}) + nested = flatten({"a": {"b": "value"}}) + assert dotted != nested + + def test_dot_key_flattened_form(self): + assert flatten({"a.b": "value"}) == {"a~1b": "value"} + + def test_dot_key_with_type_suffix(self): + obj = {"a.b": 5} + assert flatten(obj) == {"a~1b$int": "5"} + assert unflatten(flatten(obj)) == obj + + def test_multiple_dots_in_key(self): + obj = {"a.b.c": "value"} + assert unflatten(flatten(obj)) == obj + + def test_dot_key_nested_in_object(self): + obj = {"outer": {"a.b": "value"}} + assert unflatten(flatten(obj)) == obj + + +class TestDollarInKeys: + """Issue #2: Keys containing $ must not crash rsplit.""" + + def test_dollar_in_key_with_int_value(self): + obj = {"my$key": 5} + assert unflatten(flatten(obj)) == obj + + def test_dollar_in_key_flattened_form(self): + assert flatten({"my$key": 5}) == {"my~2key$int": "5"} + + def test_dollar_in_key_with_string_value(self): + obj = {"my$key": "hello"} + assert unflatten(flatten(obj)) == obj + + def test_multiple_dollars_in_key(self): + obj = {"a$b$c": 10} + assert unflatten(flatten(obj)) == obj + + +class TestAmbiguousTypeSuffix: + """Issue #3: Keys ending with type suffix names must not be misinterpreted.""" + + def test_key_ending_with_dollar_int_string_value(self): + obj = {"price$int": "hello"} + assert unflatten(flatten(obj)) == obj + + def test_key_ending_with_dollar_none_string_value(self): + obj = {"flag$none": "active"} + assert unflatten(flatten(obj)) == obj + + def test_key_ending_with_dollar_bool_string_value(self): + obj = {"x$bool": "maybe"} + assert unflatten(flatten(obj)) == obj + + def test_key_ending_with_dollar_float_string_value(self): + obj = {"val$float": "text"} + assert unflatten(flatten(obj)) == obj + + def test_key_ending_with_dollar_empty_string_value(self): + obj = {"obj$empty": "not empty"} + assert unflatten(flatten(obj)) == obj + + def test_key_ending_with_dollar_emptylist_string_value(self): + obj = {"arr$emptylist": "not a list"} + assert unflatten(flatten(obj)) == obj + + +class TestBracketKeys: + """Issue #4: Keys in [N] format must not crash or be treated as array indices.""" + + def test_bracket_key_round_trip(self): + obj = {"[0]": "value"} + assert unflatten(flatten(obj)) == obj + + def test_bracket_key_flattened_form(self): + assert flatten({"[0]": "value"}) == {"~30]": "value"} + + def test_bracket_key_not_confused_with_list(self): + """A dict with [N] keys must stay a dict, not become a list.""" + obj = {"[0]": "a", "[1]": "b"} + result = unflatten(flatten(obj)) + assert isinstance(result, dict) + assert result == obj + + def test_bracket_key_nested(self): + obj = {"outer": {"[0]": "value"}} + assert unflatten(flatten(obj)) == obj + + +class TestTildeInKeys: + """Self-consistency: keys containing ~ must round-trip correctly.""" + + def test_tilde_in_key(self): + obj = {"a~b": "value"} + assert unflatten(flatten(obj)) == obj + + def test_tilde_in_key_flattened_form(self): + assert flatten({"a~b": "value"}) == {"a~0b": "value"} + + def test_tilde_escape_sequence_in_key(self): + """A key that looks like an escape sequence must round-trip.""" + obj = {"a~1b": "value"} + assert unflatten(flatten(obj)) == obj + + def test_tilde_with_type_suffix(self): + obj = {"a~b": 5} + assert unflatten(flatten(obj)) == obj + + +class TestCombinationEscaping: + """Multiple special characters in the same key.""" + + def test_dot_and_dollar_in_key(self): + obj = {"a.b$int": "hello"} + assert unflatten(flatten(obj)) == obj + + def test_all_special_chars_in_key(self): + obj = {"a.b$c[0]~d": "value"} + assert unflatten(flatten(obj)) == obj + + def test_existing_dollar_sign_test_updated(self): + """The _$!!$_ key must still round-trip with escaping.""" + obj = { + "foo": [ + { + "emails": ["bar@example.com"], + "phones": {"_$!!$_": "555-555-5555"}, + } + ] + } + assert unflatten(flatten(obj)) == obj