Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ The top-level object passed to `flatten()` must be a dictionary.
2. Type information is preserved using `$type` suffixes.
3. List indices are represented using `[index]` notation.
4. Empty objects and lists have special representations.
5. Special characters in keys are escaped using tilde (`~`) sequences, inspired by [RFC 6901 JSON Pointer](https://www.rfc-editor.org/rfc/rfc6901).

### Nested objects

Expand Down Expand Up @@ -137,6 +138,52 @@ matrix.[1].[1]$int=4
```
<!-- [[[end]]] -->

### Key escaping

Dictionary keys containing characters that are special to the flattening format (`.`, `$`, `[`, `~`) are escaped using tilde sequences, inspired by [RFC 6901 JSON Pointer](https://www.rfc-editor.org/rfc/rfc6901):

| Sequence | Literal character |
|----------|-------------------|
| `~0` | `~` |
| `~1` | `.` |
| `~2` | `$` |
| `~3` | `[` |

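This ensures that keys with special characters round-trip correctly through `flatten()` and `unflatten()`. Note that `]` is left unescaped: escaping the opening `[` is enough to stop a key such as `[0]` from being read back as a list index.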

<!-- [[[cog
examples = [
{"a.b": "dot in key"},
{"price$int": "dollar in key"},
{"[0]": "bracket in key"},
{"a~b": "tilde in key"},
]

for example in examples:
key = list(example.keys())[0]
cog.out(f"**Key `{key}`**:\n```\n")
for k, v in flatten(example).items():
cog.out(f"{k}={v}\n")
cog.out("```\n")
]]] -->
**Key `a.b`**:
```
a~1b=dot in key
```
**Key `price$int`**:
```
price~2int=dollar in key
```
**Key `[0]`**:
```
~30]=bracket in key
```
**Key `a~b`**:
```
a~0b=tilde in key
```
<!-- [[[end]]] -->

## Type preservation

Types are preserved using `$type` suffixes:
Expand Down
59 changes: 49 additions & 10 deletions json_flatten.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
flatten() and unflatten()

A pair of functions that can convert an arbitrary JSON object into a
flat name/value pair dictionary and back again, preserving type
flat name/value pair dictionary and back again, preserving type
information and handling both nested lists and nested dictionaries.

For example:
Expand Down Expand Up @@ -33,11 +33,34 @@
"this.other_types.false$bool": "False",
"this.other_types.none$none": "None",
}

Keys containing special characters (., $, [, ~) are escaped using
RFC 6901-style tilde escaping:

~0 = literal ~
~1 = literal .
~2 = literal $
~3 = literal [
"""

import re


def _escape_key(key):
"""Escape special characters in a dictionary key.

Order matters: ~ must be escaped first to avoid double-escaping."""
return key.replace("~", "~0").replace(".", "~1").replace("$", "~2").replace("[", "~3")


def _unescape_key(key):
"""Unescape a previously escaped key segment.

Order matters: ~0 must be decoded last so ~03 doesn't prematurely
become ~ + 3 then [."""
return key.replace("~3", "[").replace("~2", "$").replace("~1", ".").replace("~0", "~")


def _object_to_rows(obj, prefix=None):
rows = []
dot_prefix = prefix and (prefix + ".") or ""
Expand All @@ -46,7 +69,9 @@ def _object_to_rows(obj, prefix=None):
rows.append(((prefix or "") + "$empty", "{}"))
else:
for key, item in obj.items():
rows.extend(_object_to_rows(item, prefix=dot_prefix + key))
rows.extend(
_object_to_rows(item, prefix=dot_prefix + _escape_key(key))
)
elif isinstance(obj, (list, tuple)):
if len(obj) == 0:
rows.append(((prefix or "") + "$emptylist", "[]"))
Expand All @@ -72,7 +97,7 @@ def flatten(obj):


_types_re = re.compile(r".*\$(none|bool|int|float|empty|emptylist)$")
_int_key_re = re.compile(r"\[(\d+)\]")
_int_key_re = re.compile(r"\[(\d+)\]$")


def unflatten(data):
Expand All @@ -86,7 +111,7 @@ def unflatten(data):
current = current[bit]
# Now deal with $type suffixes:
if _types_re.match(lastkey):
lastkey, lasttype = lastkey.rsplit("$", 2)
lastkey, lasttype = lastkey.rsplit("$", 1)
value = {
"int": int,
"float": float,
Expand All @@ -95,20 +120,22 @@ def unflatten(data):
"bool": lambda v: v.lower() == "true",
"none": lambda v: None,
}.get(lasttype, lambda v: v)(value)
# Keep lastkey in escaped form here -- unescaping happens in third pass
# so that [N] detection in second pass isn't confused by literal bracket keys
current[lastkey] = value

# We handle foo.[0].one, foo.[1].two syntax in a second pass,
# by iterating through our structure looking for dictionaries
# where all of the keys are stringified integers
# Second pass: convert dicts where all keys are [N] into lists.
# This works on escaped keys, so real array indices [0] match but
# escaped bracket keys like ~30] do not.
def replace_integer_keyed_dicts_with_lists(obj):
if isinstance(obj, dict):
if obj and all(_int_key_re.match(k) for k in obj):
if obj and all(_int_key_re.fullmatch(k) for k in obj):
return [
i[1]
for i in sorted(
[
(
int(_int_key_re.match(k).group(1)),
int(_int_key_re.fullmatch(k).group(1)),
replace_integer_keyed_dicts_with_lists(v),
)
for k, v in obj.items()
Expand All @@ -126,7 +153,19 @@ def replace_integer_keyed_dicts_with_lists(obj):
return obj

obj = replace_integer_keyed_dicts_with_lists(obj)

# Third pass: unescape all remaining dict keys
def unescape_keys(obj):
if isinstance(obj, dict):
return {_unescape_key(k): unescape_keys(v) for k, v in obj.items()}
elif isinstance(obj, list):
return [unescape_keys(v) for v in obj]
else:
return obj

obj = unescape_keys(obj)

# Handle root units only, e.g. {'$empty': '{}'}
if list(obj.keys()) == [""]:
if isinstance(obj, dict) and list(obj.keys()) == [""]:
return list(obj.values())[0]
return obj
150 changes: 148 additions & 2 deletions test_json_flatten.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
},
),
(
"dollar_signs_that_are_not_type_indicators",
"dollar_signs_escaped",
{
"foo": [
{
Expand All @@ -50,7 +50,7 @@
},
{
"foo.[0].emails.[0]": "bar@example.com",
"foo.[0].phones._$!<home>!$_": "555-555-5555",
"foo.[0].phones._~2!<home>!~2_": "555-555-5555",
},
),
("empty_object", {}, {"$empty": "{}"}),
Expand Down Expand Up @@ -88,3 +88,149 @@ def test_integers_with_gaps_does_not_create_sparse_array():
def test_list_as_base_level_object_rejected_with_error():
    """flatten() must raise TypeError when the top-level object is a list."""
    top_level_list = [{"name": "john"}]
    with pytest.raises(TypeError):
        flatten(top_level_list)


# --- Tests for tilde escaping of special characters in keys (RFC 6901-style) ---


class TestDotInKeys:
    """Issue #1: a literal dot inside a key must survive a round trip."""

    def test_simple_dot_in_key(self):
        original = {"a.b": "value"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_dot_key_distinct_from_nested(self):
        """A dotted key and a nested key must not flatten to the same form."""
        from_dotted_key = flatten({"a.b": "value"})
        from_nested_key = flatten({"a": {"b": "value"}})
        assert from_dotted_key != from_nested_key

    def test_dot_key_flattened_form(self):
        expected = {"a~1b": "value"}
        assert flatten({"a.b": "value"}) == expected

    def test_dot_key_with_type_suffix(self):
        original = {"a.b": 5}
        assert flatten(original) == {"a~1b$int": "5"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_multiple_dots_in_key(self):
        original = {"a.b.c": "value"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_dot_key_nested_in_object(self):
        original = {"outer": {"a.b": "value"}}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original


class TestDollarInKeys:
    """Issue #2: a ``$`` inside a key must not break type-suffix splitting."""

    def test_dollar_in_key_with_int_value(self):
        original = {"my$key": 5}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_dollar_in_key_flattened_form(self):
        expected = {"my~2key$int": "5"}
        assert flatten({"my$key": 5}) == expected

    def test_dollar_in_key_with_string_value(self):
        original = {"my$key": "hello"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_multiple_dollars_in_key(self):
        original = {"a$b$c": 10}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original


class TestAmbiguousTypeSuffix:
    """Issue #3: keys that end in a type-suffix name keep their string values."""

    def test_key_ending_with_dollar_int_string_value(self):
        original = {"price$int": "hello"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_key_ending_with_dollar_none_string_value(self):
        original = {"flag$none": "active"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_key_ending_with_dollar_bool_string_value(self):
        original = {"x$bool": "maybe"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_key_ending_with_dollar_float_string_value(self):
        original = {"val$float": "text"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_key_ending_with_dollar_empty_string_value(self):
        original = {"obj$empty": "not empty"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_key_ending_with_dollar_emptylist_string_value(self):
        original = {"arr$emptylist": "not a list"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original


class TestBracketKeys:
    """Issue #4: dict keys shaped like ``[N]`` must not be read as list indices."""

    def test_bracket_key_round_trip(self):
        original = {"[0]": "value"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_bracket_key_flattened_form(self):
        expected = {"~30]": "value"}
        assert flatten({"[0]": "value"}) == expected

    def test_bracket_key_not_confused_with_list(self):
        """A dict whose keys all look like ``[N]`` must come back as a dict."""
        original = {"[0]": "a", "[1]": "b"}
        round_tripped = unflatten(flatten(original))
        assert isinstance(round_tripped, dict)
        assert round_tripped == original

    def test_bracket_key_nested(self):
        original = {"outer": {"[0]": "value"}}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original


class TestTildeInKeys:
    """Self-consistency: the escape character itself must round-trip in keys."""

    def test_tilde_in_key(self):
        original = {"a~b": "value"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_tilde_in_key_flattened_form(self):
        expected = {"a~0b": "value"}
        assert flatten({"a~b": "value"}) == expected

    def test_tilde_escape_sequence_in_key(self):
        """A key that already looks like an escape sequence must round-trip."""
        original = {"a~1b": "value"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_tilde_with_type_suffix(self):
        original = {"a~b": 5}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original


class TestCombinationEscaping:
    """Keys that mix several special characters at once."""

    def test_dot_and_dollar_in_key(self):
        original = {"a.b$int": "hello"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_all_special_chars_in_key(self):
        original = {"a.b$c[0]~d": "value"}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original

    def test_existing_dollar_sign_test_updated(self):
        """The ``_$!<home>!$_`` key must still round-trip once escaping applies."""
        contact = {
            "emails": ["bar@example.com"],
            "phones": {"_$!<home>!$_": "555-555-5555"},
        }
        original = {"foo": [contact]}
        round_tripped = unflatten(flatten(original))
        assert round_tripped == original