From 52149dbc38315a5b36edc2841edcda7765accaf0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 1 Nov 2025 15:01:41 +0000 Subject: [PATCH 1/4] Enable roundtripping nested dtypes through parquet and arrow --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/dtypes/dtypes.py | 66 +++++++++++++++++++++++- pandas/tests/extension/test_arrow.py | 77 ++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 12f522301e121..e0120a445c36a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1221,6 +1221,7 @@ ExtensionArray - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`) - Bug in constructing pandas data structures when passing into ``dtype`` a string of the type followed by ``[pyarrow]`` while PyArrow is not installed would raise ``NameError`` rather than ``ImportError`` (:issue:`57928`) +- Bug in dtype inference when roundtripping nested arrow dtypes like ``list``, ``struct``, ``map`` through pyarrow tables or parquet (:issue:`61529`) - Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`) - Fixed flex arithmetic with :class:`ExtensionArray` operands raising when ``fill_value`` was passed. 
(:issue:`62467`) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 2e3d73edcdf4f..e5f93293a34fd 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2387,6 +2387,10 @@ def construct_from_string(cls, string: str) -> ArrowDtype: except (NotImplementedError, ValueError): # Fall through to raise with nice exception message below pass + binary_pattern = re.compile(r"^fixed_size_binary\[(?P<width>\d+)\]$") + if match := binary_pattern.match(base_type): + byte_width = match.group("width") + return cls(pa.binary(int(byte_width))) raise NotImplementedError( "Passing pyarrow type specific parameters " @@ -2394,9 +2398,69 @@ def construct_from_string(cls, string: str) -> ArrowDtype: "Please construct an ArrowDtype object with a pyarrow_dtype " "instance with specific parameters." ) from err - raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") from err + # match maps + map_pattern = re.compile(r"^map<(?P<key>[^,<>]+),\s(?P<value>[^,<>]+)>$") + # match lists + list_inner_pattern = r"<item:\s(?P<item_type>.+)>$" + list_pattern = re.compile(rf"^list{list_inner_pattern}") + large_list_pattern = re.compile(rf"^large_list{list_inner_pattern}") + # match structs + struct_pattern = re.compile(r"^struct<(?P<fields>.+)>$") + if match := map_pattern.match(base_type): + pa_dtype = pa.map_( + pa.type_for_alias(match.group("key")), + pa.type_for_alias(match.group("value")), + ) + elif match := list_pattern.match(base_type): + pa_dtype = pa.list_( + cls._resolve_inner_types(match.group("item_type") + "[pyarrow]") + ) + elif match := large_list_pattern.match(base_type): + pa_dtype = pa.large_list( + cls._resolve_inner_types(match.group("item_type") + "[pyarrow]") + ) + elif match := struct_pattern.match(base_type): + fields = [] + for name, t in cls._split_struct(match.group("fields")): + field_dtype = cls._resolve_inner_types(t + "[pyarrow]") + fields.append((name, field_dtype)) + pa_dtype = pa.struct(fields) + else: + raise TypeError( + f"'{base_type}' is not a 
valid pyarrow data type." + ) from err return cls(pa_dtype) + @classmethod + def _resolve_inner_types(cls, string: str) -> pa.DataType: + if string == "string[pyarrow]": + return pa.string() + else: + return cls.construct_from_string(string).pyarrow_dtype + + @staticmethod + def _split_struct(fields: str): + field_pattern = re.compile(r"^\s*(?P<name>[^:]+):\s*(?P<type>.+)\s*$") + + parts, start, depth = [], 0, 0 + for i, char in enumerate(fields): + if char in "<": + depth += 1 + elif char in ">": + depth -= 1 + elif char == "," and depth == 0: + parts.append(fields[start:i].strip()) + start = i + 1 + + if start < len(fields): + parts.append(fields[start:].strip()) + + for field in parts: + if match := field_pattern.match(field): + yield match.group("name"), match.group("type") + else: + raise TypeError(f"Could not parse struct field definition: '{field}'") + # TODO(arrow#33642): This can be removed once supported by pyarrow @classmethod def _parse_temporal_dtype_string(cls, string: str) -> ArrowDtype: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c1e01bbbe57a0..c4ea4f28d7a34 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3780,6 +3780,83 @@ def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size): ) +def test_roundtrip_of_nested_types(): + df = pd.DataFrame( + { + "list_int": pd.Series( + [[1, 2, 3], [4, 5]], dtype=ArrowDtype(pa.list_(pa.int64())) + ), + "list_string": pd.Series( + [["a", "b"], ["c"]], dtype=ArrowDtype(pa.list_(pa.string())) + ), + "large_list_int": pd.Series( + [[1, 2], [3, 4, 5]], dtype=ArrowDtype(pa.large_list(pa.int64())) + ), + "large_list_string": pd.Series( + [["x", "y"], ["z"]], dtype=ArrowDtype(pa.large_list(pa.string())) + ), + "list_map": pd.Series( + [[{"a": 1.0, "b": 2.0}], [{"c": 3.0}]], + dtype=ArrowDtype(pa.list_(pa.map_(pa.string(), pa.float64()))), + ), + "large_list_map": pd.Series( + [[{"x": 1.5}], [{"y": 2.5, "z": 3.5}]], + 
dtype=ArrowDtype(pa.large_list(pa.map_(pa.string(), pa.float64()))), + ), + "map_int_float": pd.Series( + [{1: 1.1, 2: 2.2}, {3: 3.3}], + dtype=ArrowDtype(pa.map_(pa.int64(), pa.float64())), + ), + "struct_simple": pd.Series( + [{"f1": 1, "f2": 1.5}, {"f1": 2, "f2": 2.5}], + dtype=ArrowDtype(pa.struct([("f1", pa.int64()), ("f2", pa.float64())])), + ), + "struct_nested": pd.Series( + [ + { + "outer_int": 10, + "inner": {"int_list": [1, 2, 3], "text": "hello"}, + }, + {"outer_int": 20, "inner": {"int_list": [4, 5], "text": "world"}}, + ], + dtype=ArrowDtype( + pa.struct( + [ + ("outer_int", pa.int64()), + ( + "inner", + pa.struct( + [ + ("int_list", pa.list_(pa.int64())), + ("text", pa.string()), + ] + ), + ), + ] + ) + ), + ), + "binary_16": pd.Series( + [b"0123456789abcdef", b"fedcba9876543210"], + dtype=ArrowDtype(pa.binary(16)), + ), + "list_struct": pd.Series( + [ + [{"id": 1, "value": 10.5}, {"id": 2, "value": 20.5}], + [{"id": 3, "value": 30.5}], + ], + dtype=ArrowDtype( + pa.list_(pa.struct([("id", pa.int64()), ("value", pa.float64())])) + ), + ), + } + ) + + table = pa.Table.from_pandas(df) + result = table.to_pandas() + tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize("type_name", ["string", "binary", "large_string"]) def test_arrow_dtype_itemsize_variable_width(type_name): # GH 57948 From 668b119aff485e21bb95f18a566288aca11e4783 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 1 Nov 2025 17:21:09 +0000 Subject: [PATCH 2/4] Fixup --- pandas/tests/extension/test_arrow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c4ea4f28d7a34..70ee8f0468202 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3780,6 +3780,7 @@ def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size): ) +@pytest.mark.filterwarnings("ignore::Pandas4Warning") # min versions build def test_roundtrip_of_nested_types(): df = 
pd.DataFrame( { From cecfe20f3acb0f5dbc207cd7e19206a7ac1a25ba Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 1 Nov 2025 22:37:28 +0000 Subject: [PATCH 3/4] Fixup --- pandas/tests/extension/test_arrow.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 70ee8f0468202..b8357f53486ba 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3780,7 +3780,6 @@ def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size): ) -@pytest.mark.filterwarnings("ignore::Pandas4Warning") # min versions build def test_roundtrip_of_nested_types(): df = pd.DataFrame( { @@ -3853,8 +3852,13 @@ def test_roundtrip_of_nested_types(): } ) - table = pa.Table.from_pandas(df) - result = table.to_pandas() + if pa_version_under19p0: + with tm.assert_produces_warning(Pandas4Warning): + table = pa.Table.from_pandas(df) + result = table.to_pandas() + else: + table = pa.Table.from_pandas(df) + result = table.to_pandas() tm.assert_frame_equal(result, df) From 87e6841d78c2af25fff2dd8fadce521be7d1abb2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 1 Nov 2025 23:06:38 +0000 Subject: [PATCH 4/4] Update test_arrow.py --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index b8357f53486ba..d0c889f1a27c0 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3853,7 +3853,7 @@ def test_roundtrip_of_nested_types(): ) if pa_version_under19p0: - with tm.assert_produces_warning(Pandas4Warning): + with tm.assert_produces_warning(Pandas4Warning, check_stacklevel=False): table = pa.Table.from_pandas(df) result = table.to_pandas() else: