1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -1221,6 +1221,7 @@ ExtensionArray
- Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
- Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`)
- Bug in constructing pandas data structures when passing into ``dtype`` a string of the type followed by ``[pyarrow]`` while PyArrow is not installed would raise ``NameError`` rather than ``ImportError`` (:issue:`57928`)
- Bug in dtype inference for nested :class:`ArrowDtype` types such as ``list``, ``struct``, and ``map`` when roundtripping through pyarrow tables or parquet (:issue:`61529`)
- Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`)
- Fixed flex arithmetic with :class:`ExtensionArray` operands raising when ``fill_value`` was passed (:issue:`62467`)

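A minimal sketch (not part of the diff) of the roundtrip this entry describes, mirroring the `test_roundtrip_of_nested_types` test added to `pandas/tests/extension/test_arrow.py` below; with the fix, a nested `ArrowDtype` is expected to survive the trip through a pyarrow table:

```python
import pandas as pd
import pyarrow as pa
from pandas import ArrowDtype

# A column with a nested (list-of-int64) Arrow dtype.
df = pd.DataFrame(
    {
        "list_int": pd.Series(
            [[1, 2, 3], [4, 5]], dtype=ArrowDtype(pa.list_(pa.int64()))
        )
    }
)

# pa.Table.from_pandas stores the dtype string ("list<item: int64>[pyarrow]") in the
# table's pandas metadata; to_pandas reconstructs the pandas dtype from that string.
# Before this fix the nested string could not be parsed back, breaking the roundtrip.
table = pa.Table.from_pandas(df)
result = table.to_pandas()
assert result["list_int"].dtype == df["list_int"].dtype
```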
66 changes: 65 additions & 1 deletion pandas/core/dtypes/dtypes.py
@@ -2387,16 +2387,80 @@ def construct_from_string(cls, string: str) -> ArrowDtype:
                except (NotImplementedError, ValueError):
                    # Fall through to raise with nice exception message below
                    pass
                binary_pattern = re.compile(r"^fixed_size_binary\[(?P<width>\d+)\]$")
                if match := binary_pattern.match(base_type):
                    byte_width = match.group("width")
                    return cls(pa.binary(int(byte_width)))

                raise NotImplementedError(
                    "Passing pyarrow type specific parameters "
                    f"({has_parameters.group()}) in the string is not supported. "
                    "Please construct an ArrowDtype object with a pyarrow_dtype "
                    "instance with specific parameters."
                ) from err
            raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") from err
            # match maps
            map_pattern = re.compile(r"^map<(?P<key>[^,<>]+),\s(?P<value>[^,<>]+)>$")
            # match lists
            list_inner_pattern = r"<item:\s(?P<item_type>.+)>$"
            list_pattern = re.compile(rf"^list{list_inner_pattern}")
            large_list_pattern = re.compile(rf"^large_list{list_inner_pattern}")
            # match structs
            struct_pattern = re.compile(r"^struct<(?P<fields>.+)>$")
            if match := map_pattern.match(base_type):
                pa_dtype = pa.map_(
                    pa.type_for_alias(match.group("key")),
                    pa.type_for_alias(match.group("value")),
                )
            elif match := list_pattern.match(base_type):
                pa_dtype = pa.list_(
                    cls._resolve_inner_types(match.group("item_type") + "[pyarrow]")
                )
            elif match := large_list_pattern.match(base_type):
                pa_dtype = pa.large_list(
                    cls._resolve_inner_types(match.group("item_type") + "[pyarrow]")
                )
            elif match := struct_pattern.match(base_type):
                fields = []
                for name, t in cls._split_struct(match.group("fields")):
                    field_dtype = cls._resolve_inner_types(t + "[pyarrow]")
                    fields.append((name, field_dtype))
                pa_dtype = pa.struct(fields)
            else:
                raise TypeError(
                    f"'{base_type}' is not a valid pyarrow data type."
                ) from err
        return cls(pa_dtype)

    @classmethod
    def _resolve_inner_types(cls, string: str) -> pa.DataType:
        if string == "string[pyarrow]":
            return pa.string()
        else:
            return cls.construct_from_string(string).pyarrow_dtype

    @staticmethod
    def _split_struct(fields: str):
        field_pattern = re.compile(r"^\s*(?P<name>[^:]+):\s*(?P<type>.+)\s*$")

        parts, start, depth = [], 0, 0
        for i, char in enumerate(fields):
            if char == "<":
                depth += 1
            elif char == ">":
                depth -= 1
            elif char == "," and depth == 0:
                parts.append(fields[start:i].strip())
                start = i + 1

        if start < len(fields):
            parts.append(fields[start:].strip())

        for field in parts:
            if match := field_pattern.match(field):
                yield match.group("name"), match.group("type")
            else:
                raise TypeError(f"Could not parse struct field definition: '{field}'")

    # TODO(arrow#33642): This can be removed once supported by pyarrow
    @classmethod
    def _parse_temporal_dtype_string(cls, string: str) -> ArrowDtype:
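For reference, a short sketch (not part of the diff) of how the string parsing added above is expected to resolve nested type specs. The spellings follow pyarrow's `str()` output for each type, which is what ends up in a table's pandas metadata and what `construct_from_string` receives with a `[pyarrow]` suffix:

```python
import pyarrow as pa
from pandas import ArrowDtype

ArrowDtype.construct_from_string("list<item: int64>[pyarrow]")
# -> ArrowDtype(pa.list_(pa.int64()))
ArrowDtype.construct_from_string("large_list<item: string>[pyarrow]")
# -> ArrowDtype(pa.large_list(pa.string()))
ArrowDtype.construct_from_string("map<string, float64>[pyarrow]")
# -> ArrowDtype(pa.map_(pa.string(), pa.float64()))

# _split_struct tracks "<"/">" nesting depth, so the comma inside the inner map
# does not split the outer struct's fields:
ArrowDtype.construct_from_string("struct<a: int64, b: map<string, int64>>[pyarrow]")
# -> ArrowDtype(pa.struct([("a", pa.int64()), ("b", pa.map_(pa.string(), pa.int64()))]))

ArrowDtype.construct_from_string("fixed_size_binary[16][pyarrow]")
# -> ArrowDtype(pa.binary(16))
```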
82 changes: 82 additions & 0 deletions pandas/tests/extension/test_arrow.py
@@ -3780,6 +3780,88 @@ def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size):
)


def test_roundtrip_of_nested_types():
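    # GH 61529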
    df = pd.DataFrame(
        {
            "list_int": pd.Series(
                [[1, 2, 3], [4, 5]], dtype=ArrowDtype(pa.list_(pa.int64()))
            ),
            "list_string": pd.Series(
                [["a", "b"], ["c"]], dtype=ArrowDtype(pa.list_(pa.string()))
            ),
            "large_list_int": pd.Series(
                [[1, 2], [3, 4, 5]], dtype=ArrowDtype(pa.large_list(pa.int64()))
            ),
            "large_list_string": pd.Series(
                [["x", "y"], ["z"]], dtype=ArrowDtype(pa.large_list(pa.string()))
            ),
            "list_map": pd.Series(
                [[{"a": 1.0, "b": 2.0}], [{"c": 3.0}]],
                dtype=ArrowDtype(pa.list_(pa.map_(pa.string(), pa.float64()))),
            ),
            "large_list_map": pd.Series(
                [[{"x": 1.5}], [{"y": 2.5, "z": 3.5}]],
                dtype=ArrowDtype(pa.large_list(pa.map_(pa.string(), pa.float64()))),
            ),
            "map_int_float": pd.Series(
                [{1: 1.1, 2: 2.2}, {3: 3.3}],
                dtype=ArrowDtype(pa.map_(pa.int64(), pa.float64())),
            ),
            "struct_simple": pd.Series(
                [{"f1": 1, "f2": 1.5}, {"f1": 2, "f2": 2.5}],
                dtype=ArrowDtype(pa.struct([("f1", pa.int64()), ("f2", pa.float64())])),
            ),
            "struct_nested": pd.Series(
                [
                    {
                        "outer_int": 10,
                        "inner": {"int_list": [1, 2, 3], "text": "hello"},
                    },
                    {"outer_int": 20, "inner": {"int_list": [4, 5], "text": "world"}},
                ],
                dtype=ArrowDtype(
                    pa.struct(
                        [
                            ("outer_int", pa.int64()),
                            (
                                "inner",
                                pa.struct(
                                    [
                                        ("int_list", pa.list_(pa.int64())),
                                        ("text", pa.string()),
                                    ]
                                ),
                            ),
                        ]
                    )
                ),
            ),
            "binary_16": pd.Series(
                [b"0123456789abcdef", b"fedcba9876543210"],
                dtype=ArrowDtype(pa.binary(16)),
            ),
            "list_struct": pd.Series(
                [
                    [{"id": 1, "value": 10.5}, {"id": 2, "value": 20.5}],
                    [{"id": 3, "value": 30.5}],
                ],
                dtype=ArrowDtype(
                    pa.list_(pa.struct([("id", pa.int64()), ("value", pa.float64())]))
                ),
            ),
        }
    )

    if pa_version_under19p0:
        with tm.assert_produces_warning(Pandas4Warning, check_stacklevel=False):
            table = pa.Table.from_pandas(df)
            result = table.to_pandas()
    else:
        table = pa.Table.from_pandas(df)
        result = table.to_pandas()
    tm.assert_frame_equal(result, df)


@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"])
def test_arrow_dtype_itemsize_variable_width(type_name):
    # GH 57948