From a5f0b1dd803e7dd380cc8778b035e2efa4a493ac Mon Sep 17 00:00:00 2001 From: Georgios Date: Wed, 29 Oct 2025 22:45:38 +0000 Subject: [PATCH 1/5] fix sort_index using level name on MultiIndex --- pandas/core/sorting.py | 28 ++++++- pandas/tests/frame/methods/test_sort_index.py | 83 +++++++++++++++++++ 2 files changed, 107 insertions(+), 4 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 930704e6f62f4..c1e65d2d4a5b1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -57,7 +57,7 @@ def get_indexer_indexer( target: Index, - level: Level | list[Level] | None, + level: Level | list[Level] | None, # can level actually be a list here? ascending: list[bool] | bool, kind: SortKind, na_position: NaPosition, @@ -87,7 +87,19 @@ def get_indexer_indexer( # error: Incompatible types in assignment (expression has type # "Union[ExtensionArray, ndarray[Any, Any], Index, Series]", variable has # type "Index") + + # before: + # MultiIndex([('a', 'top10'), + # ('a', 'top2')], + # names=['A', 'B']) target = ensure_key_mapped(target, key, levels=level) # type: ignore[assignment] + # # after + # MultiIndex([('a', 1), + # ('a', 0)], + # names=['A', None]) + # the big problem is that the name is lost as well, + # but with the new change I preserve it + target = target._sort_levels_monotonic() if level is not None: @@ -531,11 +543,15 @@ def _ensure_key_mapped_multiindex( level_iter = [level] else: level_iter = level - sort_levels: range | set = {index._get_level_number(lev) for lev in level_iter} else: sort_levels = range(index.nlevels) + # breakpoint() # the loops through the levels + # for the levels to be sorted, it applies the key function + # (uses the number, not the name) + # it returns the indexeer: ensure_key_mapped( + # index._get_level_values(1), key) = Index([1, 0], dtype='int64') mapped = [ ( ensure_key_mapped(index._get_level_values(level), key) @@ -569,19 +585,23 @@ def ensure_key_mapped( return values if isinstance(values, ABCMultiIndex): + # redirects to special MultiIndex handler return _ensure_key_mapped_multiindex(values, key, level=levels) result = key(values.copy()) if len(result) != len(values): raise ValueError( - "User-provided `key` function must not change the shape of the array." + "User-provided `key` bfunction must not change the shape of the array." ) try: if isinstance( values, Index ): # convert to a new Index subclass, not necessarily the same - result = Index(result, tupleize_cols=False) + # preserve the original name when creating the new Index + result = Index( + result, tupleize_cols=False, name=getattr(values, "name", None) + ) else: # try to revert to original type otherwise type_of_values = type(values) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 1a631e760208a..8b089ffe93833 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -1,3 +1,4 @@ +from natsort import index_natsorted import numpy as np import pytest @@ -943,6 +944,88 @@ def test_sort_index_multiindex_sort_remaining(self, ascending): tm.assert_frame_equal(result, expected) + def test_sort_multi_index_sort_by_level_name(self): + # GH#62361 + + df = DataFrame( + [[1, 2], [3, 4]], + columns=MultiIndex.from_product( + [["a"], ["top10", "top2"]], names=("A", "B") + ), + ) + + expected = DataFrame( + [[2, 1], [4, 3]], + columns=MultiIndex.from_product( + [["a"], ["top2", "top10"]], names=("A", "B") + ), + ) + + sorted_df = df.sort_index( + axis=1, level="B", key=lambda x: np.argsort(index_natsorted(x)) + ) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_multi_index_sort_by_level_name_2(self): + # GH#62361 + + df = DataFrame( + [[1, 2], [3, 4]], + columns=MultiIndex.from_tuples( + [("alpha10", "top10"), ("alpha3", "top2")], names=("A", "B") + ), + ) + + expected = DataFrame( + [[2, 1], [4, 3]], + columns=MultiIndex.from_tuples( + [("alpha3", "top2"), ("alpha10", "top10")], names=("A", "B") + ), + ) + + sorted_df = df.sort_index( + axis=1, level=0, key=lambda x: np.argsort(index_natsorted(x)) + ) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = df.sort_index( + axis=1, level="A", key=lambda x: np.argsort(index_natsorted(x)) + ) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = df.sort_index( + axis=1, level=1, key=lambda x: np.argsort(index_natsorted(x)) + ) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = df.sort_index( + axis=1, level="B", key=lambda x: np.argsort(index_natsorted(x)) + ) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = df.sort_index( + axis=1, level=[0, 1], key=lambda x: np.argsort(index_natsorted(x)) + ) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = df.sort_index( + axis=1, level=[1, 0], key=lambda x: np.argsort(index_natsorted(x)) + ) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = df.sort_index( + axis=1, level=[1, "A"], key=lambda x: np.argsort(index_natsorted(x)) + ) + tm.assert_frame_equal(sorted_df, expected) + + # repetition does not matter + sorted_df = df.sort_index( + axis=1, + level=["A", "B", 0, 1, "B"], + key=lambda x: np.argsort(index_natsorted(x)), + ) + tm.assert_frame_equal(sorted_df, expected) + def test_sort_index_with_sliced_multiindex(): # GH 55379 From eb000884d0d04d3c4ce865a3842e547f89962ec3 Mon Sep 17 00:00:00 2001 From: Georgios Date: Wed, 29 Oct 2025 23:39:50 +0000 Subject: [PATCH 2/5] update tests, remove comments --- pandas/core/sorting.py | 18 +---- pandas/tests/frame/methods/test_sort_index.py | 65 ++++++++++--------- 2 files changed, 37 insertions(+), 46 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index c1e65d2d4a5b1..c13fab4adb9cf 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -88,17 +88,7 @@ def get_indexer_indexer( # "Union[ExtensionArray, ndarray[Any, Any], Index, Series]", variable has # type "Index") - # before: - # MultiIndex([('a', 'top10'), - # ('a', 'top2')], - # names=['A', 'B']) target = ensure_key_mapped(target, key, levels=level) # type: ignore[assignment] - # # after - # MultiIndex([('a', 1), - # ('a', 0)], - # names=['A', None]) - # the big problem is that the name is lost as well, - # but with the new change I preserve it target = target._sort_levels_monotonic() @@ -547,11 +537,6 @@ def _ensure_key_mapped_multiindex( else: sort_levels = range(index.nlevels) - # breakpoint() # the loops through the levels - # for the levels to be sorted, it applies the key function - # (uses the number, not the name) - # it returns the indexeer: ensure_key_mapped( - # index._get_level_values(1), key) = Index([1, 0], dtype='int64') mapped = [ ( ensure_key_mapped(index._get_level_values(level), key) @@ -585,13 +570,12 @@ def ensure_key_mapped( return values if isinstance(values, ABCMultiIndex): - # redirects to special MultiIndex handler return _ensure_key_mapped_multiindex(values, key, level=levels) result = key(values.copy()) if len(result) != len(values): raise ValueError( - "User-provided `key` bfunction must not change the shape of the array." + "User-provided `key` function must not change the shape of the array." ) try: diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 8b089ffe93833..6cc6c5bb7c6dc 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -944,7 +944,7 @@ def test_sort_index_multiindex_sort_remaining(self, ascending): tm.assert_frame_equal(result, expected) - def test_sort_multi_index_sort_by_level_name(self): + def test_sort_index_multiindex_by_level_name(self): # GH#62361 df = DataFrame( @@ -970,61 +970,68 @@ def test_sort_multi_index_sort_by_level_name_2(self): # GH#62361 df = DataFrame( - [[1, 2], [3, 4]], + [[1, 2, 3], [4, 5, 6]], columns=MultiIndex.from_tuples( - [("alpha10", "top10"), ("alpha3", "top2")], names=("A", "B") + [("a10", "b12"), ("a2", "b17"), ("a2", "b4")], names=("A", "B") ), ) - expected = DataFrame( - [[2, 1], [4, 3]], + expected_A = DataFrame( + [[2, 3, 1], [5, 6, 4]], + columns=MultiIndex.from_tuples( + [("a2", "b17"), ("a2", "b4"), ("a10", "b12")], names=("A", "B") + ), + ) + expected_B = DataFrame( + [[3, 1, 2], [6, 4, 5]], columns=MultiIndex.from_tuples( - [("alpha3", "top2"), ("alpha10", "top10")], names=("A", "B") + [("a2", "b4"), ("a10", "b12"), ("a2", "b17")], names=("A", "B") ), ) sorted_df = df.sort_index( axis=1, level=0, key=lambda x: np.argsort(index_natsorted(x)) ) - tm.assert_frame_equal(sorted_df, expected) + tm.assert_frame_equal(sorted_df, expected_A) sorted_df = df.sort_index( axis=1, level="A", key=lambda x: np.argsort(index_natsorted(x)) ) - tm.assert_frame_equal(sorted_df, expected) + tm.assert_frame_equal(sorted_df, expected_A) sorted_df = df.sort_index( axis=1, level=1, key=lambda x: np.argsort(index_natsorted(x)) ) - tm.assert_frame_equal(sorted_df, expected) + tm.assert_frame_equal(sorted_df, expected_B) sorted_df = df.sort_index( axis=1, level="B", key=lambda x: np.argsort(index_natsorted(x)) ) - tm.assert_frame_equal(sorted_df, expected) + tm.assert_frame_equal(sorted_df, expected_B) + # actually, only 1 element of list matters for sorting (2nd is ignored) sorted_df = df.sort_index( axis=1, level=[0, 1], key=lambda x: np.argsort(index_natsorted(x)) ) - tm.assert_frame_equal(sorted_df, expected) - - sorted_df = df.sort_index( - axis=1, level=[1, 0], key=lambda x: np.argsort(index_natsorted(x)) - ) - tm.assert_frame_equal(sorted_df, expected) - - sorted_df = df.sort_index( - axis=1, level=[1, "A"], key=lambda x: np.argsort(index_natsorted(x)) - ) - tm.assert_frame_equal(sorted_df, expected) - - # repetition does not matter - sorted_df = df.sort_index( - axis=1, - level=["A", "B", 0, 1, "B"], - key=lambda x: np.argsort(index_natsorted(x)), - ) - tm.assert_frame_equal(sorted_df, expected) + tm.assert_frame_equal(sorted_df, expected_A) + + # sorted_df = df.sort_index( + # axis=1, level=[1, 0], key=lambda x: np.argsort(index_natsorted(x)) + # ) + # tm.assert_frame_equal(sorted_df, expected_B) + + # sorted_df = df.sort_index( + # axis=1, level=[1, "A"], key=lambda x: np.argsort(index_natsorted(x)) + # ) + # tm.assert_frame_equal(sorted_df, expected_B) + + # # repetition does not matter + # sorted_df = df.sort_index( + # axis=1, + # level=["A", "B", 0, 1, "B"], + # key=lambda x: np.argsort(index_natsorted(x)), + # ) + # tm.assert_frame_equal(sorted_df, expected_A) def test_sort_index_with_sliced_multiindex(): From 8a237edbd1e35a73539c53c3b1c3e9ab70acb5af Mon Sep 17 00:00:00 2001 From: Georgios Date: Wed, 29 Oct 2025 23:41:48 +0000 Subject: [PATCH 3/5] remove empty lines --- pandas/core/sorting.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index c13fab4adb9cf..3349a13d7f0bc 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -87,9 +87,7 @@ def get_indexer_indexer( # error: Incompatible types in assignment (expression has type # "Union[ExtensionArray, ndarray[Any, Any], Index, Series]", variable has # type "Index") - target = ensure_key_mapped(target, key, levels=level) # type: ignore[assignment] - target = target._sort_levels_monotonic() if level is not None: From ae88c18ba4d5382b1732c9f77835836640612446 Mon Sep 17 00:00:00 2001 From: Georgios Date: Wed, 29 Oct 2025 23:46:54 +0000 Subject: [PATCH 4/5] rename tes --- pandas/tests/frame/methods/test_sort_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 6cc6c5bb7c6dc..056c10f7a1dd0 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -966,7 +966,7 @@ def test_sort_index_multiindex_by_level_name(self): ) tm.assert_frame_equal(sorted_df, expected) - def test_sort_multi_index_sort_by_level_name_2(self): + def test_sort_index_multiindex_by_level_name_2(self): # GH#62361 df = DataFrame( From d08a1cefcfd5cf31c424dddd59a5cbf4808a5e8c Mon Sep 17 00:00:00 2001 From: Georgios Date: Wed, 29 Oct 2025 23:50:04 +0000 Subject: [PATCH 5/5] note in comment --- pandas/tests/frame/methods/test_sort_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 056c10f7a1dd0..1ee7d3a91f41b 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -1,4 +1,4 @@ -from natsort import index_natsorted +from natsort import index_natsorted # should we import this? or change test? import numpy as np import pytest