diff --git a/packages/graphrag/graphrag/query/indexer_adapters.py b/packages/graphrag/graphrag/query/indexer_adapters.py
index 7119ad842..a6ded1f9f 100644
--- a/packages/graphrag/graphrag/query/indexer_adapters.py
+++ b/packages/graphrag/graphrag/query/indexer_adapters.py
@@ -221,5 +221,5 @@ def _filter_under_community_level(
 ) -> pd.DataFrame:
     return cast(
         "pd.DataFrame",
-        df[df.level <= community_level],
+        df[(df.level <= community_level) | df.level.isna()],
     )
diff --git a/tests/unit/query/test_filter_community_level.py b/tests/unit/query/test_filter_community_level.py
new file mode 100644
index 000000000..5410edc3f
--- /dev/null
+++ b/tests/unit/query/test_filter_community_level.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""Tests for _filter_under_community_level preserving NaN-level nodes."""
+
+import numpy as np
+import pandas as pd
+
+from graphrag.query.indexer_adapters import _filter_under_community_level
+
+
+def test_filter_preserves_nan_level_nodes():
+    """Nodes with level=NaN should not be discarded by the filter.
+
+    Regression test for issue #1808 where isolated nodes without a community
+    assignment (level=None) were incorrectly dropped.
+    """
+    df = pd.DataFrame({
+        "id": ["a", "b", "c", "d"],
+        "level": [0, 1, 2, np.nan],
+        "community": [1, 2, 3, np.nan],
+    })
+
+    result = _filter_under_community_level(df, community_level=1)
+
+    # Should keep level 0, 1 (<=1) and NaN (unassigned)
+    assert len(result) == 3
+    assert set(result["id"].tolist()) == {"a", "b", "d"}
+
+
+def test_filter_excludes_higher_level_nodes():
+    """Nodes with level > community_level should be excluded."""
+    df = pd.DataFrame({
+        "id": ["a", "b", "c"],
+        "level": [0, 2, 3],
+        "community": [1, 2, 3],
+    })
+
+    result = _filter_under_community_level(df, community_level=1)
+
+    assert len(result) == 1
+    assert result["id"].tolist() == ["a"]