Skip to content

Commit 788d922

Browse files
committed
refactor: copilot documentation for KDE_test.py
1 parent 696dfa4 commit 788d922

File tree

1 file changed

+86
-1
lines changed

1 file changed

+86
-1
lines changed

tests/factors/continuous/KDE_test.py

Lines changed: 86 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,13 @@
1010
df_float = df.astype("float32")
1111

1212

13-
def test_check_type():
13+
def test_check_type() -> None:
14+
"""
15+
Tests that the KDE factor raises a ValueError when the data type of the test dataset
16+
is different from the data type of the training dataset during log-likelihood and
17+
sum-of-log-likelihood (`slogl`) computations.
18+
"""
19+
1420
cpd = pbn.KDE(["A"])
1521
cpd.fit(df)
1622
with pytest.raises(ValueError) as ex:
@@ -30,12 +36,28 @@ def test_check_type():
3036

3137

3238
def test_kde_variables():
    """
    Check that a KDE reports exactly the variables it was built with.

    A KDE is constructed from variable lists of length one through four, and
    ``variables()`` must return each list unchanged, in the same order.
    """
    variable_sets = (
        ["A"],
        ["B", "A"],
        ["C", "A", "B"],
        ["D", "A", "B", "C"],
    )
    for expected in variable_sets:
        kde = pbn.KDE(expected)
        assert kde.variables() == expected
3649

3750

3851
def test_kde_bandwidth():
52+
"""
53+
Tests the bandwidth selection and assignment functionality of the KDE class.
54+
This test verifies:
55+
- That the KDE bandwidth computed using the normal reference rule matches the output of scipy's gaussian_kde with a custom bandwidth method, for various variable sets and sample sizes.
56+
- That the KDE bandwidth computed using Scott's rule matches the output of scipy's gaussian_kde default bandwidth, for various variable sets and sample sizes.
57+
- That the bandwidth attribute of the KDE object can be manually set and correctly reflects the assigned value.
58+
The test is performed for both the float64 (`df`) and float32 (`df_float`) dataframes.
59+
"""
60+
3961
for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
4062
for instances in [50, 1000, 10000]:
4163
npdata = df.loc[:, variables].to_numpy()
@@ -81,6 +103,28 @@ def test_kde_bandwidth():
81103

82104

83105
class UnitaryBandwidth(pbn.BandwidthSelector):
106+
"""
107+
A bandwidth selector that returns the identity matrix as the bandwidth.
108+
This class is a subclass of `pbn.BandwidthSelector` and implements a simple bandwidth selection strategy
109+
where the bandwidth matrix is always the identity matrix of size equal to the number of variables.
110+
Methods
111+
-------
112+
__init__():
113+
Initializes the UnitaryBandwidth selector.
114+
bandwidth(df, variables):
115+
Returns the identity matrix of shape (len(variables), len(variables)) as the bandwidth matrix.
116+
Parameters
117+
----------
118+
df : pandas.DataFrame
119+
The data frame containing the data (not used in this selector).
120+
variables : list
121+
The list of variables for which the bandwidth is to be computed.
122+
Returns
123+
-------
124+
numpy.ndarray
125+
An identity matrix of size equal to the number of variables.
126+
"""
127+
84128
def __init__(self):
85129
pbn.BandwidthSelector.__init__(self)
86130

@@ -89,6 +133,16 @@ def bandwidth(self, df, variables):
89133

90134

91135
def test_kde_new_bandwidth():
136+
"""
137+
Tests the behavior of the KDE class when using the UnitaryBandwidth bandwidth selector.
138+
This test verifies that:
139+
- When fitting a KDE with a single variable, the resulting bandwidth matrix is the 1x1 identity matrix.
140+
- When fitting a KDE with four variables, the resulting bandwidth matrix is the 4x4 identity matrix.
141+
- The behavior is consistent for both the float64 (`df`) and float32 (`df_float`) dataframes.
142+
Assertions:
143+
- The bandwidth matrix after fitting is as expected (identity matrix) for both data types and variable counts.
144+
"""
145+
92146
kde = pbn.KDE(["A"], UnitaryBandwidth())
93147
kde.fit(df)
94148
assert kde.bandwidth == np.eye(1)
@@ -105,6 +159,14 @@ def test_kde_new_bandwidth():
105159

106160

107161
def test_kde_data_type():
162+
"""
163+
Tests the `data_type` method of the KDE factor.
164+
This test verifies that:
165+
- Calling `data_type` before fitting the KDE raises a ValueError with the message "KDE factor not fitted".
166+
- After fitting the KDE with a DataFrame `df`, the returned data type is `pa.float64()`.
167+
- After fitting the KDE with a DataFrame `df_float`, the returned data type is `pa.float32()`.
168+
"""
169+
108170
k = pbn.KDE(["A"])
109171

110172
with pytest.raises(ValueError) as ex:
@@ -118,6 +180,19 @@ def test_kde_data_type():
118180

119181

120182
def test_kde_fit():
183+
"""
184+
Tests the fitting process of the KDE (Kernel Density Estimation) class in the PyBNesian library.
185+
This test verifies that:
186+
- The KDE object is not fitted before calling `fit`.
187+
- After fitting with a subset of the provided DataFrame, the KDE object is marked as fitted.
188+
- The number of training instances and variables in the fitted KDE matches those of a reference `scipy.stats.gaussian_kde` object.
189+
- The test is performed for different combinations of variables and different numbers of training instances, using both float64 (`df`) and float32 (`df_float`) DataFrames.
190+
Tested scenarios:
191+
- Single and multiple variable KDEs.
192+
- Different sample sizes (50, 150, 500).
193+
- Both float64 and float32 data types.
194+
"""
195+
121196
def _test_kde_fit_iter(variables, _df, instances):
122197
cpd = pbn.KDE(variables)
123198
assert not cpd.fitted()
@@ -141,6 +216,16 @@ def _test_kde_fit_iter(variables, _df, instances):
141216

142217

143218
def test_kde_fit_null():
219+
"""
220+
Tests the fitting of the KDE (Kernel Density Estimation) model when input data contains null (NaN) values.
221+
This test verifies that:
222+
- The KDE model is not fitted before calling `fit` and is fitted after.
223+
- The model correctly ignores rows with null values during fitting.
224+
- The number of training instances and variables in the fitted model matches those in a reference `scipy.stats.gaussian_kde` fitted on the same data with nulls removed.
225+
- The computed bandwidth (covariance) of the KDE matches that of the reference implementation.
226+
The test is performed for different combinations of variables and different numbers of training instances, using both float64 and float32 dataframes with randomly inserted NaN values.
227+
"""
228+
144229
def _test_kde_fit_null_iter(variables, _df, instances):
145230
cpd = pbn.KDE(variables)
146231
assert not cpd.fitted()

0 commit comments

Comments
 (0)