[MRG + 1] Labels of clustering should start at 0 or -1 if noise (scikit-learn#10015)

albertcthomas · agramfort · commit 1f7fa760d4a3 · 2017-10-27T10:11:14.000+02:00
* test labels of clustering should start at 0 or -1 if noise

* take into account agramfort's comment

* fix test
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -1051,20 +1051,25 @@ def check_clustering(name, clusterer_orig):
     assert_in(pred.dtype, [np.dtype('int32'), np.dtype('int64')])
     assert_in(pred2.dtype, [np.dtype('int32'), np.dtype('int64')])
 
+    # Add noise to X to test the possible values of the labels
+    rng = np.random.RandomState(7)
+    X_noise = np.concatenate([X, rng.uniform(low=-3, high=3, size=(5, 2))])
+    labels = clusterer.fit_predict(X_noise)
+
     # There should be at least one sample in every cluster. Equivalently
     # labels_ should contain all the consecutive values between its
     # min and its max.
-    pred_sorted = np.unique(pred)
-    assert_array_equal(pred_sorted, np.arange(pred_sorted[0],
-                                              pred_sorted[-1] + 1))
+    labels_sorted = np.unique(labels)
+    assert_array_equal(labels_sorted, np.arange(labels_sorted[0],
+                                                labels_sorted[-1] + 1))
 
-    # labels_ should be greater than -1
-    assert_greater_equal(pred_sorted[0], -1)
-    # labels_ should be less than n_clusters - 1
+    # Labels are expected to start at 0 (no noise) or -1 (if noise)
+    assert_true(labels_sorted[0] in [0, -1])
+    # Labels should be less than or equal to n_clusters - 1
     if hasattr(clusterer, 'n_clusters'):
         n_clusters = getattr(clusterer, 'n_clusters')
-        assert_greater_equal(n_clusters - 1, pred_sorted[-1])
-    # else labels_ should be less than max(labels_) which is necessarily true
+        assert_greater_equal(n_clusters - 1, labels_sorted[-1])
+    # else labels should be at most max(labels_), which is necessarily true
 
 
 @ignore_warnings(category=DeprecationWarning)