From 8bedc45a85b5f2fc38b87431c5a7818fb8aec081 Mon Sep 17 00:00:00 2001 From: Aldo Date: Wed, 2 Feb 2022 13:19:40 +0100 Subject: [PATCH 1/6] adding early stop for sequential feature selection --- .../sequential_feature_selector.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/mlxtend/feature_selection/sequential_feature_selector.py b/mlxtend/feature_selection/sequential_feature_selector.py index 41de9ce86..febdee4b8 100644 --- a/mlxtend/feature_selection/sequential_feature_selector.py +++ b/mlxtend/feature_selection/sequential_feature_selector.py @@ -114,6 +114,14 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin): n_jobs : int (default: 1) The number of CPUs to use for evaluating different feature subsets in parallel. -1 means 'all CPUs'. + early_stop : bool (default: False) + Determines whether to prematurely stop execution if the score does not + improve after a number of iterations set by the `early_stop_rounds` + parameter. + early_stop_rounds : int (default 3) + Used when early_stop is True, it determines the number of iterations + after which, if no performance boost has been seen, execution is + stopped. pre_dispatch : int, or string (default: '2*n_jobs') Controls the number of jobs that get dispatched during parallel execution if `n_jobs > 1` or `n_jobs=-1`. @@ -178,6 +186,8 @@ def __init__(self, estimator, k_features=1, forward=True, floating=False, verbose=0, scoring=None, cv=5, n_jobs=1, + early_stop=False, + early_stop_rounds=3, pre_dispatch='2*n_jobs', clone_estimator=True, fixed_features=None): @@ -201,6 +211,14 @@ def __init__(self, estimator, k_features=1, self.verbose = verbose self.clone_estimator = clone_estimator + if not isinstance(early_stop_rounds, int) or early_stop_rounds < 0: + raise ValueError('Number of early stopping round should be ' + 'an integer value greater than or equal to 0.' + 'Got %d' % early_stop_rounds) + + self.early_stop = early_stop + self.early_stop_rounds = early_stop_rounds + if fixed_features is not None: if isinstance(self.k_features, int) and \ self.k_features <= len(fixed_features): @@ -424,6 +442,8 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): } best_subset = None k_score = 0 + best_score = -np.inf + early_stop_count = self.early_stop_rounds try: while k != k_to_select: @@ -550,6 +570,18 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): X) raise KeyboardInterrupt + # early stop + if self.early_stop and k != k_to_select: + if k_score <= best_score: + early_stop_count -= 1 + if early_stop_count == 0: + print('Performances not improved for %d rounds. ' + 'Stopping now!' % self.early_stop_rounds) + break + else: + early_stop_count = self.early_stop_rounds + best_score = k_score + except KeyboardInterrupt: self.interrupted_ = True sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...') From fff216316bafd1682a30928c8352610b44d3dd4e Mon Sep 17 00:00:00 2001 From: Aldo Date: Wed, 2 Feb 2022 14:20:39 +0100 Subject: [PATCH 2/6] test and refactoring --- .../sequential_feature_selector.py | 10 ++-- .../tests/test_sequential_feature_selector.py | 48 +++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/mlxtend/feature_selection/sequential_feature_selector.py b/mlxtend/feature_selection/sequential_feature_selector.py index febdee4b8..eef003b76 100644 --- a/mlxtend/feature_selection/sequential_feature_selector.py +++ b/mlxtend/feature_selection/sequential_feature_selector.py @@ -115,12 +115,12 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin): The number of CPUs to use for evaluating different feature subsets in parallel. -1 means 'all CPUs'. early_stop : bool (default: False) - Determines whether to prematurely stop execution if the score does not - improve after a number of iterations set by the `early_stop_rounds` + Determines whether to prematurely stop execution if the score does not + improve after a number of iterations set by the `early_stop_rounds` parameter. early_stop_rounds : int (default 3) - Used when early_stop is True, it determines the number of iterations - after which, if no performance boost has been seen, execution is + Used when `early_stop` is True, it determines the number of iterations + after which, if no performance boost has been seen, execution is stopped. pre_dispatch : int, or string (default: '2*n_jobs') Controls the number of jobs that get dispatched @@ -186,7 +186,7 @@ def __init__(self, estimator, k_features=1, forward=True, floating=False, verbose=0, scoring=None, cv=5, n_jobs=1, - early_stop=False, + early_stop=False, early_stop_rounds=3, pre_dispatch='2*n_jobs', clone_estimator=True, diff --git a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py index c679532a2..61944122c 100644 --- a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py +++ b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py @@ -983,3 +983,51 @@ def test_custom_feature_names(): assert sfs1.k_feature_names_ == ('sepal width', 'petal width') assert sfs1.subsets_[2]['feature_names'] == ('sepal width', 'petal width') + + +def test_run_forward_earlystop(): + np.random.seed(0) + iris = load_iris() + X_iris = iris.data + y_iris = iris.target + X_iris_with_noise = np.concatenate( + (X_iris, + np.random.randn(X_iris.shape[0], X_iris.shape[1])), + axis=1) + knn = KNeighborsClassifier() + esr = 2 + sfs = SFS(estimator=knn, + k_features=X_iris_with_noise.shape[1], + forward=True, + floating=False, + early_stop=True, + early_stop_rounds=esr, + verbose=0) + sfs.fit(X_iris_with_noise, y_iris) + assert len(sfs.subsets_) < X_iris_with_noise.shape[1] + assert all([sfs.subsets_[list(sfs.subsets_)[-esr-1]]['avg_score'] + >= sfs.subsets_[i]['avg_score'] for i in sfs.subsets_.keys()]) + + +def test_run_backward_earlystop(): + np.random.seed(0) + iris = load_iris() + X_iris = iris.data + y_iris = iris.target + X_iris_with_noise = np.concatenate( + (X_iris, + np.random.randn(X_iris.shape[0], X_iris.shape[1])), + axis=1) + knn = KNeighborsClassifier() + esr = 2 + sfs = SFS(estimator=knn, + k_features=1, + forward=False, + floating=False, + early_stop=True, + early_stop_rounds=esr, + verbose=0) + sfs.fit(X_iris_with_noise, y_iris) + assert len(sfs.subsets_) > 1 + assert all([sfs.subsets_[list(sfs.subsets_)[-esr-1]]['avg_score'] + >= sfs.subsets_[i]['avg_score'] for i in sfs.subsets_.keys()]) From f99fec4c2c16d5d35db8df8d869b812a4eae1d91 Mon Sep 17 00:00:00 2001 From: Aldo Date: Wed, 2 Feb 2022 16:48:51 +0100 Subject: [PATCH 3/6] improvements: https://github.com/rasbt/mlxtend/pull/886#issuecomment-1027987459 --- .../sequential_feature_selector.py | 21 ++++++++----------- .../tests/test_sequential_feature_selector.py | 4 ++-- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/mlxtend/feature_selection/sequential_feature_selector.py b/mlxtend/feature_selection/sequential_feature_selector.py index eef003b76..db9811d17 100644 --- a/mlxtend/feature_selection/sequential_feature_selector.py +++ b/mlxtend/feature_selection/sequential_feature_selector.py @@ -114,14 +114,11 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin): n_jobs : int (default: 1) The number of CPUs to use for evaluating different feature subsets in parallel. -1 means 'all CPUs'. - early_stop : bool (default: False) - Determines whether to prematurely stop execution if the score does not - improve after a number of iterations set by the `early_stop_rounds` - parameter. - early_stop_rounds : int (default 3) - Used when `early_stop` is True, it determines the number of iterations - after which, if no performance boost has been seen, execution is - stopped. + early_stop_rounds : int (default 0) + Enable early stopping criterion when > 0, this value determines the + number of iterations after which, if no performance boost has been + seen, execution is stopped. + Used only when `k_features == 'best'` or `k_features == 'parsimonious'` pre_dispatch : int, or string (default: '2*n_jobs') Controls the number of jobs that get dispatched during parallel execution if `n_jobs > 1` or `n_jobs=-1`. @@ -186,8 +183,7 @@ def __init__(self, estimator, k_features=1, forward=True, floating=False, verbose=0, scoring=None, cv=5, n_jobs=1, - early_stop=False, - early_stop_rounds=3, + early_stop_rounds=0, pre_dispatch='2*n_jobs', clone_estimator=True, fixed_features=None): @@ -216,7 +212,6 @@ def __init__(self, estimator, k_features=1, 'an integer value greater than or equal to 0.' 'Got %d' % early_stop_rounds) - self.early_stop = early_stop self.early_stop_rounds = early_stop_rounds if fixed_features is not None: @@ -571,7 +566,9 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): raise KeyboardInterrupt # early stop - if self.early_stop and k != k_to_select: + if self.early_stop_rounds \ + and k != k_to_select \ + and self.k_features in {'best', 'parsimonious'}: if k_score <= best_score: early_stop_count -= 1 if early_stop_count == 0: diff --git a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py index 61944122c..f30981ced 100644 --- a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py +++ b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py @@ -997,7 +997,7 @@ def test_run_forward_earlystop(): knn = KNeighborsClassifier() esr = 2 sfs = SFS(estimator=knn, - k_features=X_iris_with_noise.shape[1], + k_features='best', forward=True, floating=False, early_stop=True, @@ -1021,7 +1021,7 @@ def test_run_backward_earlystop(): knn = KNeighborsClassifier() esr = 2 sfs = SFS(estimator=knn, - k_features=1, + k_features='best', forward=False, floating=False, early_stop=True, From 4db4c971cb2917dcd293a059f4ad12c4126ddef2 Mon Sep 17 00:00:00 2001 From: Aldo Date: Wed, 2 Feb 2022 16:52:12 +0100 Subject: [PATCH 4/6] refactoring --- mlxtend/feature_selection/sequential_feature_selector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlxtend/feature_selection/sequential_feature_selector.py b/mlxtend/feature_selection/sequential_feature_selector.py index db9811d17..acc23271d 100644 --- a/mlxtend/feature_selection/sequential_feature_selector.py +++ b/mlxtend/feature_selection/sequential_feature_selector.py @@ -567,8 +567,8 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): # early stop if self.early_stop_rounds \ - and k != k_to_select \ - and self.k_features in {'best', 'parsimonious'}: + and k != k_to_select \ + and self.k_features in {'best', 'parsimonious'}: if k_score <= best_score: early_stop_count -= 1 if early_stop_count == 0: From d5595a936e13faefd03fb037eb5c7c9c173308d4 Mon Sep 17 00:00:00 2001 From: Aldo Date: Thu, 3 Feb 2022 11:37:09 +0100 Subject: [PATCH 5/6] fix https://github.com/rasbt/mlxtend/pull/886#discussion_r797950840 and tests --- .../feature_selection/sequential_feature_selector.py | 2 +- .../tests/test_sequential_feature_selector.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/mlxtend/feature_selection/sequential_feature_selector.py b/mlxtend/feature_selection/sequential_feature_selector.py index acc23271d..b0cf6b7e2 100644 --- a/mlxtend/feature_selection/sequential_feature_selector.py +++ b/mlxtend/feature_selection/sequential_feature_selector.py @@ -210,7 +210,7 @@ def __init__(self, estimator, k_features=1, if not isinstance(early_stop_rounds, int) or early_stop_rounds < 0: raise ValueError('Number of early stopping round should be ' 'an integer value greater than or equal to 0.' - 'Got %d' % early_stop_rounds) + 'Got %s' % early_stop_rounds) self.early_stop_rounds = early_stop_rounds diff --git a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py index f30981ced..2636decff 100644 --- a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py +++ b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py @@ -978,7 +978,7 @@ def test_custom_feature_names(): n_jobs=1) sfs1 = sfs1.fit(X, y, custom_feature_names=( - 'sepal length', 'sepal width', 'petal length', 'petal width')) + 'sepal length', 'sepal width', 'petal length', 'petal width')) assert sfs1.k_feature_idx_ == (1, 3) assert sfs1.k_feature_names_ == ('sepal width', 'petal width') assert sfs1.subsets_[2]['feature_names'] == ('sepal width', @@ -1000,13 +1000,12 @@ def test_run_forward_earlystop(): k_features='best', forward=True, floating=False, - early_stop=True, early_stop_rounds=esr, verbose=0) sfs.fit(X_iris_with_noise, y_iris) assert len(sfs.subsets_) < X_iris_with_noise.shape[1] - assert all([sfs.subsets_[list(sfs.subsets_)[-esr-1]]['avg_score'] - >= sfs.subsets_[i]['avg_score'] for i in sfs.subsets_.keys()]) + assert all([sfs.k_score_ >= sfs.subsets_[i]['avg_score'] + for i in sfs.subsets_]) def test_run_backward_earlystop(): @@ -1024,10 +1023,9 @@ def test_run_backward_earlystop(): k_features='best', forward=False, floating=False, - early_stop=True, early_stop_rounds=esr, verbose=0) sfs.fit(X_iris_with_noise, y_iris) assert len(sfs.subsets_) > 1 - assert all([sfs.subsets_[list(sfs.subsets_)[-esr-1]]['avg_score'] - >= sfs.subsets_[i]['avg_score'] for i in sfs.subsets_.keys()]) + assert all([sfs.k_score_ >= sfs.subsets_[i]['avg_score'] + for i in sfs.subsets_]) From c6e9ca8fee5233be0f5dc1966953127fe76713f3 Mon Sep 17 00:00:00 2001 From: Aldo Date: Mon, 7 Feb 2022 16:06:55 +0100 Subject: [PATCH 6/6] fix https://github.com/rasbt/mlxtend/pull/886#discussion_r797954274 --- .../feature_selection/sequential_feature_selector.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mlxtend/feature_selection/sequential_feature_selector.py b/mlxtend/feature_selection/sequential_feature_selector.py index b0cf6b7e2..25ea7cb56 100644 --- a/mlxtend/feature_selection/sequential_feature_selector.py +++ b/mlxtend/feature_selection/sequential_feature_selector.py @@ -179,6 +179,7 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin): http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/ """ + def __init__(self, estimator, k_features=1, forward=True, floating=False, verbose=0, scoring=None, @@ -398,6 +399,12 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): select_in_range = False k_to_select = self.k_features + if self.early_stop_rounds and isinstance(self.k_features, str) and\ + not self.k_features in {'best', 'parsimonious'}: + raise ValueError('Early stopping is allowed only when `k_features`' + ' is "best" or "parsimonious". Got' + ' `k_features=%s`' % self.k_features) + orig_set = set(range(X_.shape[1])) n_features = X_.shape[1] @@ -566,9 +573,7 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): raise KeyboardInterrupt # early stop - if self.early_stop_rounds \ - and k != k_to_select \ - and self.k_features in {'best', 'parsimonious'}: + if self.early_stop_rounds and k != k_to_select: if k_score <= best_score: early_stop_count -= 1 if early_stop_count == 0: