 import time
 from scipy.misc import logsumexp
 from scipy.optimize import minimize
+from sklearn.preprocessing import OneHotEncoder

 from ..base import BaseEstimator, TransformerMixin
 from ..preprocessing import LabelEncoder
 from ..externals.six import integer_types


-class NeighborhoodComponentAnalysis(BaseEstimator, TransformerMixin):
-    """Neighborhood Component Analysis
+class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin):
+    """Neighborhood Components Analysis

     Parameters
     ----------
@@ -98,16 +99,16 @@ class NeighborhoodComponentAnalysis(BaseEstimator, TransformerMixin):

     Examples
     --------
-    >>> from sklearn.neighbors.nca import NeighborhoodComponentAnalysis
+    >>> from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis
     >>> from sklearn.neighbors import KNeighborsClassifier
     >>> from sklearn.datasets import load_iris
     >>> from sklearn.model_selection import train_test_split
     >>> X, y = load_iris(return_X_y=True)
     >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
     ... stratify=y, test_size=0.7, random_state=42)
-    >>> nca = NeighborhoodComponentAnalysis(None, random_state=42)
+    >>> nca = NeighborhoodComponentsAnalysis(None, random_state=42)
     >>> nca.fit(X_train, y_train) # doctest: +ELLIPSIS
-    NeighborhoodComponentAnalysis(...)
+    NeighborhoodComponentsAnalysis(...)
     >>> knn = KNeighborsClassifier(n_neighbors=3)
     >>> knn.fit(X_train, y_train) # doctest: +ELLIPSIS
     KNeighborsClassifier(...)
@@ -123,23 +124,21 @@ class NeighborhoodComponentAnalysis(BaseEstimator, TransformerMixin):
     Neighborhood Component Analysis (NCA) is a machine learning algorithm for
     metric learning. It learns a linear transformation in a supervised fashion
     to improve the classification accuracy of a stochastic nearest neighbors
-    rule in the new space.
-
-    .. warning::
-
-        As NCA is optimizing a non-convex objective function, it will
-        likely end up in a local optimum. Several runs with independent random
-        init might be necessary to get a good convergence.
+    rule in the transformed space.

     References
     ----------
     .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov.
            "Neighbourhood Components Analysis". Advances in Neural Information
            Processing Systems. 17, 513-520, 2005.
            http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf
+
+    .. [2] Wikipedia entry on Neighborhood Components Analysis
+           https://en.wikipedia.org/wiki/Neighbourhood_components_analysis
+
     """

-    def __init__(self, n_features_out=None, init='identity', max_iter=50,
+    def __init__(self, n_features_out=None, init='pca', max_iter=50,
                  tol=1e-5, callback=None, store_opt_result=False, verbose=0,
                  random_state=None):

@@ -167,7 +166,7 @@ def fit(self, X, y):
         Returns
         -------
         self : object
-            returns a trained NeighborhoodComponentAnalysis model.
+            returns a trained NeighborhoodComponentsAnalysis model.
         """

         # Verify inputs X and y and NCA parameters, and transform a copy if
@@ -182,7 +181,8 @@ def fit(self, X, y):

         # Compute arrays that stay fixed during optimization:
         # mask for fast lookup of same-class samples
-        masks = _make_masks(y_valid)
+        masks = OneHotEncoder(sparse=False,
+                              dtype=bool).fit_transform(y_valid[:, np.newaxis])
         # pairwise differences
         diffs = X_valid[:, np.newaxis] - X_valid[np.newaxis]

@@ -193,7 +193,7 @@ def fit(self, X, y):
         disp = self.verbose - 2 if self.verbose > 1 else -1
         optimizer_params = {'method': 'L-BFGS-B',
                             'fun': self._loss_grad_lbfgs,
-                            'args': (X_valid, y_valid, diffs, masks),
+                            'args': (X_valid, y_valid, diffs, masks, -1.0),
                             'jac': True,
                             'x0': transformation,
                             'tol': self.tol,
@@ -401,7 +401,7 @@ def _callback(self, transformation):
         self.n_iter_ += 1

     def _loss_grad_lbfgs(self, transformation, X, y, diffs,
-                         masks):
+                         masks, sign=1.0):
         """Compute the loss and the loss gradient w.r.t. ``transformation``.

         Parameters
@@ -448,23 +448,29 @@ def _loss_grad_lbfgs(self, transformation, X, y, diffs,
         gradient = np.zeros(transformation.shape)
         X_embedded = transformation.dot(X.T).T

-        # for every sample, compute its contribution to loss and gradient
+        # for every sample x_i, compute its contribution to loss and gradient
         for i in range(X.shape[0]):
+            # compute distances to x_i in the embedded space
             diff_embedded = X_embedded[i] - X_embedded
-            sum_of_squares = np.einsum('ij,ij->i', diff_embedded,
-                                       diff_embedded)
-            sum_of_squares[i] = np.inf
-            soft = np.exp(-sum_of_squares - logsumexp(-sum_of_squares))
-            ci = masks[:, y[i]]
-            p_i_j = soft[ci]
-            not_ci = np.logical_not(ci)
+            dist_embedded = np.einsum('ij,ij->i', diff_embedded,
+                                      diff_embedded)
+            dist_embedded[i] = np.inf
+
+            # compute exponentiated distances (use the log-sum-exp trick to
+            # avoid numerical instabilities)
+            exp_dist_embedded = np.exp(-dist_embedded -
+                                       logsumexp(-dist_embedded))
+            ci = masks[:, y[i]]  # samples that are in the same class as x_i
+            p_i_j = exp_dist_embedded[ci]
             diff_ci = diffs[i, ci, :]
-            diff_not_ci = diffs[i, not_ci, :]
+            diff_not_ci = diffs[i, ~ci, :]
             sum_ci = diff_ci.T.dot(
                 (p_i_j[:, np.newaxis] * diff_embedded[ci, :]))
-            sum_not_ci = diff_not_ci.T.dot((soft[not_ci][:, np.newaxis] *
-                                            diff_embedded[not_ci, :]))
-            p_i = np.sum(p_i_j)
+            sum_not_ci = diff_not_ci.T.dot(
+                (exp_dist_embedded[~ci][:, np.newaxis] *
+                 diff_embedded[~ci, :]))
+            # probability of x_i being correctly classified
+            p_i = np.sum(p_i_j)
             gradient += 2 * (p_i * (sum_ci.T + sum_not_ci.T) - sum_ci.T)
             loss += p_i

@@ -475,7 +481,7 @@ def _loss_grad_lbfgs(self, transformation, X, y, diffs,
                                  loss, t_funcall))
             sys.stdout.flush()

-        return -loss, -gradient.ravel()
+        return sign * loss, sign * gradient.ravel()


 ##########################
@@ -538,8 +544,9 @@ def _make_masks(y):
     masks: array, shape (n_samples, n_classes)
         One-hot encoding of ``y``.
     """
-
-    n = y.shape[0]
-    masks = np.zeros((n, y.max() + 1))
-    masks[np.arange(n), y] = [1]
-    return masks.astype(bool)
+    masks = OneHotEncoder(sparse=False,
+                          dtype=bool).fit_transform(y[:, np.newaxis])
+    # n = y.shape[0]
+    # masks = np.zeros((n, y.max() + 1), dtype=bool)
+    # masks[np.arange(n), y] = [True]
+    return masks
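
For reference, the per-sample computation that the new `_loss_grad_lbfgs` body performs can be sketched in plain NumPy. This is a minimal standalone illustration with made-up toy data (`X`, `y`, `transformation` are assumptions, not the estimator's internals); it only reproduces the softmax neighbour probabilities, stabilised with the log-sum-exp trick, and the per-sample probability `p_i` that the loop in the diff accumulates into the loss.

# Minimal sketch (toy data, illustration only) of the per-sample softmax
# probabilities computed inside _loss_grad_lbfgs above.
import numpy as np
from scipy.special import logsumexp  # the diff imports this from scipy.misc

rng = np.random.RandomState(0)
X = rng.randn(5, 3)            # 5 toy samples with 3 features
y = np.array([0, 0, 1, 1, 1])  # toy labels, already encoded as 0..n_classes-1
transformation = np.eye(3)     # identity transformation, for illustration

X_embedded = X.dot(transformation.T)
masks = y[:, np.newaxis] == np.arange(y.max() + 1)  # boolean one-hot class masks

loss = 0.0
for i in range(X.shape[0]):
    diff_embedded = X_embedded[i] - X_embedded
    dist_embedded = np.einsum('ij,ij->i', diff_embedded, diff_embedded)
    dist_embedded[i] = np.inf          # a sample is never its own neighbour
    # softmax over negative squared distances, stabilised with log-sum-exp
    exp_dist_embedded = np.exp(-dist_embedded - logsumexp(-dist_embedded))
    ci = masks[:, y[i]]                # samples in the same class as x_i
    p_i = exp_dist_embedded[ci].sum()  # probability of classifying x_i correctly
    loss += p_i

print(loss)  # NCA maximises this sum; the diff passes sign=-1.0 so L-BFGS-B minimises it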
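
The diff also replaces the hand-rolled one-hot encoding in `_make_masks` with `OneHotEncoder`. A quick sanity check on toy labels (illustration only) showing that the two routes produce the same boolean mask; note that recent scikit-learn releases spell the dense-output argument `sparse_output` where the diff uses the older `sparse` keyword.

import numpy as np
from sklearn.preprocessing import OneHotEncoder

y = np.array([1, 0, 2, 1, 0])   # toy labels, already LabelEncoder-encoded

# the removed helper, reproduced for comparison
n = y.shape[0]
masks_loop = np.zeros((n, y.max() + 1), dtype=bool)
masks_loop[np.arange(n), y] = True

# the encoder route taken by the diff (newer sklearn spelling: sparse_output)
masks_ohe = OneHotEncoder(sparse_output=False,
                          dtype=bool).fit_transform(y[:, np.newaxis])

print(np.array_equal(masks_loop, masks_ohe))   # True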