1515
1616
1717class AHC (HypModel ):
18+ """Agglomerative Hierarchical Clustering class.
19+
20+ Attributes:
21+ method: linkage method to calculate the distance between a new agglomerated
22+ cluster and the rest of clusters.
23+ This can be ["average", "single", "complete", "weighted", "centroid", "median", "ward"].
24+ See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
25+ metric: indicates the type of metric used to calculate the input scores.
26+ It can be: "llr" (log-likelihood ratios), "prob" (probabilities), "distance": (distance metric).
27+ """
28+
1829 def __init__ (self , method = "average" , metric = "llr" , ** kwargs ):
1930 super ().__init__ (** kwargs )
2031 self .method = method
@@ -23,6 +34,15 @@ def __init__(self, method="average", metric="llr", **kwargs):
2334 self .flat_clusters = None
2435
2536 def fit (self , x , mask = None ):
37+ """Performs the clustering.
38+ It stores the AHC tree in the Z property of the object.
39+
40+ Args:
41+ x: input score matrix (num_samples, num_samples).
42+ It will use the upper triangular matrix only.
43+ mask: boolean mask where False in position i,j means that
44+ nodes i and j should not be merged.
45+ """
2646
2747 if mask is not None :
2848 x = copy (x )
@@ -44,12 +64,27 @@ def fit(self, x, mask=None):
4464 self .Z = linkage (scores , method = self .method , metric = self .metric )
4565
def get_flat_clusters(self, t, criterion="threshold"):
    """Cuts the AHC tree into flat clusters.

    Args:
      t: threshold value when criterion is "threshold"; otherwise the
        number of clusters to keep.
      criterion: "threshold" delegates to get_flat_clusters_from_thr.
        Any other value (e.g. "num_clusters") delegates to
        get_flat_clusters_from_num_clusters.

    Returns:
      Whatever the selected helper returns for t (the cluster
      assignments of the clustered samples).
    """
    use_threshold = criterion == "threshold"
    if not use_threshold:
        # Every criterion other than "threshold" is handled as a cluster count.
        return self.get_flat_clusters_from_num_clusters(t)

    return self.get_flat_clusters_from_thr(t)
5183
5284 def get_flat_clusters_from_num_clusters (self , num_clusters ):
85+ """Computes the flat clusters from the AHC tree using
86+ the num_clusters criterion.
87+ """
5388 N = self .Z .shape [0 ] + 1
5489 num_clusters = min (N , num_clusters )
5590 p_idx = N - num_clusters
@@ -67,14 +102,23 @@ def get_flat_clusters_from_num_clusters(self, num_clusters):
67102 return flat_clusters
68103
def get_flat_clusters_from_thr(self, thr):
    """Cuts the AHC tree into flat clusters using a threshold.

    The third column of self.Z is compared against thr: for the "llr"
    and "prob" metrics the rows with value >= thr are selected, for any
    other metric the rows with value <= thr are selected. The number of
    clusters is the number of rows of self.Z plus one, minus the number
    of selected rows.

    Args:
      thr: threshold applied to the third column of self.Z.

    Returns:
      The result of get_flat_clusters_from_num_clusters for the
      derived number of clusters.
    """
    merge_scores = self.Z[:, 2]
    if self.metric in ("llr", "prob"):
        # Similarity-like scores: select merges scoring at least thr.
        selected = merge_scores >= thr
    else:
        # Distance-like scores: select merges at most thr apart.
        selected = merge_scores <= thr

    num_samples = self.Z.shape[0] + 1
    return self.get_flat_clusters_from_num_clusters(num_samples - np.sum(selected))
76114
77- def compute_flat_clusters ():
115+ def compute_flat_clusters (self ):
116+ """Computes the flat clusters for all possible number of clusters
117+
118+ Stores in self.flat_clusters:
119+ numpy matrix (num_samples, num_samples) where row i contains the
120+ cluster assignments for the case of choosing num_samples - i clusters.
121+ """
78122 N = self .Z .shape [0 ] + 1
79123 flat_clusters = np .zeros ((N , N ), dtype = int )
80124 flat_clusters [0 ] = np .arange (N , dtype = int )
@@ -86,20 +130,29 @@ def compute_flat_clusters():
86130 flat_clusters [i + 1 ][segm_idx ] = N + i
87131
88132 for i in range (1 , N ):
89- _ , flat_clusters [i ] = np .unique (flat_clusters , return_inverse = True )
133+ _ , flat_clusters [i ] = np .unique (flat_clusters [ i ] , return_inverse = True )
90134 self .flat_clusters = flat_clusters
91135
92- def evaluate_impurity_det (self , labels_true ):
def evaluate_homogeneity_completeness_tradeoff(self, true_labels):
    """Evaluates the homogeneity versus completeness curve.

    Homogeneity: each cluster contains only members of a single class
    (cluster purity).
    Completeness: all members of a given class are assigned to the same
    cluster (class purity).

    If self.flat_clusters has not been computed yet, it is computed
    first by calling compute_flat_clusters. Both scores are then
    evaluated for every row of self.flat_clusters.

    Args:
      true_labels: true cluster labels.

    Returns:
      homogeneity vector, one value per row of self.flat_clusters.
      completeness vector, one value per row of self.flat_clusters.
    """
    if self.flat_clusters is None:
        self.compute_flat_clusters()

    num_partitions = self.flat_clusters.shape[0]
    homogeneity = np.zeros((num_partitions,), dtype=float_cpu())
    completeness = np.zeros((num_partitions,), dtype=float_cpu())

    # Each row of flat_clusters is one candidate partition of the samples.
    for row, assignment in enumerate(self.flat_clusters):
        homogeneity[row] = homogeneity_score(true_labels, assignment)
        completeness[row] = completeness_score(true_labels, assignment)

    return homogeneity, completeness
0 commit comments