radip/MDN_clustering.py at master · azyner/radip · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import numpy as np
import scipy
import sklearn.cluster
import pandas as pd
import copy

def KL_divergence():
    return None


def bhattacharyya_distance():
    return None


def euclid_distance(a, b):
    return np.sqrt(np.square(a[1]-b[1]) + np.square(a[2]-b[2]))

# This implementation is done as a BFS search. That is, for each depth in the tree, it will cluster those mixtures,
# add them to a path, and kill / split paths as necessary.
def cluster_MDN_into_sets(MDN_model_output, mix_weight_threshold=0.5, eps=1.0, min_samples=1):
    np.set_printoptions(precision=1)
    # In hindsight this should have been done via a tree structure. I mean, it is, just the representation is poor
    mdn = MDN_model_output[0]
    old_clusterer = None
    clusterer = sklearn.cluster.DBSCAN(eps=eps, min_samples=min_samples, metric=euclid_distance, algorithm='brute')
    cluster_centroids = []  # Dims: groups, timestep, [x, y]
    MDN_groups = []         # Dims: groups, timesteps, [MDN PARAMS]
    MDN_group_padding = []
    live_MDN_groups = []
    try:
        for t in range(len(mdn)):
            model_out_this_timestep = mdn[t]
            # Delete all mixtures that have a weight smaller than threshhold * (1/num_mixes)
            strong_mixes = model_out_this_timestep[:, 0] > ((mix_weight_threshold) / model_out_this_timestep.shape[0])
            model_out_this_timestep = model_out_this_timestep[strong_mixes, :]
            if t == 0:
                groupings = clusterer.fit_predict(model_out_this_timestep)
                if (groupings == -1).all():
                    raise AssertionError
                timestep_groups = []
                for group_idx in range(max(groupings) + 1):
                    # Create a list that only contains the Mixes for group number IDX
                    # Add them to the timetep set.
                    timestep_groups.append(model_out_this_timestep[groupings==group_idx])
                MDN_groups = [[g] for g in timestep_groups]
                live_MDN_groups = range(len(timestep_groups))
            else:
                # We run the grouper again, and then match each group to its closest in t-1. If there are two matches, the
                # tree has split, and so we must copy out history.
                timestep_groups = [[] for i in live_MDN_groups] #range(len(MDN_groups))]

                groupings = clusterer.fit_predict(model_out_this_timestep)
                if (groupings == -1).all():
                    raise AssertionError
                if max(groupings) > 0:
                    ideas = None
                temp_groups_this_timestep = []
                for mdn_output_group_idx in range(max(groupings + 1)):
                    # Collect all groups at that idx
                    this_group_idxs = np.array(range(len(groupings)))[groupings == mdn_output_group_idx]
                    temp_groups_this_timestep.append(model_out_this_timestep[this_group_idxs])

                # Now I have this timestep grouped, I need to find their closest group from t-1 (MDN_groups)
                # If there are two with the same group, the closer wins, and the further spawns a new group, copying
                # its history
                # So now I need a map between temp_groups and MDN t-1, and distances

                ideas = None
                temp_groups_closest = []
                temp_groups_distances = []
                for i in range(len(temp_groups_this_timestep)):
                    temp_group_centroid = np.average(temp_groups_this_timestep[i][:, 1:3],
                                                     weights=temp_groups_this_timestep[i][:, 0], axis=0)
                    closest_group = None
                    closest_distance = 999999999
                    for MDN_group_idx in live_MDN_groups:
                        try:
                            MDN_group_centroid = np.average(MDN_groups[MDN_group_idx][t-1][:, 1:3],
                                                        weights=MDN_groups[MDN_group_idx][t-1][:, 0], axis=0)
                        except IndexError:
                            ideas = None
                        this_distance = scipy.spatial.distance.euclidean(temp_group_centroid,MDN_group_centroid)
                        if closest_group is None:
                            closest_group = MDN_group_idx
                            closest_distance = this_distance
                            continue
                        elif this_distance < closest_distance:
                            closest_group = MDN_group_idx
                            closest_distance = this_distance
                    temp_groups_closest.append(closest_group)
                    temp_groups_distances.append(closest_distance)

                temp_groups_closest = np.array(temp_groups_closest)
                temp_groups_distances = np.array(temp_groups_distances)

                # We now have the map of the closest parent to these children. But only 1 child per parent, unless a new
                # track has spawned.

                used_parent_idxs = []
                live_idx_vals_to_remove = []
                for idx in range(len(live_MDN_groups)):
                    temp_group_idxs_that_match_parent = np.array(range(len(temp_groups_closest)))[temp_groups_closest == live_MDN_groups[idx]]
                    # Because I used the index to remap to a limited range of idxs, I have to un map it again
                    if len(temp_groups_distances[temp_group_idxs_that_match_parent]) < 1:
                        # If a track dies
                        live_idx_vals_to_remove.append(live_MDN_groups[idx])
                        continue
                    closest_parent_group_idx = temp_group_idxs_that_match_parent[np.argmin(temp_groups_distances[temp_group_idxs_that_match_parent])]
                    # Now that I have found the closest group, I can assign it
                    timestep_groups[idx] = temp_groups_this_timestep[closest_parent_group_idx]
                    used_parent_idxs.append(closest_parent_group_idx)

                ################ PATH DIES
                # It is undefined to remove when iterating over a list
                for idx_val in live_idx_vals_to_remove:
                    idx = live_MDN_groups.index(idx_val)  # First, find the mapping_idx in the live_MDN_groups list (NOT a regular idx)
                    del(live_MDN_groups[idx])             # Then delete those from the list
                    del(timestep_groups[idx])             # As a group is deleted in the master tree, the addition tree must also be deleted
                ################ /PATH DIES

                unused_temp_group_idxs = np.delete(np.array(range(len(temp_groups_closest))), used_parent_idxs)
                if len(unused_temp_group_idxs) > 0:
                    ######## PATH SPAWNS
                    for unused_temp_group_idx in unused_temp_group_idxs:
                        MDN_group_idx += 1
                        MDN_groups.append(copy.deepcopy(MDN_groups[temp_groups_closest[unused_temp_group_idx]]))
                        timestep_groups.append(temp_groups_this_timestep[unused_temp_group_idx])
                        live_MDN_groups.append(len(MDN_groups)-1)

                for i in range(len(timestep_groups)):
                    # if i not in live_MDN_groups:
                    #     continue
                    # if len(timestep_groups[i]) == 0:
                    #     continue
                    try:
                        MDN_groups[live_MDN_groups[i]].append(np.array(timestep_groups[i]))
                    except IndexError:
                        ideas = None
                if len(MDN_groups) > 6:
                    WTFisGOINGon = None
                if len(MDN_groups[0]) > t+1:
                    ohno = None

        # Return 2 things: The raw MDNS and centroid clusters for simplicity
        centroid_groups = []
        centroid_weights = []
        for path in MDN_groups:
            simple_path = []
            path_weights = 0
            for vals_at_timestep in path:
                simple_path.append(np.average(vals_at_timestep[:, 1:3], weights=vals_at_timestep[:, 0], axis=0))
                path_weights += np.sum(vals_at_timestep[:, 0])

            # Normalize according to length of path
            # This avoid bias where longer paths sum more weight and are therefore more important
            #path_weights /= len(path)
            centroid_weights.append(path_weights)
            centroid_groups.append(simple_path)
        #Normalize
        centroid_weights = np.array(centroid_weights) / np.sum(centroid_weights)
    except (AssertionError, ValueError) as e:
        # Oh no! At least one timestep had no groupings! This could be because threshold or min points is set too high
        # The only thing to do now is return the strongest single track as a sort of placeholder solution
        print "Warning: The clustering algorithm failed to find a solution. Either threshold or min_samples is too high"
        MDN_groups = None
        for MDN_timestep_idx in range(len(MDN_model_output[0])):
            MDN_timestep = MDN_model_output[0,MDN_timestep_idx]
            strongest_MDN_idx = np.argmax(MDN_timestep[:, 0])
            strongest_MDN = MDN_timestep[strongest_MDN_idx]
            if MDN_timestep_idx == 0:
                MDN_groups = [strongest_MDN]
            else:
                MDN_groups.append(strongest_MDN)
        MDN_groups = [np.array(MDN_groups)]
        centroid_weights = [1.0]
        centroid_groups = [MDN_groups[0][:, 1:3]]

    return MDN_groups, centroid_groups, centroid_weights