LChange22/cluster.py at main · umilISLab/LChange22 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
import types
import warnings
import numpy as np
from tqdm import tqdm
from sklearn.metrics import pairwise
from sklearn.cluster import AffinityPropagation


def pairwise_similarity(X: np.array, Y: np.array = None, affinity: str = 'cosine'):
    '''
    Compute affinity between samples in X and Y
    Args:
        X: Input data
        Y(object, optional, default=None): Input data. If None, the output will be the pairwise similarities between all samples in X
        affinity(object, optional, default='cosine'): the measure of similarity. Cosine Similarity is set as default, but you can pass your own function
    Returns:
        Returns similarity between samples in X and Y
    '''
    Y = X if Y is None else Y

    if affinity == 'cosine':
        return pairwise.cosine_similarity(X, Y)

    elif isinstance(affinity, types.FunctionType):
        # TODO: speed-up
        return np.array([[affinity(x, y) for y in Y] for x in tqdm(X, leave=True, position=0)])
    else:
        raise Exception('You must define a similarity measure')


class IncrementalAffinityPropagation:

    def __init__(self, damping:float=0.5, max_iter:int=200, convergence_iter:int=15,
                 preference:object=None, affinity:object='cosine', verbose:bool=False):
        '''Affinity Propagation Clustering of data.

        Args:
            preference(object, optional, default=None): array-like of shape (n_samples,) or float, default=None
                Preferences for each point - points with larger values of
                preferences are more likely to be chosen as exemplars. The number of
                exemplars, i.e. of clusters, is influenced by the input preferences
                value. If the preferences are not passed as arguments, they will be
                set to the median of the input similarities (resulting in a moderate
                number of clusters). For a smaller amount of clusters, this can be set
                to the minimum value of the similarities.
            damping(float, optional, default=0.5): damping factor
            affinity(object, optional, default=0.5): similarity measure for compute similarity. By default, cosine similarity. You can provide your own function.
            convergence_iter(int, optional, default=15): Number of iterations with no change in the number of estimated clusters that stops the convergence.
            max_iter(int, optional, default=200): Maximum number of iterations
            damping(float, optional, default=0.5): Damping factor between 0.5 and 1.
            verbose(bool, optional, default=False): The verbosity level.
        '''

        self.damping = damping
        self.max_iter = max_iter
        self.convergence_iter = convergence_iter
        self.preference = preference
        self.verbose = verbose

        self.affinity = affinity

        # time step index of data arriving
        self.time_tag = 0

        # keep in memory data point
        self._X = None

        # keep in memory previous labels and clusters
        self.labels = dict()
        self.centroids = dict()

    def __step(self, ind:int, tmp) -> tuple:
        '''
        This is meant to return the new Availability (A) and Repsonsibility (R) matrices for all the data points

        Args:
            tmp(object): array-like of shape (n_samples, n_samples), Intermediate results
            ind(int): tmp index

        Returns:
            R(object): array-like of shape (n_samples, n_samples). Responsibility matrix
            A(object): array-like of shape (n_samples, n_samples). Availability matrix
        '''
        N, N = tmp.shape  # == self.R.shape == self.N.shape
        old_R, old_A = self.R, self.A

        # R UPDATE STEP - from sklearn
        np.add(old_A, self.S, tmp)
        I = np.argmax(tmp, axis=1)
        Y = tmp[ind, I]  # np.max(A + S, axis=1)
        tmp[ind, I] = -np.inf
        Y2 = np.max(tmp, axis=1)
        # tmp = Rnew
        np.subtract(self.S, Y[:, None], tmp)
        tmp[ind, I] = self.S[ind, I] - Y2

        # Damping
        tmp *= 1 - self.damping
        R = old_R * self.damping
        R += tmp

        # A UPDATE STEP - from sklearn
        # tmp = Rp; compute availabilities
        np.maximum(R, 0, tmp)
        tmp.flat[:: N + 1] = R.flat[:: N + 1]

        # tmp = -Anew
        tmp -= np.sum(tmp, axis=0)
        dA = np.diag(tmp).copy()
        tmp.clip(0, np.inf, tmp)
        tmp.flat[:: N + 1] = dA

        # Damping
        tmp *= 1 - self.damping
        A = old_A * self.damping
        A -= tmp

        return R, A

    def fit(self, X) -> tuple:
        '''
        This runs the Incremental Affinity Propagation Clustering Based on Nearest Neighbor Assignment until convergence

        Args:
            X(object): array-like of shape (n_samples, n_samples). Training instances to cluster

        Returns:
            exemplar : ndarray. Cluster exemplars
            labels : ndarray of shape (n_samples,). Cluster labels
        '''

        # first data
        if self.time_tag == 0:
            self._X = X

            # COMPUTE THE SIMILARITY MATRIX
            self.S = pairwise_similarity(X, affinity=self.affinity)

            if self.preference is None:
                self.preference = np.median(self.S)

            N, N = self.S.shape

            # Place preference on the diagonal of S
            self.preference = np.array(self.preference)
            self.S.flat[:: (N + 1)] = self.preference

            # Remove degeneracies
            self.S = self.S.reshape(N, N)

            #  INIITALISE THE RESPONSIBILITY AND THE AVAILABILITY MATRICES
            N, N = self.S.shape
            self.R, self.A = np.zeros((N, N)), np.zeros((N, N))

        # new data are available. Perform clustering from pre-computed matrices
        else:

            self._old_X = self._X

            # remove duplicates: data from timestep i that are already considered in timestep i-1
            if X.ndim == 1:
                new_X = np.setdiff1d(X, self._old_X)
            else:
                # TODO: remove duplicates
                new_X = X

            if len(new_X) == 0:
                warnings.warn("Incremental Affinity propagation needs new training instances.")
                return None, None

            self._X = np.concatenate([self._old_X, new_X])

            # Similarity matrix S update
            S_tmp1 = pairwise_similarity(new_X, self._old_X, affinity=self.affinity)
            S_tmp2 = pairwise_similarity(new_X, affinity=self.affinity)
            self.S = np.concatenate([np.concatenate([self.S, S_tmp1.T], axis=1),  # up & right
                                     np.concatenate([S_tmp1, S_tmp2], axis=1)])  # down
            if self.preference is None:
                self.preference = np.median(self.S)

            N, N = self.S.shape

            # Place preference on the diagonal of S
            #####self.preference = np.array(self.preference)
            #####self.S.flat[:: (N + 1)] = self.preference

            # Remove degeneracies
            #####self.S = self.S.reshape(N, N)

            # Responsibility matrix R update
            old_R, old_A, old_N = self.R, self.A, self.R.shape[1]

            R_tmp1 = np.array([[self.__matrix_update(self.S, old_R, i, j) for j in range(old_N)]
                               for i in range(old_N, N)]).T
            R_tmp2 = np.array([[self.__matrix_update(self.S, old_R, i, j) for i in range(old_N)]
                               for j in range(old_N, N)])
            R_tmp3 = np.zeros((new_X.shape[0], new_X.shape[0]))
            self.R = np.concatenate([np.concatenate([old_R, R_tmp1], axis=1), np.concatenate([R_tmp2, R_tmp3], axis=1)])

            # Availability matrix A update
            A_tmp1 = np.array([[self.__matrix_update(self.S, old_A, i, j) for j in range(old_N)]
                               for i in range(old_N, N)]).T
            A_tmp2 = np.array([[self.__matrix_update(self.S, old_A, i, j) for i in range(old_N)]
                               for j in range(old_N, N)])
            A_tmp3 = np.zeros((new_X.shape[0], new_X.shape[0]))
            self.A = np.concatenate([np.concatenate([old_A, A_tmp1], axis=1), np.concatenate([A_tmp2, A_tmp3], axis=1)])

        # -- #
        # Check for convergence (from scikit-learn)
        e = np.zeros((N, self.convergence_iter))
        # -- #

        ind = np.arange(N)
        tmp = np.zeros((N, N))

        for i in range(self.max_iter):
            if self.verbose:
                print("processing iteration %d" % (i,))
            self.R, self.A = self.__step(ind, tmp)

            # -- #
            # Check for convergence (from scikit-learn)
            E = (np.diag(self.A) + np.diag(self.R)) > 0
            e[:, i % self.convergence_iter] = E
            K = np.sum(E, axis=0)

            if i >= self.convergence_iter:
                se = np.sum(e, axis=1)
                unconverged = np.sum((se == self.convergence_iter) + (se == 0)) != N
                if (not unconverged and (K > 0)) or (i == self.max_iter):
                    never_converged = False
                    if self.verbose:
                        print("Converged after %d iterations." % i)
                    break
            # -- #

        else:
            never_converged = True
            if self.verbose:
                print("Did not converge")

        I = np.flatnonzero(E)
        K = I.size  # Identify exemplars

        if K > 0 and not never_converged:
            c = np.argmax(self.S[:, I], axis=1)
            c[I] = np.arange(K)  # Identify clusters
            # Refine the final set of exemplars and clusters and return results
            for k in range(K):
                ii = np.where(c == k)[0]
                j = np.argmax(np.sum(self.S[ii[:, np.newaxis], ii], axis=0))
                I[k] = ii[j]

            c = np.argmax(self.S[:, I], axis=1)
            c[I] = np.arange(K)
            labels = I[c]
            # Reduce labels to a sorted, gapless, list
            cluster_centers_indices = np.unique(labels)
            labels = np.searchsorted(cluster_centers_indices, labels)
        else:
            warnings.warn(
                "Affinity propagation did not converge, this model "
                "will not have any cluster centers.")
            labels = np.array([-1] * N)
            cluster_centers_indices = []

        self.labels[self.time_tag] = labels
        self.centroids[self.time_tag] = cluster_centers_indices
        self.time_tag += 1

        self.labels_ = labels
        self.cluster_centers_ = cluster_centers_indices

        return labels, cluster_centers_indices

    def __matrix_update(self, S, M, i, j):
        '''Matrix update for Incremental Affinity Propagation
        Args:
            S: similarity matrix
            M: matrix to update (Availability or Responsibility)
            i: row index
            j: column index
        '''

        # update as in the original paper
        old_N = M.shape[1]

        if i >= old_N:
            i = S[:old_N, i].argmax()
            return M[i, j]

        if j >= old_N:
            j = S[j, :old_N].argmax()
            return M[i, j]


class AffinityPropagationPosteriori:
    def __init__(self, trim:float=2, damping:float=0.7, max_iter:int=200):
        self.time_tag = 0
        self.data = list()
        self.trim = trim
        self.damping = damping
        self.max_iter = max_iter

    def _trim(self, X:np.array, labels:np.array, ignore=None) -> (np.array, np.array):
        '''Trim small size clusters'''

        unique, counts = np.unique(labels, return_counts=True)

        if ignore is None:
            condition = (counts > X.shape[0]*self.trim/100)
        else:
            condition = (counts > X.shape[0]*self.trim/100) | (np.isin(unique, ignore))

        survivors = unique[condition]

        X_trim = X[np.isin(labels, survivors)]
        labels_trim = labels[np.isin(labels, survivors)]

        return X_trim, labels_trim

    def _pack(self, X:np.array, labels:np.array) -> (np.array, np.array):
        '''Pack clusters of vectors into single representations'''
        unique = np.unique(labels)
        return np.array([X[labels==label].mean(0) for label in unique]), unique

    def fit(self, X: np.array) -> None:

        self.data.append(X)

        # standard AP
        if self.time_tag == 0:
            ap = AffinityPropagation(damping=self.damping, affinity='precomputed', max_iter=self.max_iter)
            ap.fit(pairwise_similarity(X))

            self._X_trim, self._labels_trim = self._trim(X, ap.labels_)
            self._X_pack, self._labels_pack = self._pack(self._X_trim, self._labels_trim)
            self.X_, self.labels_ = self._X_trim, self._labels_trim

            # didnt converge or single cluster
            if self.labels_.shape[0] <= 1:
                self.X_, self.labels_ = X, np.array([1] * X.shape[0])
                self._X_pack = np.array([X.mean(0)])
                self._labels_pack = np.array([1])
                self._X_trim, self._labels_trim = self.X_, self.labels_

        else:
            X_prev, X_curr = self.X_, X
            X_prev_pack = self._X_pack
            X_next = np.concatenate([X_prev_pack, X_curr])

            # preference over packed vectors
            preference = None####np.array([10.0]*len(X_prev_pack) + [0.1]*len(X_curr))
            ap = AffinityPropagation(damping=self.damping, affinity='precomputed', max_iter=self.max_iter, preference=preference)
            ap.fit(pairwise_similarity(X_next))

            labels_prev, labels_next = self.labels_, ap.labels_
            labels_prev_pack = self._labels_pack
            new_labels_prev_pack = labels_next[:labels_prev_pack.shape[0]]
            labels_curr = labels_next[labels_prev_pack.shape[0]:]

            # map old pack labels to new label
            new_labels_prev = labels_prev.copy()
            for old_, new_ in zip(labels_prev_pack, new_labels_prev_pack):
                new_labels_prev[labels_prev == old_] = new_

            # trim t1|t0
            self._prev_X_trim, self._prev_labels_trim = self._X_trim, new_labels_prev
            self._prev_X_pack, self._prev_labels_pack = self._X_pack, new_labels_prev_pack
            X_next = np.concatenate([self._prev_X_trim, X_curr])
            labels_next = np.concatenate([self._prev_labels_trim, labels_curr])

            X_next_trim, labels_next_trim = self._trim(X_next, labels_next)#, ignore=new_labels_prev)

            survivors = np.unique(labels_next_trim)

            self._prev_X_trim, self._prev_labels_trim = self._prev_X_trim[np.isin(self._prev_labels_trim, survivors)], self._prev_labels_trim[np.isin(self._prev_labels_trim, survivors)]

            self._curr_X_trim, self._curr_labels_trim = X_curr[np.isin(labels_curr, survivors)], labels_curr[np.isin(labels_curr, survivors)]
            self._curr_X_pack, self._curr_labels_pack = self._pack(self._curr_X_trim, self._curr_labels_trim)

            self.X_, self.labels_ = X_next_trim, labels_next_trim
            self._X_pack, self._labels_pack = self._pack(X_next_trim, labels_next_trim)

        self.time_tag += 1