From 48314f1b5c6a54dbdcd6b3058253f7a7f8d47111 Mon Sep 17 00:00:00 2001 From: Nico de Vos Date: Wed, 15 Nov 2017 10:47:44 -0800 Subject: [PATCH] minor cleanup and documentation --- kmodes/kmodes.py | 12 +++++++++--- kmodes/util/dissim.py | 6 ++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/kmodes/kmodes.py b/kmodes/kmodes.py index 3c38199..f04eabe 100644 --- a/kmodes/kmodes.py +++ b/kmodes/kmodes.py @@ -12,7 +12,7 @@ from sklearn.utils.validation import check_array from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids -from .util.dissim import matching_dissim +from .util.dissim import matching_dissim, ng_dissim def init_huang(X, n_clusters, dissim): @@ -38,7 +38,7 @@ def init_huang(X, n_clusters, dissim): # The previously chosen centroids could result in empty clusters, # so set centroid to closest point in X. for ik in range(n_clusters): - ndx = np.argsort(dissim(X, centroids[ik], X=X)) + ndx = np.argsort(dissim(X, centroids[ik])) # We want the centroid to be unique. while np.all(X[ndx[0]] == centroids, axis=1).any(): ndx = np.delete(ndx, 0) @@ -72,7 +72,7 @@ def init_cao(X, n_clusters, dissim): for ik in range(1, n_clusters): dd = np.empty((ik, n_points)) for ikk in range(ik): - dd[ikk] = dissim(X, centroids[ikk], X=X) * dens + dd[ikk] = dissim(X, centroids[ikk]) * dens centroids[ik] = X[np.argmax(np.min(dd, axis=0))] return centroids @@ -389,6 +389,12 @@ def predict(self, X, **kwargs): Index of the cluster each sample belongs to. """ assert hasattr(self, '_enc_cluster_centroids'), "Model not yet fitted." + + if self.verbose and self.cat_dissim == ng_dissim: + print("Ng's dissimilarity measure was used to train this model, " + "but now that it is predicting the model will fall back to " + "using simple matching dissimilarity.") + X = check_array(X, dtype=None) X, _ = encode_features(X, enc_map=self._enc_map) return _labels_cost(X, self._enc_cluster_centroids, self.cat_dissim)[0] diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py index 2442d98..4f27471 100644 --- a/kmodes/util/dissim.py +++ b/kmodes/util/dissim.py @@ -17,18 +17,20 @@ def euclidean_dissim(a, b, **_): return np.sum((a - b) ** 2, axis=1) -def ng_dissim(a, b, X, membship=None): +def ng_dissim(a, b, X=None, membship=None): """Ng et al.'s dissimilarity measure, as presented in Michael K. Ng, Mark Junjie Li, Joshua Zhexue Huang, and Zengyou He, "On the Impact of Dissimilarity Measure in k-Modes Clustering Algorithm", IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 29, No. 3, January, 2007 + This function can potentially speed up training convergence. + Note that membship must be a rectangular array such that the len(membship) = len(a) and len(membship[i]) = X.shape[1] In case of missing membship, this function reverts back to - matching dissimilarity. + matching dissimilarity (e.g., when predicting). """ # Without membership, revert to matching dissimilarity if membship is None: