From 48314f1b5c6a54dbdcd6b3058253f7a7f8d47111 Mon Sep 17 00:00:00 2001
From: Nico de Vos <njdevos@gmail.com>
Date: Wed, 15 Nov 2017 10:47:44 -0800
Subject: [PATCH] minor cleanup and documentation

---
 kmodes/kmodes.py      | 12 +++++++++---
 kmodes/util/dissim.py |  6 ++++--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/kmodes/kmodes.py b/kmodes/kmodes.py
index 3c38199..f04eabe 100644
--- a/kmodes/kmodes.py
+++ b/kmodes/kmodes.py
@@ -12,7 +12,7 @@
 from sklearn.utils.validation import check_array
 
 from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids
-from .util.dissim import matching_dissim
+from .util.dissim import matching_dissim, ng_dissim
 
 
 def init_huang(X, n_clusters, dissim):
@@ -38,7 +38,7 @@ def init_huang(X, n_clusters, dissim):
     # The previously chosen centroids could result in empty clusters,
     # so set centroid to closest point in X.
     for ik in range(n_clusters):
-        ndx = np.argsort(dissim(X, centroids[ik], X=X))
+        ndx = np.argsort(dissim(X, centroids[ik]))
         # We want the centroid to be unique.
         while np.all(X[ndx[0]] == centroids, axis=1).any():
             ndx = np.delete(ndx, 0)
@@ -72,7 +72,7 @@ def init_cao(X, n_clusters, dissim):
         for ik in range(1, n_clusters):
             dd = np.empty((ik, n_points))
             for ikk in range(ik):
-                dd[ikk] = dissim(X, centroids[ikk], X=X) * dens
+                dd[ikk] = dissim(X, centroids[ikk]) * dens
             centroids[ik] = X[np.argmax(np.min(dd, axis=0))]
 
     return centroids
@@ -389,6 +389,12 @@ def predict(self, X, **kwargs):
             Index of the cluster each sample belongs to.
         """
         assert hasattr(self, '_enc_cluster_centroids'), "Model not yet fitted."
+
+        if self.verbose and self.cat_dissim == ng_dissim:
+            print("Ng's dissimilarity measure was used to train this model, "
+                  "but now that it is predicting the model will fall back to "
+                  "using simple matching dissimilarity.")
+
         X = check_array(X, dtype=None)
         X, _ = encode_features(X, enc_map=self._enc_map)
         return _labels_cost(X, self._enc_cluster_centroids, self.cat_dissim)[0]
diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py
index 2442d98..4f27471 100644
--- a/kmodes/util/dissim.py
+++ b/kmodes/util/dissim.py
@@ -17,18 +17,20 @@ def euclidean_dissim(a, b, **_):
     return np.sum((a - b) ** 2, axis=1)
 
 
-def ng_dissim(a, b, X, membship=None):
+def ng_dissim(a, b, X=None, membship=None):
     """Ng et al.'s dissimilarity measure, as presented in
     Michael K. Ng, Mark Junjie Li, Joshua Zhexue Huang, and Zengyou He, "On the
     Impact of Dissimilarity Measure in k-Modes Clustering Algorithm", IEEE
     Transactions on Pattern Analysis and Machine Intelligence, Vol. 29, No. 3,
     January, 2007
 
+    This function can potentially speed up training convergence.
+
     Note that membship must be a rectangular array such that the
     len(membship) = len(a) and len(membship[i]) = X.shape[1]
 
     In case of missing membship, this function reverts back to
-    matching dissimilarity.
+    matching dissimilarity (e.g., when predicting).
     """
     # Without membership, revert to matching dissimilarity
     if membship is None: