Merge pull request #52 from nicodv/pr

Pr
nicodv · Nov 15, 2017 · 7eea6aa · 7eea6aa
2 parents 1a6a7be + 48314f1
commit 7eea6aa
Show file tree

Hide file tree

Showing 8 changed files with 246 additions and 63 deletions.
diff --git a/kmodes/kmodes.py b/kmodes/kmodes.py
@@ -2,9 +2,6 @@
 K-modes clustering for categorical data
 """
 
-# Author: 'Nico de Vos' <[email protected]>
-# License: MIT
-
 # pylint: disable=unused-argument,attribute-defined-outside-init
 
 from collections import defaultdict
@@ -15,15 +12,15 @@
 from sklearn.utils.validation import check_array
 
 from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids
-from .util.dissim import matching_dissim
+from .util.dissim import matching_dissim, ng_dissim
 
 
 def init_huang(X, n_clusters, dissim):
     """Initialize centroids according to method by Huang [1997]."""
-    nattrs = X.shape[1]
-    centroids = np.empty((n_clusters, nattrs), dtype='object')
+    n_attrs = X.shape[1]
+    centroids = np.empty((n_clusters, n_attrs), dtype='object')
     # determine frequencies of attributes
-    for iattr in range(nattrs):
+    for iattr in range(n_attrs):
         freq = defaultdict(int)
         for curattr in X[:, iattr]:
             freq[curattr] += 1
@@ -55,25 +52,25 @@ def init_cao(X, n_clusters, dissim):
 
     Note: O(N * attr * n_clusters**2), so watch out with large n_clusters
     """
-    npoints, nattrs = X.shape
-    centroids = np.empty((n_clusters, nattrs), dtype='object')
-    # Method is base don determining density of points.
-    dens = np.zeros(npoints)
-    for iattr in range(nattrs):
+    n_points, n_attrs = X.shape
+    centroids = np.empty((n_clusters, n_attrs), dtype='object')
+    # Method is based on determining density of points.
+    dens = np.zeros(n_points)
+    for iattr in range(n_attrs):
         freq = defaultdict(int)
         for val in X[:, iattr]:
             freq[val] += 1
-        for ipoint in range(npoints):
-            dens[ipoint] += freq[X[ipoint, iattr]] / float(nattrs)
-    dens /= npoints
+        for ipoint in range(n_points):
+            dens[ipoint] += freq[X[ipoint, iattr]] / float(n_attrs)
+    dens /= n_points
 
     # Choose initial centroids based on distance and density.
     centroids[0] = X[np.argmax(dens)]
     if n_clusters > 1:
         # For the remaining centroids, choose maximum dens * dissim to the
         # (already assigned) centroid with the lowest dens * dissim.
         for ik in range(1, n_clusters):
-            dd = np.empty((ik, npoints))
+            dd = np.empty((ik, n_points))
             for ikk in range(ik):
                 dd[ikk] = dissim(X, centroids[ikk]) * dens
             centroids[ik] = X[np.argmax(np.min(dd, axis=0))]
@@ -113,18 +110,18 @@ def move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
     return cl_attr_freq, membship, centroids
 
 
-def _labels_cost(X, centroids, dissim):
+def _labels_cost(X, centroids, dissim, membship=None):
     """Calculate labels and cost function given a matrix of points and
     a list of centroids for the k-modes algorithm.
     """
 
     X = check_array(X)
 
-    npoints = X.shape[0]
+    n_points = X.shape[0]
     cost = 0.
-    labels = np.empty(npoints, dtype=np.uint8)
+    labels = np.empty(n_points, dtype=np.uint8)
     for ipoint, curpoint in enumerate(X):
-        diss = dissim(centroids, curpoint)
+        diss = dissim(centroids, curpoint, X=X, membship=membship)
         clust = np.argmin(diss)
         labels[ipoint] = clust
         cost += diss[clust]
@@ -136,7 +133,7 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim):
     """Single iteration of k-modes clustering algorithm"""
     moves = 0
     for ipoint, curpoint in enumerate(X):
-        clust = np.argmin(dissim(centroids, curpoint))
+        clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
         if membship[clust, ipoint]:
             # Point is already in its right place.
             continue
@@ -179,9 +176,9 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
     # Based on the unique values in X, we can make a mapping to achieve this.
     X, enc_map = encode_features(X)
 
-    npoints, nattrs = X.shape
-    assert n_clusters <= npoints, "Cannot have more clusters ({}) " \
-                                  "than data points ({}).".format(n_clusters, npoints)
+    n_points, n_attrs = X.shape
+    assert n_clusters <= n_points, "Cannot have more clusters ({}) " \
+                                   "than data points ({}).".format(n_clusters, n_points)
 
     # Are there more n_clusters than unique rows? Then set the unique
     # rows as initial values and skip iteration.
@@ -207,7 +204,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
         elif isinstance(init, str) and init.lower() == 'cao':
             centroids = init_cao(X, n_clusters, dissim)
         elif isinstance(init, str) and init.lower() == 'random':
-            seeds = np.random.choice(range(npoints), n_clusters)
+            seeds = np.random.choice(range(n_points), n_clusters)
             centroids = X[seeds]
         elif hasattr(init, '__array__'):
             # Make sure init is a 2D array.
@@ -216,30 +213,30 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
             assert init.shape[0] == n_clusters, \
                 "Wrong number of initial centroids in init ({}, should be {})."\
                 .format(init.shape[0], n_clusters)
-            assert init.shape[1] == nattrs, \
+            assert init.shape[1] == n_attrs, \
                 "Wrong number of attributes in init ({}, should be {})."\
-                .format(init.shape[1], nattrs)
+                .format(init.shape[1], n_attrs)
             centroids = np.asarray(init, dtype=np.uint8)
         else:
             raise NotImplementedError
 
         if verbose:
             print("Init: initializing clusters")
-        membship = np.zeros((n_clusters, npoints), dtype=np.uint8)
+        membship = np.zeros((n_clusters, n_points), dtype=np.uint8)
         # cl_attr_freq is a list of lists with dictionaries that contain the
         # frequencies of values per cluster and attribute.
-        cl_attr_freq = [[defaultdict(int) for _ in range(nattrs)]
+        cl_attr_freq = [[defaultdict(int) for _ in range(n_attrs)]
                         for _ in range(n_clusters)]
         for ipoint, curpoint in enumerate(X):
             # Initial assignment to clusters
-            clust = np.argmin(dissim(centroids, curpoint))
+            clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
             membship[clust, ipoint] = 1
             # Count attribute values per cluster.
             for iattr, curattr in enumerate(curpoint):
                 cl_attr_freq[clust][iattr][curattr] += 1
         # Perform an initial centroid update.
         for ik in range(n_clusters):
-            for iattr in range(nattrs):
+            for iattr in range(n_attrs):
                 if sum(membship[ik]) == 0:
                     # Empty centroid, choose randomly
                     centroids[ik, iattr] = np.random.choice(X[:, iattr])
@@ -256,7 +253,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
             itr += 1
             centroids, moves = _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim)
             # All points seen in this iteration
-            labels, ncost = _labels_cost(X, centroids, dissim)
+            labels, ncost = _labels_cost(X, centroids, dissim, membship)
             converged = (moves == 0) or (ncost >= cost)
             cost = ncost
             if verbose:
@@ -292,7 +289,7 @@ class KModes(BaseEstimator, ClusterMixin):
         single run.
 
     cat_dissim : func, default: matching_dissim
-        Dissimilarity function used by the algorithm for categorical variables.
+        Dissimilarity function used by the k-modes algorithm for categorical variables.
         Defaults to the matching dissimilarity function.
 
     init : {'Huang', 'Cao', 'random' or an ndarray}, default: 'Cao'
@@ -376,7 +373,7 @@ def fit_predict(self, X, y=None, **kwargs):
         Convenience method; equivalent to calling fit(X) followed by
         predict(X).
         """
-        return self.fit(X, **kwargs).labels_
+        return self.fit(X, **kwargs).predict(X, **kwargs)
 
     def predict(self, X, **kwargs):
         """Predict the closest cluster each sample in X belongs to.
@@ -392,6 +389,12 @@ def predict(self, X, **kwargs):
             Index of the cluster each sample belongs to.
         """
         assert hasattr(self, '_enc_cluster_centroids'), "Model not yet fitted."
+
+        if self.verbose and self.cat_dissim == ng_dissim:
+            print("Ng's dissimilarity measure was used to train this model, "
+                  "but now that it is predicting the model will fall back to "
+                  "using simple matching dissimilarity.")
+
         X = check_array(X, dtype=None)
         X, _ = encode_features(X, enc_map=self._enc_map)
         return _labels_cost(X, self._enc_cluster_centroids, self.cat_dissim)[0]

diff --git a/kmodes/kprototypes.py b/kmodes/kprototypes.py
@@ -2,9 +2,6 @@
 K-prototypes clustering for mixed categorical and numerical data
 """
 
-# Author: 'Nico de Vos' <[email protected]>
-# License: MIT
-
 # pylint: disable=super-on-old-class,unused-argument,attribute-defined-outside-init
 
 from collections import defaultdict
@@ -50,20 +47,20 @@ def _split_num_cat(X, categorical):
     return Xnum, Xcat
 
 
-def _labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma):
+def _labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=None):
     """Calculate labels and cost function given a matrix of points and
     a list of centroids for the k-prototypes algorithm.
     """
 
-    npoints = Xnum.shape[0]
+    n_points = Xnum.shape[0]
     Xnum = check_array(Xnum)
 
     cost = 0.
-    labels = np.empty(npoints, dtype=np.uint8)
-    for ipoint in range(npoints):
+    labels = np.empty(n_points, dtype=np.uint8)
+    for ipoint in range(n_points):
         # Numerical cost = sum of Euclidean distances
         num_costs = num_dissim(centroids[0], Xnum[ipoint])
-        cat_costs = cat_dissim(centroids[1], Xcat[ipoint])
+        cat_costs = cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
         # Gamma relates the categorical cost to the numerical cost.
         tot_costs = num_costs + gamma * cat_costs
         clust = np.argmin(tot_costs)
@@ -80,7 +77,7 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_
     for ipoint in range(Xnum.shape[0]):
         clust = np.argmin(
             num_dissim(centroids[0], Xnum[ipoint]) +
-            gamma * cat_dissim(centroids[1], Xcat[ipoint])
+            gamma * cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
         )
         if membship[clust, ipoint]:
             # Point is already in its right place.
@@ -153,9 +150,9 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
 
     ncatattrs = len(categorical)
     nnumattrs = X.shape[1] - ncatattrs
-    npoints = X.shape[0]
-    assert n_clusters <= npoints, "Cannot have more clusters ({}) " \
-                                  "than data points ({}).".format(n_clusters, npoints)
+    n_points = X.shape[0]
+    assert n_clusters <= n_points, "Cannot have more clusters ({}) " \
+                                   "than data points ({}).".format(n_clusters, n_points)
 
     Xnum, Xcat = _split_num_cat(X, categorical)
     Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)
@@ -200,7 +197,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
             elif isinstance(init, str) and init.lower() == 'cao':
                 centroids = kmodes.init_cao(Xcat, n_clusters, cat_dissim)
             elif isinstance(init, str) and init.lower() == 'random':
-                seeds = np.random.choice(range(npoints), n_clusters)
+                seeds = np.random.choice(range(n_points), n_clusters)
                 centroids = Xcat[seeds]
             elif isinstance(init, list):
                 # Make sure inits are 2D arrays.
@@ -236,7 +233,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
 
             if verbose:
                 print("Init: initializing clusters")
-            membship = np.zeros((n_clusters, npoints), dtype=np.uint8)
+            membship = np.zeros((n_clusters, n_points), dtype=np.uint8)
             # Keep track of the sum of attribute values per cluster so that we
             # can do k-means on the numerical attributes.
             cl_attr_sum = np.zeros((n_clusters, nnumattrs), dtype=np.float64)
@@ -246,11 +243,11 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
             # the frequencies of values per cluster and attribute.
             cl_attr_freq = [[defaultdict(int) for _ in range(ncatattrs)]
                             for _ in range(n_clusters)]
-            for ipoint in range(npoints):
+            for ipoint in range(n_points):
                 # Initial assignment to clusters
                 clust = np.argmin(
-                    num_dissim(centroids[0], Xnum[ipoint]) +
-                    gamma * cat_dissim(centroids[1], Xcat[ipoint])
+                    num_dissim(centroids[0], Xnum[ipoint]) + gamma *
+                    cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
                 )
                 membship[clust, ipoint] = 1
                 cl_memb_sum[clust] += 1
@@ -295,7 +292,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
 
             # All points seen in this iteration
             labels, ncost = _labels_cost(Xnum, Xcat, centroids,
-                                         num_dissim, cat_dissim, gamma)
+                                         num_dissim, cat_dissim, gamma, membship)
             converged = (moves == 0) or (ncost >= cost)
             cost = ncost
             if verbose:
@@ -335,7 +332,7 @@ class KPrototypes(kmodes.KModes):
         Defaults to the Euclidian dissimilarity function.
 
     cat_dissim : func, default: matching_dissim
-        Dissimilarity function used by the algorithm for categorical variables.
+        Dissimilarity function used by the kmodes algorithm for categorical variables.
         Defaults to the matching dissimilarity function.
 
     n_init : int, default: 10

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
@@ -9,6 +9,7 @@
 from sklearn.utils.testing import assert_equal
 
 from kmodes.kmodes import KModes
+from kmodes.util.dissim import ng_dissim
 
 
 SOYBEAN = np.array([
@@ -198,3 +199,48 @@ def test_kmodes_nunique_nclusters(self):
         np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
                                       np.array([[0, 1],
                                                 [0, 2]]))
+
+    def test_kmodes_huang_soybean_ng(self):
+        np.random.seed(42)
+        kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=ng_dissim)
+        result = kmodes_huang.fit_predict(SOYBEAN)
+        expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
+                             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
+        assert_cluster_splits_equal(result, expected)
+        self.assertTrue(result.dtype == np.dtype(np.uint8))
+
+    def test_kmodes_cao_soybean_ng(self):
+        kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
+        result = kmodes_cao.fit_predict(SOYBEAN)
+        expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
+                             1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        assert_cluster_splits_equal(result, expected)
+        self.assertTrue(result.dtype == np.dtype(np.uint8))
+
+    def test_kmodes_predict_soybean_ng(self):
+        kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
+        kmodes_cao = kmodes_cao.fit(SOYBEAN)
+        result = kmodes_cao.predict(SOYBEAN2)
+        expected = np.array([2, 1, 3, 0])
+        assert_cluster_splits_equal(result, expected)
+        self.assertTrue(result.dtype == np.dtype(np.uint8))
+
+    def test_kmodes_nunique_nclusters_ng(self):
+        data = np.array([
+            [0, 1],
+            [0, 1],
+            [0, 1],
+            [0, 2],
+            [0, 2],
+            [0, 2]
+        ])
+        np.random.seed(42)
+        kmodes_cao = KModes(n_clusters=6, init='Cao', verbose=2, cat_dissim=ng_dissim)
+        result = kmodes_cao.fit_predict(data, categorical=[1])
+        expected = np.array([0, 0, 0, 1, 1, 1])
+        assert_cluster_splits_equal(result, expected)
+        np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
+                                      np.array([[0, 1],
+                                                [0, 2]]))