Skip to content

Commit

Permalink
Merge pull request #52 from nicodv/pr
Browse files Browse the repository at this point in the history
Pr
  • Loading branch information
nicodv authored Nov 15, 2017
2 parents 1a6a7be + 48314f1 commit 7eea6aa
Show file tree
Hide file tree
Showing 8 changed files with 246 additions and 63 deletions.
71 changes: 37 additions & 34 deletions kmodes/kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@
K-modes clustering for categorical data
"""

# Author: 'Nico de Vos' <[email protected]>
# License: MIT

# pylint: disable=unused-argument,attribute-defined-outside-init

from collections import defaultdict
Expand All @@ -15,15 +12,15 @@
from sklearn.utils.validation import check_array

from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids
from .util.dissim import matching_dissim
from .util.dissim import matching_dissim, ng_dissim


def init_huang(X, n_clusters, dissim):
"""Initialize centroids according to method by Huang [1997]."""
nattrs = X.shape[1]
centroids = np.empty((n_clusters, nattrs), dtype='object')
n_attrs = X.shape[1]
centroids = np.empty((n_clusters, n_attrs), dtype='object')
# determine frequencies of attributes
for iattr in range(nattrs):
for iattr in range(n_attrs):
freq = defaultdict(int)
for curattr in X[:, iattr]:
freq[curattr] += 1
Expand Down Expand Up @@ -55,25 +52,25 @@ def init_cao(X, n_clusters, dissim):
Note: O(N * attr * n_clusters**2), so watch out with large n_clusters
"""
npoints, nattrs = X.shape
centroids = np.empty((n_clusters, nattrs), dtype='object')
# Method is base don determining density of points.
dens = np.zeros(npoints)
for iattr in range(nattrs):
n_points, n_attrs = X.shape
centroids = np.empty((n_clusters, n_attrs), dtype='object')
# Method is based on determining density of points.
dens = np.zeros(n_points)
for iattr in range(n_attrs):
freq = defaultdict(int)
for val in X[:, iattr]:
freq[val] += 1
for ipoint in range(npoints):
dens[ipoint] += freq[X[ipoint, iattr]] / float(nattrs)
dens /= npoints
for ipoint in range(n_points):
dens[ipoint] += freq[X[ipoint, iattr]] / float(n_attrs)
dens /= n_points

# Choose initial centroids based on distance and density.
centroids[0] = X[np.argmax(dens)]
if n_clusters > 1:
# For the remaining centroids, choose maximum dens * dissim to the
# (already assigned) centroid with the lowest dens * dissim.
for ik in range(1, n_clusters):
dd = np.empty((ik, npoints))
dd = np.empty((ik, n_points))
for ikk in range(ik):
dd[ikk] = dissim(X, centroids[ikk]) * dens
centroids[ik] = X[np.argmax(np.min(dd, axis=0))]
Expand Down Expand Up @@ -113,18 +110,18 @@ def move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
return cl_attr_freq, membship, centroids


def _labels_cost(X, centroids, dissim):
def _labels_cost(X, centroids, dissim, membship=None):
"""Calculate labels and cost function given a matrix of points and
a list of centroids for the k-modes algorithm.
"""

X = check_array(X)

npoints = X.shape[0]
n_points = X.shape[0]
cost = 0.
labels = np.empty(npoints, dtype=np.uint8)
labels = np.empty(n_points, dtype=np.uint8)
for ipoint, curpoint in enumerate(X):
diss = dissim(centroids, curpoint)
diss = dissim(centroids, curpoint, X=X, membship=membship)
clust = np.argmin(diss)
labels[ipoint] = clust
cost += diss[clust]
Expand All @@ -136,7 +133,7 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim):
"""Single iteration of k-modes clustering algorithm"""
moves = 0
for ipoint, curpoint in enumerate(X):
clust = np.argmin(dissim(centroids, curpoint))
clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
if membship[clust, ipoint]:
# Point is already in its right place.
continue
Expand Down Expand Up @@ -179,9 +176,9 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
# Based on the unique values in X, we can make a mapping to achieve this.
X, enc_map = encode_features(X)

npoints, nattrs = X.shape
assert n_clusters <= npoints, "Cannot have more clusters ({}) " \
"than data points ({}).".format(n_clusters, npoints)
n_points, n_attrs = X.shape
assert n_clusters <= n_points, "Cannot have more clusters ({}) " \
"than data points ({}).".format(n_clusters, n_points)

# Are there more n_clusters than unique rows? Then set the unique
# rows as initial values and skip iteration.
Expand All @@ -207,7 +204,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
elif isinstance(init, str) and init.lower() == 'cao':
centroids = init_cao(X, n_clusters, dissim)
elif isinstance(init, str) and init.lower() == 'random':
seeds = np.random.choice(range(npoints), n_clusters)
seeds = np.random.choice(range(n_points), n_clusters)
centroids = X[seeds]
elif hasattr(init, '__array__'):
# Make sure init is a 2D array.
Expand All @@ -216,30 +213,30 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
assert init.shape[0] == n_clusters, \
"Wrong number of initial centroids in init ({}, should be {})."\
.format(init.shape[0], n_clusters)
assert init.shape[1] == nattrs, \
assert init.shape[1] == n_attrs, \
"Wrong number of attributes in init ({}, should be {})."\
.format(init.shape[1], nattrs)
.format(init.shape[1], n_attrs)
centroids = np.asarray(init, dtype=np.uint8)
else:
raise NotImplementedError

if verbose:
print("Init: initializing clusters")
membship = np.zeros((n_clusters, npoints), dtype=np.uint8)
membship = np.zeros((n_clusters, n_points), dtype=np.uint8)
# cl_attr_freq is a list of lists with dictionaries that contain the
# frequencies of values per cluster and attribute.
cl_attr_freq = [[defaultdict(int) for _ in range(nattrs)]
cl_attr_freq = [[defaultdict(int) for _ in range(n_attrs)]
for _ in range(n_clusters)]
for ipoint, curpoint in enumerate(X):
# Initial assignment to clusters
clust = np.argmin(dissim(centroids, curpoint))
clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
membship[clust, ipoint] = 1
# Count attribute values per cluster.
for iattr, curattr in enumerate(curpoint):
cl_attr_freq[clust][iattr][curattr] += 1
# Perform an initial centroid update.
for ik in range(n_clusters):
for iattr in range(nattrs):
for iattr in range(n_attrs):
if sum(membship[ik]) == 0:
# Empty centroid, choose randomly
centroids[ik, iattr] = np.random.choice(X[:, iattr])
Expand All @@ -256,7 +253,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
itr += 1
centroids, moves = _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim)
# All points seen in this iteration
labels, ncost = _labels_cost(X, centroids, dissim)
labels, ncost = _labels_cost(X, centroids, dissim, membship)
converged = (moves == 0) or (ncost >= cost)
cost = ncost
if verbose:
Expand Down Expand Up @@ -292,7 +289,7 @@ class KModes(BaseEstimator, ClusterMixin):
single run.
cat_dissim : func, default: matching_dissim
Dissimilarity function used by the algorithm for categorical variables.
Dissimilarity function used by the k-modes algorithm for categorical variables.
Defaults to the matching dissimilarity function.
init : {'Huang', 'Cao', 'random' or an ndarray}, default: 'Cao'
Expand Down Expand Up @@ -376,7 +373,7 @@ def fit_predict(self, X, y=None, **kwargs):
Convenience method; equivalent to calling fit(X) followed by
predict(X).
"""
return self.fit(X, **kwargs).labels_
return self.fit(X, **kwargs).predict(X, **kwargs)

def predict(self, X, **kwargs):
"""Predict the closest cluster each sample in X belongs to.
Expand All @@ -392,6 +389,12 @@ def predict(self, X, **kwargs):
Index of the cluster each sample belongs to.
"""
assert hasattr(self, '_enc_cluster_centroids'), "Model not yet fitted."

if self.verbose and self.cat_dissim == ng_dissim:
print("Ng's dissimilarity measure was used to train this model, "
"but now that it is predicting the model will fall back to "
"using simple matching dissimilarity.")

X = check_array(X, dtype=None)
X, _ = encode_features(X, enc_map=self._enc_map)
return _labels_cost(X, self._enc_cluster_centroids, self.cat_dissim)[0]
Expand Down
35 changes: 16 additions & 19 deletions kmodes/kprototypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@
K-prototypes clustering for mixed categorical and numerical data
"""

# Author: 'Nico de Vos' <[email protected]>
# License: MIT

# pylint: disable=super-on-old-class,unused-argument,attribute-defined-outside-init

from collections import defaultdict
Expand Down Expand Up @@ -50,20 +47,20 @@ def _split_num_cat(X, categorical):
return Xnum, Xcat


def _labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma):
def _labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=None):
"""Calculate labels and cost function given a matrix of points and
a list of centroids for the k-prototypes algorithm.
"""

npoints = Xnum.shape[0]
n_points = Xnum.shape[0]
Xnum = check_array(Xnum)

cost = 0.
labels = np.empty(npoints, dtype=np.uint8)
for ipoint in range(npoints):
labels = np.empty(n_points, dtype=np.uint8)
for ipoint in range(n_points):
# Numerical cost = sum of Euclidean distances
num_costs = num_dissim(centroids[0], Xnum[ipoint])
cat_costs = cat_dissim(centroids[1], Xcat[ipoint])
cat_costs = cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
# Gamma relates the categorical cost to the numerical cost.
tot_costs = num_costs + gamma * cat_costs
clust = np.argmin(tot_costs)
Expand All @@ -80,7 +77,7 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_
for ipoint in range(Xnum.shape[0]):
clust = np.argmin(
num_dissim(centroids[0], Xnum[ipoint]) +
gamma * cat_dissim(centroids[1], Xcat[ipoint])
gamma * cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
)
if membship[clust, ipoint]:
# Point is already in its right place.
Expand Down Expand Up @@ -153,9 +150,9 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,

ncatattrs = len(categorical)
nnumattrs = X.shape[1] - ncatattrs
npoints = X.shape[0]
assert n_clusters <= npoints, "Cannot have more clusters ({}) " \
"than data points ({}).".format(n_clusters, npoints)
n_points = X.shape[0]
assert n_clusters <= n_points, "Cannot have more clusters ({}) " \
"than data points ({}).".format(n_clusters, n_points)

Xnum, Xcat = _split_num_cat(X, categorical)
Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)
Expand Down Expand Up @@ -200,7 +197,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
elif isinstance(init, str) and init.lower() == 'cao':
centroids = kmodes.init_cao(Xcat, n_clusters, cat_dissim)
elif isinstance(init, str) and init.lower() == 'random':
seeds = np.random.choice(range(npoints), n_clusters)
seeds = np.random.choice(range(n_points), n_clusters)
centroids = Xcat[seeds]
elif isinstance(init, list):
# Make sure inits are 2D arrays.
Expand Down Expand Up @@ -236,7 +233,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,

if verbose:
print("Init: initializing clusters")
membship = np.zeros((n_clusters, npoints), dtype=np.uint8)
membship = np.zeros((n_clusters, n_points), dtype=np.uint8)
# Keep track of the sum of attribute values per cluster so that we
# can do k-means on the numerical attributes.
cl_attr_sum = np.zeros((n_clusters, nnumattrs), dtype=np.float64)
Expand All @@ -246,11 +243,11 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
# the frequencies of values per cluster and attribute.
cl_attr_freq = [[defaultdict(int) for _ in range(ncatattrs)]
for _ in range(n_clusters)]
for ipoint in range(npoints):
for ipoint in range(n_points):
# Initial assignment to clusters
clust = np.argmin(
num_dissim(centroids[0], Xnum[ipoint]) +
gamma * cat_dissim(centroids[1], Xcat[ipoint])
num_dissim(centroids[0], Xnum[ipoint]) + gamma *
cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
)
membship[clust, ipoint] = 1
cl_memb_sum[clust] += 1
Expand Down Expand Up @@ -295,7 +292,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,

# All points seen in this iteration
labels, ncost = _labels_cost(Xnum, Xcat, centroids,
num_dissim, cat_dissim, gamma)
num_dissim, cat_dissim, gamma, membship)
converged = (moves == 0) or (ncost >= cost)
cost = ncost
if verbose:
Expand Down Expand Up @@ -335,7 +332,7 @@ class KPrototypes(kmodes.KModes):
Defaults to the Euclidean dissimilarity function.
cat_dissim : func, default: matching_dissim
Dissimilarity function used by the algorithm for categorical variables.
Dissimilarity function used by the k-modes algorithm for categorical variables.
Defaults to the matching dissimilarity function.
n_init : int, default: 10
Expand Down
46 changes: 46 additions & 0 deletions kmodes/tests/test_kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from sklearn.utils.testing import assert_equal

from kmodes.kmodes import KModes
from kmodes.util.dissim import ng_dissim


SOYBEAN = np.array([
Expand Down Expand Up @@ -198,3 +199,48 @@ def test_kmodes_nunique_nclusters(self):
np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
np.array([[0, 1],
[0, 2]]))

def test_kmodes_huang_soybean_ng(self):
    """Fit k-modes ('Huang' init, Ng's dissimilarity) on SOYBEAN.

    Checks that the labels returned by fit_predict describe the expected
    four-way split and that they are stored as uint8.
    """
    # Seed NumPy's global RNG so the run is reproducible.
    np.random.seed(42)
    kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                          cat_dissim=ng_dissim)
    result = kmodes_huang.fit_predict(SOYBEAN)
    expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0,
                         0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
                         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    assert_cluster_splits_equal(result, expected)
    # assertEqual (rather than assertTrue on an ==) reports both dtypes
    # when the check fails.
    self.assertEqual(result.dtype, np.dtype(np.uint8))

def test_kmodes_cao_soybean_ng(self):
    """Fit k-modes ('Cao' init, Ng's dissimilarity) on SOYBEAN.

    Checks that the labels returned by fit_predict describe the expected
    four-way split and that they are stored as uint8.
    """
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2,
                        cat_dissim=ng_dissim)
    result = kmodes_cao.fit_predict(SOYBEAN)
    expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
                         1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    assert_cluster_splits_equal(result, expected)
    # assertEqual (rather than assertTrue on an ==) reports both dtypes
    # when the check fails.
    self.assertEqual(result.dtype, np.dtype(np.uint8))

def test_kmodes_predict_soybean_ng(self):
    """Fit on SOYBEAN with Ng's dissimilarity, then predict SOYBEAN2.

    Checks that the four predicted labels describe the expected split and
    that they are stored as uint8.
    """
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2,
                        cat_dissim=ng_dissim)
    kmodes_cao = kmodes_cao.fit(SOYBEAN)
    result = kmodes_cao.predict(SOYBEAN2)
    expected = np.array([2, 1, 3, 0])
    assert_cluster_splits_equal(result, expected)
    # assertEqual (rather than assertTrue on an ==) reports both dtypes
    # when the check fails.
    self.assertEqual(result.dtype, np.dtype(np.uint8))

def test_kmodes_nunique_nclusters_ng(self):
    """Request more clusters than there are unique rows (Ng's dissimilarity).

    The input holds two distinct rows, three copies of each, while six
    clusters are requested. The test expects a two-way split of the labels
    and exactly the two distinct rows as cluster centroids.
    """
    np.random.seed(42)
    # Three copies of [0, 1] followed by three copies of [0, 2].
    samples = np.array([[0, 1]] * 3 + [[0, 2]] * 3)
    model = KModes(n_clusters=6, init='Cao', verbose=2, cat_dissim=ng_dissim)
    labels = model.fit_predict(samples, categorical=[1])
    assert_cluster_splits_equal(labels, np.array([0, 0, 0, 1, 1, 1]))
    wanted_centroids = np.array([[0, 1], [0, 2]])
    np.testing.assert_array_equal(model.cluster_centroids_, wanted_centroids)
Loading

0 comments on commit 7eea6aa

Please sign in to comment.