Skip to content

Commit

Permalink
minor cleanup and documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
nicodv committed Nov 15, 2017
1 parent 69939d3 commit 48314f1
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
12 changes: 9 additions & 3 deletions kmodes/kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from sklearn.utils.validation import check_array

from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids
from .util.dissim import matching_dissim
from .util.dissim import matching_dissim, ng_dissim


def init_huang(X, n_clusters, dissim):
Expand All @@ -38,7 +38,7 @@ def init_huang(X, n_clusters, dissim):
# The previously chosen centroids could result in empty clusters,
# so set centroid to closest point in X.
for ik in range(n_clusters):
ndx = np.argsort(dissim(X, centroids[ik], X=X))
ndx = np.argsort(dissim(X, centroids[ik]))
# We want the centroid to be unique.
while np.all(X[ndx[0]] == centroids, axis=1).any():
ndx = np.delete(ndx, 0)
Expand Down Expand Up @@ -72,7 +72,7 @@ def init_cao(X, n_clusters, dissim):
for ik in range(1, n_clusters):
dd = np.empty((ik, n_points))
for ikk in range(ik):
dd[ikk] = dissim(X, centroids[ikk], X=X) * dens
dd[ikk] = dissim(X, centroids[ikk]) * dens
centroids[ik] = X[np.argmax(np.min(dd, axis=0))]

return centroids
Expand Down Expand Up @@ -389,6 +389,12 @@ def predict(self, X, **kwargs):
Index of the cluster each sample belongs to.
"""
assert hasattr(self, '_enc_cluster_centroids'), "Model not yet fitted."

if self.verbose and self.cat_dissim == ng_dissim:
print("Ng's dissimilarity measure was used to train this model, "
"but now that it is predicting the model will fall back to "
"using simple matching dissimilarity.")

X = check_array(X, dtype=None)
X, _ = encode_features(X, enc_map=self._enc_map)
return _labels_cost(X, self._enc_cluster_centroids, self.cat_dissim)[0]
Expand Down
6 changes: 4 additions & 2 deletions kmodes/util/dissim.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,20 @@ def euclidean_dissim(a, b, **_):
return np.sum((a - b) ** 2, axis=1)


def ng_dissim(a, b, X, membship=None):
def ng_dissim(a, b, X=None, membship=None):
"""Ng et al.'s dissimilarity measure, as presented in
Michael K. Ng, Mark Junjie Li, Joshua Zhexue Huang, and Zengyou He, "On the
Impact of Dissimilarity Measure in k-Modes Clustering Algorithm", IEEE
Transactions on Pattern Analysis and Machine Intelligence, Vol. 29, No. 3,
March, 2007.
This function can potentially speed up training convergence.
Note that membship must be a rectangular array such that
len(membship) = len(a) and len(membship[i]) = X.shape[1].
In case of missing membship, this function reverts back to
matching dissimilarity.
matching dissimilarity (e.g., when predicting).
"""
# Without membership, revert to matching dissimilarity
if membship is None:
Expand Down

0 comments on commit 48314f1

Please sign in to comment.