diff --git a/src/AI-KMeans-Tests/AIKMeansTest.class.st b/src/AI-KMeans-Tests/AIKMeansTest.class.st index 76d403b..1f7e4ac 100644 --- a/src/AI-KMeans-Tests/AIKMeansTest.class.st +++ b/src/AI-KMeans-Tests/AIKMeansTest.class.st @@ -22,10 +22,39 @@ AIKMeansTest >> testAssignClusterToPoints [ kMeans maxIterations: 5. kMeans centroids: #( #( 0.1 0.1 ) #( 0.4 0.4 ) #( 10 10 ) ). kMeans assignClusterToPoints: #( #( 0.1 0.1 ) #( 0.3 0.3 ) #( 9 9 ) ). - + self assertCollection: kMeans clusters hasSameElements: #( 1 2 3 ) ] +{ #category : #tests } +AIKMeansTest >> testChooseRandomCentroid [ + + | points randomCentroids | + points := #( #( 0 50 ) #( 10 1000 ) ). + "Get 10 random centroids" + randomCentroids := (1 to: 10) collect: [ :i | kMeans chooseRandomCentroid: points ]. + + randomCentroids do: [ :randomCentroid | + self assert: randomCentroid size equals: 2. + + self assert: randomCentroid first <= 10. + self assert: randomCentroid first >= 0. + + self assert: randomCentroid second <= 1000. + self assert: randomCentroid second >= 50 ] +] + +{ #category : #tests } +AIKMeansTest >> testDistanceBetweenAnd [ + + self + assert: (kMeans distanceBetween: #( 1 1 ) and: #( 2 2 )) + closeTo: 2 sqrt. "The Euclidean distance between (1, 1) and (2, 2) is the square root of two" + self + assert: (kMeans distanceBetween: #( 0 0 ) and: #( 1 1 )) + closeTo: 2 sqrt "Euclidean distance" +] + { #category : #tests } AIKMeansTest >> testEmptyDataset [ @@ -98,8 +127,26 @@ AIKMeansTest >> testInitializeCentroidsKMeansPlusPlus [ { #category : #tests } AIKMeansTest >> testInitializeRandomCentroids [ - kMeans := AIKMeans numberOfClusters: 3. - + | points | + points := #( #( 0 0 ) #( 0.5 0 ) #( 0.5 1 ) #( 1 1 ) ). + kMeans numberOfClusters: 3. + kMeans initializeRandomCentroids: points. + self assert: kMeans centroids size equals: 3. 
+ self denyCollection: kMeans centroids includesAny: points +] + +{ #category : #tests } +AIKMeansTest >> testNearestCentroidToPoint [ + + | centroids | + centroids := #( #( 0 0 ) #( 1 1 ) #( 2 2 ) ). + kMeans + numberOfClusters: 3; + centroids: centroids. + + self assert: (kMeans nearestCentroidToPoint: #( 1.5 1 )) equals: 2. + self assert: (kMeans nearestCentroidToPoint: #( 0.5 0.2 )) equals: 1. + self assert: (kMeans nearestCentroidToPoint: #( 1.5 1.7 )) equals: 3 ] { #category : #tests } @@ -118,14 +165,26 @@ AIKMeansTest >> testPredict [ { #category : #tests } AIKMeansTest >> testScore [ - | data initCentroids | + | data initCentroids initClusters expectedScore | + data := #( #( 0 0 ) #( 5 2 ) #( 1 2 ) #( 1 1 ) ). + initCentroids := #( #( 0 0 ) #( 4 7 ) ). + initClusters := #(1 2 1 1). + "The expected score is the sum of the Euclidean distances from each point to the centroid of its assigned cluster" + expectedScore := 0 + 26 sqrt + 5 sqrt + 2 sqrt. + + kMeans numberOfClusters: 2. + kMeans centroids: initCentroids. + kMeans clusters: initClusters. + self assert: (kMeans score: data) closeTo: expectedScore. + + "Second case: every point is assigned to the first cluster" data := #( #( 0 0 ) #( 0.5 0 ) #( 0.5 1 ) #( 1 1 ) ). initCentroids := #( #( 0 0 ) #( 10 1 ) ). - + initClusters := #(1 1 1 1). kMeans numberOfClusters: 2. kMeans centroids: initCentroids. - kMeans assignClusterToPoints: data. - self assert: (kMeans score: data) closeTo: 0.5 + 2 sqrt + 1.11803 + kMeans clusters: initClusters . + self assert: (kMeans score: data) closeTo: 0.25 sqrt + 1.25 sqrt + 2 sqrt ] { #category : #tests } @@ -149,5 +208,24 @@ AIKMeansTest >> testTransform [ numberOfClusters: 2; centroids: #( #( 0 0 ) #( 1 1 ) ). - self assert: (kMeans transform: points) equals: { { 0. euclideanDistance } . { euclideanDistance . 0 } } + self + assert: (kMeans transform: points) + equals: { { 0. euclideanDistance } . { euclideanDistance . 
0 } } +] + +{ #category : #tests } +AIKMeansTest >> testUpdateCentroids [ + + | points initCentroids expectedFirstCentroid expectedSecondCentroid | + points := #( #( 0 0 ) #( 1 1 ) #( 2 2 ) #( 6 6 ) #( 5 5 ) ). + initCentroids := #( #( 0 0 ) #( 6 6 ) ). + expectedFirstCentroid := { #( 0 0 ) . #( 1 1 ) . #( 2 2 ) } average. + expectedSecondCentroid := { #( 6 6 ) . #( 5 5 ) } average. + kMeans + numberOfClusters: 2; + clusters: #( 1 1 1 2 2 ); + centroids: initCentroids. + kMeans updateCentroids: points. + self denyCollection: kMeans centroids equals: initCentroids. + self assertCollection: kMeans centroids equals: { expectedFirstCentroid . expectedSecondCentroid }. ] diff --git a/src/AI-KMeans/AIKMeans.class.st b/src/AI-KMeans/AIKMeans.class.st index 0371023..8358b52 100644 --- a/src/AI-KMeans/AIKMeans.class.st +++ b/src/AI-KMeans/AIKMeans.class.st @@ -1,7 +1,9 @@ " K-Means is a clustering, unsupervised machine learning algorithm. -Please refer to the Pharo wiki for more information [Pharo wiki](https://github.com/pharo-ai/wiki) +Please refer to the Pharo wiki for more information [Pharo wiki](https://github.com/pharo-ai/wiki). + +We used some comments from [scikitlearn](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans.transform) in our code. " Class { #name : #AIKMeans, @@ -61,19 +63,21 @@ AIKMeans >> centroids: aCollectionOfPoints [ { #category : #training } AIKMeans >> chooseRandomCentroid: aCollectionOfPoints [ + "Algorithm: + + - Take the min and max value of each dimension over all the given points. For example, for X,Y + points, take the min and max X value, and the min and max Y value. + - Choose a random number between the min and max of each dimension. " - "Choose a random point as centroids" - - | min max pointDimension centroid | + | min max pointDimension | pointDimension := aCollectionOfPoints first size. - centroid := OrderedCollection new. 
- - 1 to: pointDimension do: [ :i | - max := aCollectionOfPoints max: [ :point | point at: i ]. - min := aCollectionOfPoints min: [ :point | point at: i ]. - centroid add: (rand nextBetween: min and: max) ]. - - ^ centroid asArray + + ^ (1 to: pointDimension) + collect: [ :i | + max := aCollectionOfPoints max: [ :point | point at: i ]. + min := aCollectionOfPoints min: [ :point | point at: i ]. + rand nextBetween: min and: max ] + as: Array ] { #category : #accessing } @@ -82,6 +86,13 @@ AIKMeans >> clusters [ ^ clusters ] +{ #category : #accessing } +AIKMeans >> clusters: aCollection [ + "Used for testing" + + clusters := aCollection +] + { #category : #training } AIKMeans >> distanceBetween: firstPoint and: secondPoint [ @@ -141,7 +152,7 @@ AIKMeans >> initialize [ timesToRun := self class defaultNumberOfTimesItIsRun ] -{ #category : #initialization } +{ #category : #training } AIKMeans >> initializeCentroidsKMeansPlusPlus: points [ " The k-means++ is an algorithm for initializing the centroids. It was proposed in 2007 by Arthur et Vassilvitskii. @@ -163,7 +174,7 @@ AIKMeans >> initializeCentroidsKMeansPlusPlus: points [ centroids add: pointWithMaxDistance ] ] -{ #category : #initialization } +{ #category : #training } AIKMeans >> initializeRandomCentroids: aCollectionOfPoints [ centroids := (1 to: numberOfClusters) collect: [ :i | @@ -184,7 +195,7 @@ AIKMeans >> kMeansAlgorithm: aCollectionOfPoints [ doWhileFalse: [ self hasConverged or: [ self hasReachedMaxIterations ] ]. 
 ] -{ #category : #'api - configuration' } +{ #category : #accessing } AIKMeans >> maxIterations: anInteger [ maxIterations := anInteger @@ -207,7 +218,7 @@ AIKMeans >> nearestCentroidToPoint: aPoint [ ^ nearestCentroidIndex ] -{ #category : #'api - configuration' } +{ #category : #accessing } AIKMeans >> numberOfClusters: anObject [ numberOfClusters := anObject @@ -215,15 +226,12 @@ AIKMeans >> numberOfClusters: anObject [ { #category : #api } AIKMeans >> predict: aCollectionOfPoints [ + "Assign each point to the closest centroid (in other words, cluster the points)" - | predictions | - predictions := OrderedCollection new. - 1 to: aCollectionOfPoints size do: [ :index | - predictions add: (self nearestCentroidToPoint: (aCollectionOfPoints at: index)) ]. - ^ predictions + ^ aCollectionOfPoints collect: [ :point | self nearestCentroidToPoint: point ] ] -{ #category : #'api - evaluation' } +{ #category : #api } AIKMeans >> score: aCollectionOfPoints [ "The score is the sum of the mean square errors of the points and its cluster." @@ -235,7 +243,7 @@ AIKMeans >> score: aCollectionOfPoints [ ^ distances sum ] -{ #category : #'api - configuration' } +{ #category : #accessing } AIKMeans >> timesToRun: anInteger [ timesToRun := anInteger @@ -243,8 +251,10 @@ AIKMeans >> timesToRun: anInteger [ { #category : #api } AIKMeans >> transform: aCollectionOfPoints [ + "Transform aCollectionOfPoints to a cluster-distance space. + Compute the matrix of distances from each point to each of the centroids" - ^ aCollectionOfPoints collect: [ :aPoint | + ^ aCollectionOfPoints collect: [ :aPoint | centroids collect: [ :aCentroid | self distanceBetween: aPoint and: aCentroid ] ] ]