Skip to content

Commit

Permalink
Added tests. Refactored code. Improved comments
Browse files Browse the repository at this point in the history
  • Loading branch information
jordanmontt committed Sep 4, 2023
1 parent 8523971 commit a81aee3
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 32 deletions.
94 changes: 86 additions & 8 deletions src/AI-KMeans-Tests/AIKMeansTest.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,39 @@ AIKMeansTest >> testAssignClusterToPoints [
kMeans maxIterations: 5.
kMeans centroids: #( #( 0.1 0.1 ) #( 0.4 0.4 ) #( 10 10 ) ).
kMeans assignClusterToPoints: #( #( 0.1 0.1 ) #( 0.3 0.3 ) #( 9 9 ) ).

self assertCollection: kMeans clusters hasSameElements: #( 1 2 3 )
]

{ #category : #tests }
AIKMeansTest >> testChooseRandomCentroid [
	"Each randomly chosen centroid must have as many coordinates as the points and,
	for every dimension, fall inside the min/max range spanned by the points."

	| samplePoints drawnCentroids |
	samplePoints := #( #( 0 50 ) #( 10 1000 ) ).

	"Draw ten centroids up front, then validate them one by one"
	drawnCentroids := (1 to: 10) collect: [ :each | kMeans chooseRandomCentroid: samplePoints ].

	drawnCentroids do: [ :centroid |
		| x y |
		self assert: centroid size equals: 2.

		"First dimension ranges over 0..10"
		x := centroid first.
		self assert: x <= 10.
		self assert: x >= 0.

		"Second dimension ranges over 50..1000"
		y := centroid second.
		self assert: y <= 1000.
		self assert: y >= 50 ]
]

{ #category : #tests }
AIKMeansTest >> testDistanceBetweenAnd [
"Check distanceBetween:and: against hand-computed Euclidean distances for two pairs of 2D points."

self
assert: (kMeans distanceBetween: #( 1 1 ) and: #( 2 2 ))
closeTo: 2 sqrt. "Euclidean distance: ((2 - 1) squared + (2 - 1) squared) sqrt = 2 sqrt, not 2"
self
assert: (kMeans distanceBetween: #( 0 0 ) and: #( 1 1 ))
closeTo: 2 sqrt "Euclidean distance: (1 squared + 1 squared) sqrt = 2 sqrt"
]

{ #category : #tests }
AIKMeansTest >> testEmptyDataset [

Expand Down Expand Up @@ -98,8 +127,26 @@ AIKMeansTest >> testInitializeCentroidsKMeansPlusPlus [
{ #category : #tests }
AIKMeansTest >> testInitializeRandomCentroids [

kMeans := AIKMeans numberOfClusters: 3.

| points |
points := #( #( 0 0 ) #( 0.5 0 ) #( 0.5 1 ) #( 1 1 ) ).
kMeans numberOfClusters: 3.
kMeans initializeRandomCentroids: points.
self assert: kMeans centroids size equals: 3.
self denyCollection: kMeans centroids includesAny: points
]

{ #category : #tests }
AIKMeansTest >> testNearestCentroidToPoint [
	"nearestCentroidToPoint: must answer the index of the centroid that lies closest to the given point."

	| expectations |
	kMeans numberOfClusters: 3.
	kMeans centroids: #( #( 0 0 ) #( 1 1 ) #( 2 2 ) ).

	"Each association maps a query point to the index of the centroid expected to be nearest"
	expectations := {
		(#( 1.5 1 ) -> 2).
		(#( 0.5 0.2 ) -> 1).
		(#( 1.5 1.7 ) -> 3) }.

	expectations do: [ :pointToIndex |
		self
			assert: (kMeans nearestCentroidToPoint: pointToIndex key)
			equals: pointToIndex value ]
]

{ #category : #tests }
Expand All @@ -118,14 +165,26 @@ AIKMeansTest >> testPredict [
{ #category : #tests }
AIKMeansTest >> testScore [

| data initCentroids |
| data initCentroids initClusters expectedScore |
data := #( #( 0 0 ) #( 5 2 ) #( 1 2 ) #( 1 1 ) ).
initCentroids := #( #( 0 0 ) #( 4 7 ) ).
initClusters := #(1 2 1 1).
"The score is the sum of the Euclidean distances from each point to the centroid of its cluster"
expectedScore := 0 + 26 sqrt + 5 sqrt + 2 sqrt.

kMeans numberOfClusters: 2.
kMeans centroids: initCentroids.
kMeans clusters: initClusters.
self assert: (kMeans score: data) closeTo: expectedScore.

"Second case"
data := #( #( 0 0 ) #( 0.5 0 ) #( 0.5 1 ) #( 1 1 ) ).
initCentroids := #( #( 0 0 ) #( 10 1 ) ).

initClusters := #(1 1 1 1).
kMeans numberOfClusters: 2.
kMeans centroids: initCentroids.
kMeans assignClusterToPoints: data.
self assert: (kMeans score: data) closeTo: 0.5 + 2 sqrt + 1.11803
kMeans clusters: initClusters .
self assert: (kMeans score: data) closeTo: 0.25 sqrt + 1.25 sqrt + 2 sqrt
]

{ #category : #tests }
Expand All @@ -149,5 +208,24 @@ AIKMeansTest >> testTransform [
numberOfClusters: 2;
centroids: #( #( 0 0 ) #( 1 1 ) ).

self assert: (kMeans transform: points) equals: { { 0. euclideanDistance } . { euclideanDistance . 0 } }
self
assert: (kMeans transform: points)
equals: { { 0. euclideanDistance } . { euclideanDistance . 0 } }
]

{ #category : #tests }
AIKMeansTest >> testUpdateCentroids [
	"After updateCentroids:, every centroid must be the average of the points assigned to its cluster."

	| dataset startingCentroids expectedCentroids |
	dataset := #( #( 0 0 ) #( 1 1 ) #( 2 2 ) #( 6 6 ) #( 5 5 ) ).
	startingCentroids := #( #( 0 0 ) #( 6 6 ) ).

	"The first three points belong to cluster 1, the last two to cluster 2"
	expectedCentroids := {
		{ #( 0 0 ) . #( 1 1 ) . #( 2 2 ) } average.
		{ #( 6 6 ) . #( 5 5 ) } average }.

	kMeans numberOfClusters: 2.
	kMeans clusters: #( 1 1 1 2 2 ).
	kMeans centroids: startingCentroids.

	kMeans updateCentroids: dataset.

	"The centroids must have moved away from their starting values..."
	self denyCollection: kMeans centroids equals: startingCentroids.
	"...and landed on the per-cluster averages"
	self assertCollection: kMeans centroids equals: expectedCentroids
]
58 changes: 34 additions & 24 deletions src/AI-KMeans/AIKMeans.class.st
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"
K-Means is a clustering, unsupervised machine learning algorithm.
Please refer to the Pharo wiki for more information [Pharo wiki](https://github.com/pharo-ai/wiki)
Please refer to the Pharo wiki for more information [Pharo wiki](https://github.com/pharo-ai/wiki).
We used some comments from [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans.transform) in our code.
"
Class {
#name : #AIKMeans,
Expand Down Expand Up @@ -61,19 +63,21 @@ AIKMeans >> centroids: aCollectionOfPoints [

{ #category : #training }
AIKMeans >> chooseRandomCentroid: aCollectionOfPoints [
"Algorithm:
- Take the min and max value for each dimension of the point. If the point is a normal X,Y
point, take the min and max X value, and the min and max Y value.
- Choose a random number between the min and max range for each point dimension. "

"Choose a random point as centroids"

| min max pointDimension centroid |
| min max pointDimension |
pointDimension := aCollectionOfPoints first size.
centroid := OrderedCollection new.

1 to: pointDimension do: [ :i |
max := aCollectionOfPoints max: [ :point | point at: i ].
min := aCollectionOfPoints min: [ :point | point at: i ].
centroid add: (rand nextBetween: min and: max) ].

^ centroid asArray

^ (1 to: pointDimension)
collect: [ :i |
max := aCollectionOfPoints max: [ :point | point at: i ].
min := aCollectionOfPoints min: [ :point | point at: i ].
rand nextBetween: min and: max ]
as: Array
]

{ #category : #accessing }
Expand All @@ -82,6 +86,13 @@ AIKMeans >> clusters [
^ clusters
]

{ #category : #accessing }
AIKMeans >> clusters: aCollection [
"Set the cluster assignments directly, replacing whatever is currently stored.
Used for testing: it lets a test fix the assignments without running the clustering step.
NOTE(review): the tests pass one cluster index per point, e.g. #( 1 2 1 1 ) — confirm that is the expected shape."

clusters := aCollection
]

{ #category : #training }
AIKMeans >> distanceBetween: firstPoint and: secondPoint [

Expand Down Expand Up @@ -141,7 +152,7 @@ AIKMeans >> initialize [
timesToRun := self class defaultNumberOfTimesItIsRun
]

{ #category : #initialization }
{ #category : #training }
AIKMeans >> initializeCentroidsKMeansPlusPlus: points [

" k-means++ is an algorithm for initializing the centroids. It was proposed in 2007 by Arthur and Vassilvitskii.
Expand All @@ -163,7 +174,7 @@ AIKMeans >> initializeCentroidsKMeansPlusPlus: points [
centroids add: pointWithMaxDistance ]
]

{ #category : #initialization }
{ #category : #training }
AIKMeans >> initializeRandomCentroids: aCollectionOfPoints [

centroids := (1 to: numberOfClusters) collect: [ :i |
Expand All @@ -184,7 +195,7 @@ AIKMeans >> kMeansAlgorithm: aCollectionOfPoints [
doWhileFalse: [ self hasConverged or: [ self hasReachedMaxIterations ] ].
]

{ #category : #'api - configuration' }
{ #category : #accessing }
AIKMeans >> maxIterations: anInteger [

maxIterations := anInteger
Expand All @@ -207,23 +218,20 @@ AIKMeans >> nearestCentroidToPoint: aPoint [
^ nearestCentroidIndex
]

{ #category : #'api - configuration' }
{ #category : #accessing }
AIKMeans >> numberOfClusters: anObject [

numberOfClusters := anObject
]

{ #category : #api }
AIKMeans >> predict: aCollectionOfPoints [
"Assign each point to the closest centroid (in other words, cluster the points)"

| predictions |
predictions := OrderedCollection new.
1 to: aCollectionOfPoints size do: [ :index |
predictions add: (self nearestCentroidToPoint: (aCollectionOfPoints at: index)) ].
^ predictions
^ aCollectionOfPoints collect: [ :point | self nearestCentroidToPoint: point ]
]

{ #category : #'api - evaluation' }
{ #category : #api }
AIKMeans >> score: aCollectionOfPoints [
"The score is the sum of the distances between each point and the centroid of its cluster."

Expand All @@ -235,16 +243,18 @@ AIKMeans >> score: aCollectionOfPoints [
^ distances sum
]

{ #category : #'api - configuration' }
{ #category : #accessing }
AIKMeans >> timesToRun: anInteger [

timesToRun := anInteger
]

{ #category : #api }
AIKMeans >> transform: aCollectionOfPoints [
"Transform X to a cluster-distance space.
Compute the distance matrix between each point to each of the centroids"

^ aCollectionOfPoints collect: [ :aPoint |
^ aCollectionOfPoints collect: [ :aPoint |
centroids collect: [ :aCentroid | self distanceBetween: aPoint and: aCentroid ] ]
]

Expand Down

0 comments on commit a81aee3

Please sign in to comment.