Skip to content

Commit

Permalink
Add equality to partitioned data set
Browse files Browse the repository at this point in the history
Add split train test test method
  • Loading branch information
Hernán Morales Durand committed Dec 14, 2023
1 parent 22bf563 commit de2790e
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 5 deletions.
48 changes: 44 additions & 4 deletions src/AI-DataPartitioners-Tests/AIRandomPartitionerTest.class.st
Original file line number Diff line number Diff line change
@@ -1,10 +1,24 @@
"
Unit tests for AIRandomPartitioner.

Instance variables:
	df           three-row DataFrame fixture (columns City, Population, BeenThere; rows A, B, C) rebuilt in #setUp.
	partitioner  NOTE(review): not assigned in the visible #setUp; presumably individual tests create their own partitioner. Confirm.
"
Class {
#name : 'AIRandomPartitionerTest',
#superclass : 'TestCase',
#instVars : [
'partitioner',
'df'
],
#category : 'AI-DataPartitioners-Tests',
#package : 'AI-DataPartitioners-Tests'
}

{ #category : 'running' }
AIRandomPartitionerTest >> setUp [
	"Build the shared fixture: a three-row DataFrame of cities with named rows (A, B, C) and named columns (City, Population, BeenThere)."

	| cityRows |
	super setUp.
	cityRows := #( #( 'Barcelona' 1.609 true ) #( 'Dubai' 2.789 true ) #( 'London' 8.788 false ) ).
	df := DataFrame withRows: cityRows.
	df
		rowNames: #( 'A' 'B' 'C' );
		columnNames: #( 'City' 'Population' 'BeenThere' )
]

{ #category : 'tests' }
AIRandomPartitionerTest >> testSplitDataWithProportionsCase1 [

Expand Down Expand Up @@ -68,10 +82,36 @@ AIRandomPartitionerTest >> testSplitDataWithSizesCase2 [
]

{ #category : 'tests' }
AIRandomPartitionerTest >> testSplitTrainTestFromFeaturesUsingTargetWithProportionsShuffle [
AIRandomPartitionerTest >> testSplitTrainTestFromUsingTargetColumnWithProportionsShuffle [
"Split the three-row fixture df with proportions 0.7 / 0.3 and seed 1, using BeenThere as the target column.
The expected result puts rows A and C in the train sets and row B in the test sets; City and Population are the feature (x) columns and BeenThere is the target (y) column.
NOTE(review): the selector still ends in Shuffle although the seed: variant of the API is the one exercised here. Confirm the intended name."

| expected |
| expectedPartition partitionedDataSet |

expected := self.
self assert: false
"Expected partition, spelled out set by set."
expectedPartition := AIPartitionedDataSet new
xTrain: (DataFrame
withRows: #( #( 'Barcelona' 1.609 ) #( 'London' 8.788 ))
rowNames: #('A' 'C')
columnNames: #( 'City' 'Population' ));
xTest: (DataFrame
withRows: #( #( 'Dubai' 2.789 ))
rowNames: #('B')
columnNames: #( 'City' 'Population' ));
yTrain: (DataFrame
withRows: #( #( true ) #( false ))
rowNames: #('A' 'C')
columnNames: #( 'BeenThere' ));
yTest: (DataFrame
withRows: #( #( true ))
rowNames: #('B')
columnNames: #( 'BeenThere' ));
yourself.

"Actual partition produced by the partitioner; the seed is fixed at 1."
partitionedDataSet := (AIRandomPartitioner new
splitTrainTestFrom: df
usingTargetColumn: #('BeenThere')
withProportions: #(0.7 0.3)
seed: 1).

"The comparison goes through AIPartitionedDataSet >> =, which checks all four sets."
self
assert: partitionedDataSet
equals: expectedPartition
]
28 changes: 28 additions & 0 deletions src/AI-DataPartitioners/AIPartitionedDataSet.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,34 @@ Class {
#package : 'AI-DataPartitioners'
}

{ #category : 'comparing' }
AIPartitionedDataSet >> = aPartitionedDataSet [
	"Answer true when the argument is the receiver itself, or when it is an instance of the same class whose xTest, yTest, xTrain and yTrain are each equal to the receiver's. Answer false otherwise.
	The comparisons short-circuit in that order, so the accessors are never sent to an object of a different class."

	^ self == aPartitionedDataSet or: [
		  self class = aPartitionedDataSet class and: [
			  self xTest = aPartitionedDataSet xTest and: [
				  self yTest = aPartitionedDataSet yTest and: [
					  self xTrain = aPartitionedDataSet xTrain and: [
						  self yTrain = aPartitionedDataSet yTrain ] ] ] ] ]
]

{ #category : 'comparing' }
AIPartitionedDataSet >> hash [
	"Defined because #= is defined: answer the XOR of the receiver's species hash with the hashes of xTest, xTrain, yTest and yTrain."

	^ { self xTest. self xTrain. self yTest. self yTrain }
		  inject: self species hash
		  into: [ :combined :eachSet | combined bitXor: eachSet hash ]
]

{ #category : 'accessing' }
AIPartitionedDataSet >> xTest [

Expand Down
25 changes: 24 additions & 1 deletion src/AI-DataPartitioners/AIRandomPartitioner.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,30 @@ AIRandomPartitioner >> split: aCollection withSizes: aCollectionOfSizes [
]

{ #category : 'api' }
AIRandomPartitioner >> splitTrainTestFromFeatures: aDataFrame usingTarget: targetCollection withProportions: aTwoElementCollectionOfProportions shuffle: aBoolean [
AIRandomPartitioner >> splitTrainTestFrom: aDataFrame usingTargetColumn: targetCollection withProportions: aTwoElementCollectionOfProportions seed: aNumber [
	"Answer an <AIPartitionedDataSet> built from aDataFrame.
	aDataFrame is split with #split:withProportions:seed:, passing aTwoElementCollectionOfProportions and aNumber as the seed. The first resulting part supplies the train sets and the second part supplies the test sets.
	Within each part, the columns named in targetCollection become the y set (yTrain / yTest) and the columns answered by #columnsAllBut: become the x set (xTrain / xTest)."

	| parts trainPart testPart |
	parts := self
		         split: aDataFrame
		         withProportions: aTwoElementCollectionOfProportions
		         seed: aNumber.
	trainPart := parts first.
	testPart := parts second.
	^ AIPartitionedDataSet new
		  xTrain: (trainPart columnsAllBut: targetCollection);
		  yTrain: (trainPart columns: targetCollection);
		  xTest: (testPart columnsAllBut: targetCollection);
		  yTest: (testPart columns: targetCollection);
		  yourself
]

{ #category : 'api' }
AIRandomPartitioner >> splitTrainTestFrom: aDataFrame usingTargetColumn: targetCollection withProportions: aTwoElementCollectionOfProportions shuffle: aBoolean [
"Answer a <AIPartitionedDataSet>. Split the receiver's data into two sets: train and test.
xTrain and yTrain sets are used for training and fitting the model.
Expand Down

0 comments on commit de2790e

Please sign in to comment.