Skip to content

Commit

Permalink
[openmp] init commit
Browse files Browse the repository at this point in the history
[travis] run unit tests

[notes] update for openmp
  • Loading branch information
Anastasios Zouzias committed Jul 3, 2019
1 parent cba91f6 commit 6fd7713
Show file tree
Hide file tree
Showing 17 changed files with 224 additions and 139 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
examples/*.log
examples/*.so
cmake-build-debug/
build/
cmake-build-debug/
pybind11/
Expand Down
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ script:
- cmake ..
- cmake --build .
- ctest
- ctest -T memcheck
- valgrind --leak-check=full --track-fds=yes --error-exitcode=1 ./bin/unit_tests
- ./bin/unit_tests

after_success:
- coveralls --root .. -E ".*pybind11.*" -E ".*external.*" -E ".*CMakeFiles.*" -E ".*test/.*.cpp.*"
Expand Down
37 changes: 32 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
cmake_minimum_required(VERSION 3.12 FATAL_ERROR)

project(microgbt CXX)
PROJECT(microgbt CXX)

OPTION(USE_OPENMP "Enable OpenMP" ON)

# project version
set(VERSION_MAJOR 0)
Expand All @@ -11,9 +13,23 @@ set(VERSION_PATCH 0)
# enable c++ language
enable_language(CXX)
set(CMAKE_CXX_STANDARD 11)
SET(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Look for eigen 3.3
find_package (Eigen3 3.3 REQUIRED NO_MODULE)
FIND_PACKAGE (Eigen3 3.3 REQUIRED NO_MODULE)



# Based on https://iscinumpy.gitlab.io/post/omp-on-high-sierra/
if(USE_OPENMP)
    # OpenMP was requested: abort configuration if the toolchain cannot provide it.
    FIND_PACKAGE(OpenMP REQUIRED)
else()
    # Without OpenMP the "#pragma omp" lines in the sources are unknown pragmas;
    # silence that warning on compilers that understand -Wno-unknown-pragmas.
    # MATCHES "Clang" is used instead of STREQUAL so that "AppleClang" (the
    # compiler ID reported on macOS) is covered as well as upstream "Clang".
    if((CMAKE_CXX_COMPILER_ID MATCHES "Clang")
            OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas")
    endif()
endif(USE_OPENMP)

# place binaries and libraries according to GNU standards
include(GNUInstallDirs)
Expand All @@ -27,7 +43,7 @@ if(CMAKE_CXX_COMPILER_ID MATCHES GNU)
endif()

if(UNIX OR MINGW OR CYGWIN)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O3 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O0 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funroll-loops")
endif()
Expand Down Expand Up @@ -65,6 +81,17 @@ include_directories( ${EIGEN3_INCLUDE_DIRS} )
# For Python integration
add_subdirectory(pybind11)

pybind11_add_module(gbtpy src/metrics/metric.h src/trees/tree.h src/GBT.h src/dataset.h src/metrics/logloss.h src/trees/treenode.h src/trees/split_info.h
pybind11_add_module(gbtpy src/metrics/metric.h src/trees/tree.h src/GBT.h src/dataset.h src/metrics/logloss.h
src/trees/treenode.h src/trees/split_info.h
src/utils.h
src/python_api.cpp)
#########################


#########################
# Link openMP
#########################
# When OpenMP is enabled, link the gbtpy Python module against the OpenMP C++ target.
if(USE_OPENMP)
    target_link_libraries(gbtpy
        PRIVATE OpenMP::OpenMP_CXX)
endif()
#########################
12 changes: 6 additions & 6 deletions NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,19 @@ cloc src
returns

```text
11 text files.
11 unique files.
12 text files.
12 unique files.
0 files ignored.
github.com/AlDanial/cloc v 1.82 T=0.01 s (1179.0 files/s, 110184.6 lines/s)
github.com/AlDanial/cloc v 1.82 T=0.01 s (809.9 files/s, 73226.7 lines/s)
-------------------------------------------------------------------------------
Language files blank comment code
-------------------------------------------------------------------------------
C/C++ Header 8 177 254 518
C/C++ Header 9 191 252 561
C++ 2 10 3 48
CMake 1 4 0 14
CMake 1 4 0 16
-------------------------------------------------------------------------------
SUM: 11 191 257 580
SUM: 12 205 255 625
-------------------------------------------------------------------------------
```
4 changes: 3 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
add_library(microgbt "" metrics/metric.h trees/tree.h GBT.h dataset.h metrics/logloss.h trees/treenode.h trees/split_info.h metrics/rmse.h)
add_library(microgbt "" metrics/metric.h trees/tree.h trees/split_info.h
GBT.h dataset.h metrics/logloss.h trees/treenode.h metrics/rmse.h
utils.h)


target_sources(
Expand Down
30 changes: 17 additions & 13 deletions src/GBT.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <vector>
#include<iostream>
#include <memory>
#include <omp.h>
#include "dataset.h"
#include "trees/tree.h"
#include "metrics/metric.h"
Expand Down Expand Up @@ -74,10 +75,6 @@ namespace microgbt {
return _lambda;
}

inline double gamma() const {
return _gamma;
}

inline double minSplitGain() const {
return _minSplitGain;
}
Expand Down Expand Up @@ -116,6 +113,9 @@ namespace microgbt {

void train(Dataset &trainSet, Dataset &validSet, int numBoostRound, int earlyStoppingRounds) {

// Allow nested threading in OpenMP
omp_set_nested(1);

std::vector<Tree> trees;
long bestIteration = 0;
double learningRate = _shrinkageRate;
Expand Down Expand Up @@ -204,21 +204,25 @@ namespace microgbt {
return score;
}

double predictFromTrees(const Eigen::RowVectorXd &x, const std::vector<Tree> &trees) {
double score = std::accumulate(trees.begin(), trees.end(), 0.0, [&x](double acc,
const Tree& tree){
return acc + tree.score(x);
});
double predictFromTrees(const Eigen::RowVectorXd &x, const std::vector<Tree> &trees) const {
size_t n = trees.size();
double score = 0.0;
#pragma omp parallel for default(none) shared(n, x, trees) reduction(+: score)
for (size_t i = 0; i < n; i ++){
score += trees[i].score(x);
}
return _metric->scoreToPrediction(score);
}

Vector predictDatasetFromTrees(const Dataset &trainSet, std::vector<Tree> &trees) {
Vector scores(trainSet.nRows(), 0.0);
for (size_t i = 0; i < trainSet.nRows(); i++) {
Vector predictDatasetFromTrees(const Dataset &trainSet, const std::vector<Tree> &trees) const {
size_t numSamples = trainSet.nRows();
Vector scores(numSamples);
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < numSamples; i++) {
scores[i] = predictFromTrees(trainSet.row(i), trees);
}

return scores;
}
};
}
} // namespace microgbt
9 changes: 4 additions & 5 deletions src/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,11 @@ namespace microgbt {
_y = Vector(dataset.y());
_X = Eigen::MatrixXd(rowSize, colSize);

int i = 0;
for (auto idx: rowIndices) {
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < rowIndices.size(); i++) {
for (int j = 0 ; j < colSize; j++) {
_X(i, j) = dataset.X()(idx, j);
_X(i, j) = dataset.X()(rowIndices[i], j);
}
i++;
}
}

Expand Down Expand Up @@ -89,4 +88,4 @@ namespace microgbt {
return _X.col(colIndex);
}
};
}
}
33 changes: 19 additions & 14 deletions src/metrics/logloss.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@

namespace microgbt {

using Vector = std::vector<double>;

/**
* Log loss metric (a.k.a. logistic loss)
* Log loss metric
*
* Logistic loss: y_i ln(1 + exp(-pred_i)) + (1-y_i) ln( 1 + exp(pred_i))
*
* Reference: https://en.wikipedia.org/wiki/Loss_functions_for_classification#Logistic_loss
*/
class LogLoss :
public Metric {
Expand Down Expand Up @@ -50,30 +50,35 @@ namespace microgbt {
}

Vector gradients(const Vector &predictions, const Vector &labels) const override {
Vector gradients;
std::transform(labels.begin(), labels.end(),
predictions.begin(), std::back_inserter(gradients), std::minus<double>());
unsigned long sz = predictions.size();
Vector gradients(sz);

#pragma omp parallel for schedule(static)
for (unsigned long i = 0 ; i < sz; i++){
gradients[i] = labels[i] - predictions[i];
}

return gradients;
}

Vector hessian(const Vector &predictions) const override {
Vector hessians;
unsigned long sz = predictions.size();
Vector hessians(sz);

std::transform(predictions.begin(), predictions.end(),
std::back_inserter(hessians),
[]( double prediction) {
return abs(logit(prediction)) * ( 1- abs(prediction));
});
#pragma omp parallel for schedule(static)
for (unsigned long i = 0 ; i < sz; i++){
hessians[i] = abs(logit(predictions[i])) * ( 1- abs(predictions[i]));
}

return hessians;
}

double lossAt(const Vector &scores, const Vector &y) const override {
size_t n = scores.size();
double loss = 0.0;

size_t n = scores.size();
for (size_t i = 0; i< n; i ++){
#pragma omp parallel for shared(y, scores) reduction(+: loss)
for (size_t i = 0; i < n; i ++){
loss += y[i] * log(clip(scores[i])) + (1 - y[i]) * log(1 - clip(scores[i]));
}

Expand Down
22 changes: 11 additions & 11 deletions src/metrics/metric.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ namespace microgbt {
using Vector = std::vector<double>;

/**
* A generic metric is defined by:
* 1) gradient method: How gradient is computed based on current prediction vector and target vector.
* 2) hessian method: How Hessian is computed based on current prediction vector.
* A generic metric that is defined by:
* * How the gradient is computed from the current prediction vector and the target vector.
* * How the Hessian is computed from the current prediction vector.
* * How the loss is evaluated from the current prediction vector and the target vector.
*
*/
Expand All @@ -21,7 +21,7 @@ namespace microgbt {
virtual ~Metric() = default;

/**
* Return the gradient of the metric at given predictions vector
* Compute the gradient at the given predictions vector.
*
* @param predictions
* @param labels
Expand All @@ -30,16 +30,16 @@ namespace microgbt {
virtual Vector gradients(const Vector &scores, const Vector &labels) const = 0;

/**
* Return the Hessian vector of the metric at given predictions vector
* Return the Hessian vector at the given scores vector.
*
* @param scores Current scores (predictions); this method takes no labels argument.
* @return
*/
virtual Vector hessian(const Vector &predictions) const = 0;
virtual Vector hessian(const Vector &scores) const = 0;

/**
* Evaluates the loss function at given prediction and target vector.
* Compute the loss at given predictions.
*
* @param predictions
* @param labels
Expand All @@ -48,11 +48,11 @@ namespace microgbt {
virtual double lossAt(const Vector &scores, const Vector &labels) const = 0;

/**
* Transformation used to convert Gradient Boosting Trees scores to prediction (classification or regression)
* Transformation that converts the summed Gradient Boosting Trees score into the final prediction
*
* @param score Sum of scores over all Gradient Boosting Trees
* @return GBT prediction
* @param score Sum of scores over all trees (of GBT)
* @return The final prediction obtained from the score
*/
virtual double scoreToPrediction(double score) const = 0;
};
}
} // namespace microgbt
7 changes: 4 additions & 3 deletions src/metrics/rmse.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@

namespace microgbt {

/**
* Root Mean Square Error (RMSE)
*/
using Vector = std::vector<double>;

class RMSE: public Metric {

public:
Expand All @@ -14,6 +13,7 @@ namespace microgbt {

Vector gradients(const Vector &predictions, const Vector &labels) const override {
Vector grads(predictions.size());

for (size_t i = 0; i < predictions.size(); i++) {
grads[i] = 2* (labels[i] - predictions[i]);
}
Expand All @@ -29,6 +29,7 @@ namespace microgbt {

double lossAt(const Vector &scores, const Vector &y) const override {
double loss = 0.0;

size_t n = scores.size();
for (size_t i = 0; i< n; i ++){
loss += pow(y[i] - scores[i], 2.0);
Expand Down
8 changes: 4 additions & 4 deletions src/python_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ gbt.def(py::init<std::map<std::string, double> >())
.def("best_iteration", &microgbt::GBT::getBestIteration);

// Train API
gbt.def("train", &microgbt::GBT::trainPython, "Python API for microGBT training",
gbt.def("train", &microgbt::GBT::trainPython,
"Python API for microGBT training",
py::call_guard<py::gil_scoped_release>(),
pybind11::arg("train_X"), pybind11::arg("train_y"),
pybind11::arg("valid_x"), pybind11::arg("valid_y"),
pybind11::arg("num_iterations"), pybind11::arg("early_stopping_rounds") = 5);
Expand All @@ -46,12 +48,10 @@ gbt.def("__repr__",
repr += std::to_string(a.minSplitGain());
repr += ",lambda:";
repr += std::to_string(a.lambda());
repr += ",gamma:";
repr += std::to_string(a.gamma());
repr += "]";
return repr;
}
);
}
} // PYBIND11_MODULE


2 changes: 1 addition & 1 deletion src/trees/split_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,4 @@ namespace microgbt {
return splitVector;
}
};
}
} // namespace microgbt
Loading

0 comments on commit 6fd7713

Please sign in to comment.