diff --git a/.travis.yml b/.travis.yml
index 1b12e16..d342407 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,8 +16,8 @@ script:
   - cmake ..
   - cmake --build .
   - ctest
-  - ctest -T memcheck
-  - valgrind --leak-check=full --track-fds=yes --error-exitcode=1 ./bin/unit_tests
+  - ./bin/unit_tests
 
 after_success:
   - coveralls --root .. -E ".*pybind11.*" -E ".*external.*" -E ".*CMakeFiles.*" -E ".*test/.*.cpp.*"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 380897f..79678ac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,8 @@
-cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
 
-project(microgbt CXX)
+PROJECT(microgbt CXX)
+
+OPTION(USE_OPENMP "Enable OpenMP" ON)
 
 # project version
 set(VERSION_MAJOR 0)
@@ -11,9 +13,23 @@ set(VERSION_PATCH 0)
 
 # enable c++ language
 enable_language(CXX)
 set(CMAKE_CXX_STANDARD 11)
+SET(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 # Look for eigen 3.3
-find_package (Eigen3 3.3 REQUIRED NO_MODULE)
+FIND_PACKAGE (Eigen3 3.3 REQUIRED NO_MODULE)
+
+
+
+# Based on https://iscinumpy.gitlab.io/post/omp-on-high-sierra/
+if(USE_OPENMP)
+    FIND_PACKAGE(OpenMP REQUIRED)
+else()
+    # Ignore unknown #pragma warning
+    if((CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+            OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas")
+    endif()
+endif(USE_OPENMP)
 
 # place binaries and libraries according to GNU standards
 include(GNUInstallDirs)
@@ -27,7 +43,7 @@ if(CMAKE_CXX_COMPILER_ID MATCHES GNU)
 endif()
 
 if(UNIX OR MINGW OR CYGWIN)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O3 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O0 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type")
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funroll-loops")
 endif()
@@ -65,9 +81,17 @@ include_directories( ${EIGEN3_INCLUDE_DIRS} )
 
 # For Python integration
 add_subdirectory(pybind11)
-pybind11_add_module(microgbtpy src/metrics/metric.h src/trees/tree.h
-        src/GBT.h src/dataset.h
-        src/metrics/logloss.h
+pybind11_add_module(microgbtpy src/metrics/metric.h src/trees/tree.h src/GBT.h src/dataset.h src/metrics/logloss.h
         src/trees/treenode.h src/trees/split_info.h
+        src/utils.h
         src/python_api.cpp)
-#########################
\ No newline at end of file
+#########################
+
+
+#########################
+# Link openMP
+#########################
+if(USE_OPENMP)
+    target_link_libraries(microgbtpy PRIVATE OpenMP::OpenMP_CXX)
+endif(USE_OPENMP)
+#########################
diff --git a/NOTES.md b/NOTES.md
index a363b8c..4e6c9bc 100644
--- a/NOTES.md
+++ b/NOTES.md
@@ -10,19 +10,19 @@ cloc src
 returns
 
 ```text
-      11 text files.
-      11 unique files.
+      12 text files.
+      12 unique files.
        0 files ignored.
 
-github.com/AlDanial/cloc v 1.82  T=0.01 s (1179.0 files/s, 110184.6 lines/s)
+github.com/AlDanial/cloc v 1.82  T=0.01 s (809.9 files/s, 73226.7 lines/s)
 -------------------------------------------------------------------------------
 Language                     files          blank        comment           code
 -------------------------------------------------------------------------------
-C/C++ Header                     8            177            254            518
+C/C++ Header                     9            191            252            561
 C++                              2             10              3             48
-CMake                            1              4              0             14
+CMake                            1              4              0             16
 -------------------------------------------------------------------------------
-SUM:                            11            191            257            580
+SUM:                            12            205            255            625
 -------------------------------------------------------------------------------
 ```
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2f2520a..2f9ba9e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,4 +1,6 @@
-add_library(microgbt "" metrics/metric.h trees/tree.h GBT.h dataset.h metrics/logloss.h trees/treenode.h trees/split_info.h metrics/rmse.h)
+add_library(microgbt "" metrics/metric.h trees/tree.h trees/split_info.h
+        GBT.h dataset.h metrics/logloss.h trees/treenode.h metrics/rmse.h
+        utils.h)
 
 
 target_sources(
diff --git a/src/GBT.h b/src/GBT.h
index e16506b..8dc7e0f 100644
--- a/src/GBT.h
+++ b/src/GBT.h
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include <omp.h>
 #include "dataset.h"
 #include "trees/tree.h"
 #include "metrics/metric.h"
@@ -74,10 +75,6 @@ namespace microgbt {
             return _lambda;
         }
 
-        inline double gamma() const {
-            return _gamma;
-        }
-
         inline double minSplitGain() const {
             return _minSplitGain;
         }
@@ -116,6 +113,9 @@ namespace microgbt {
 
         void train(Dataset &trainSet, Dataset &validSet, int numBoostRound, int earlyStoppingRounds) {
 
+            // Allow nested threading in OpenMP
+            omp_set_nested(1);
+
             std::vector<Tree> trees;
             long bestIteration = 0;
             double learningRate = _shrinkageRate;
@@ -204,21 +204,25 @@ namespace microgbt {
             return score;
         }
 
-        double predictFromTrees(const Eigen::RowVectorXd &x, const std::vector<Tree> &trees) {
-            double score = std::accumulate(trees.begin(), trees.end(), 0.0, [&x](double acc,
-                                                                                 const Tree& tree){
-                return acc + tree.score(x);
-            });
+        double predictFromTrees(const Eigen::RowVectorXd &x, const std::vector<Tree> &trees) const {
+            size_t n = trees.size();
+            double score = 0.0;
+            #pragma omp parallel for default(none) shared(n, x, trees) reduction(+: score)
+            for (size_t i = 0; i < n; i ++){
+                score += trees[i].score(x);
+            }
             return _metric->scoreToPrediction(score);
         }
 
-        Vector predictDatasetFromTrees(const Dataset &trainSet, std::vector<Tree> &trees) {
-            Vector scores(trainSet.nRows(), 0.0);
-            for (size_t i = 0; i < trainSet.nRows(); i++) {
+        Vector predictDatasetFromTrees(const Dataset &trainSet, const std::vector<Tree> &trees) const {
+            size_t numSamples = trainSet.nRows();
+            Vector scores(numSamples);
+            #pragma omp parallel for schedule(static)
+            for (size_t i = 0; i < numSamples; i++) {
                 scores[i] = predictFromTrees(trainSet.row(i), trees);
             }
             return scores;
         }
     };
-}
\ No newline at end of file
+} // namespace microgbt
\ No newline at end of file
diff --git a/src/dataset.h b/src/dataset.h
index 7aaf7ac..373c3a2 100644
--- a/src/dataset.h
+++ b/src/dataset.h
@@ -56,12 +56,11 @@ namespace microgbt {
             _y = Vector(dataset.y());
             _X = Eigen::MatrixXd(rowSize, colSize);
 
-            int i = 0;
-            for (auto idx: rowIndices) {
+            #pragma omp parallel for schedule(static)
+            for (size_t i = 0; i < rowIndices.size(); i++) {
                 for (int j = 0 ; j < colSize; j++) {
-                    _X(i, j) = dataset.X()(idx, j);
+                    _X(i, j) = dataset.X()(rowIndices[i], j);
                 }
-                i++;
             }
         }
 
@@ -89,4 +88,4 @@ namespace microgbt {
             return _X.col(colIndex);
         }
     };
-}
\ No newline at end of file
+}
diff --git a/src/metrics/logloss.h b/src/metrics/logloss.h
index ed6a7f6..711ad9f 100644
--- a/src/metrics/logloss.h
+++ b/src/metrics/logloss.h
@@ -6,12 +6,12 @@
 
 namespace microgbt {
 
+    using Vector = std::vector<double>;
+
     /**
-     * Log loss metric (a.k.a. logistic loss)
+     * Log loss metric
      *
      * Logistic loss: y_i ln(1 + exp(-pred_i)) + (1-y_i) ln( 1 + exp(pred_i))
-     *
-     * Reference: https://en.wikipedia.org/wiki/Loss_functions_for_classification#Logistic_loss
      */
     class LogLoss : public Metric {
@@ -50,30 +50,35 @@
         }
 
         Vector gradients(const Vector &predictions, const Vector &labels) const override {
-            Vector gradients;
-            std::transform(labels.begin(), labels.end(),
-                           predictions.begin(), std::back_inserter(gradients), std::minus<double>());
+            unsigned long sz = predictions.size();
+            Vector gradients(sz);
+
+            #pragma omp parallel for schedule(static)
+            for (unsigned long i = 0 ; i < sz; i++){
+                gradients[i] = labels[i] - predictions[i];
+            }
 
             return gradients;
         }
 
         Vector hessian(const Vector &predictions) const override {
-            Vector hessians;
+            unsigned long sz = predictions.size();
+            Vector hessians(sz);
 
-            std::transform(predictions.begin(), predictions.end(),
-                           std::back_inserter(hessians),
-                           []( double prediction) {
-                               return abs(logit(prediction)) * ( 1- abs(prediction));
-                           });
+            #pragma omp parallel for schedule(static)
+            for (unsigned long i = 0 ; i < sz; i++){
+                hessians[i] = abs(logit(predictions[i])) * ( 1- abs(predictions[i]));
+            }
 
             return hessians;
         }
 
         double lossAt(const Vector &scores, const Vector &y) const override {
+            size_t n = scores.size();
             double loss = 0.0;
-            size_t n = scores.size();
 
-            for (size_t i = 0; i< n; i ++){
+            #pragma omp parallel for shared(y, scores) reduction(+: loss)
+            for (size_t i = 0; i < n; i ++){
                 loss += y[i] * log(clip(scores[i])) + (1 - y[i]) * log(1 - clip(scores[i]));
             }
diff --git a/src/metrics/metric.h b/src/metrics/metric.h
index 2ed0562..0fd7e89 100644
--- a/src/metrics/metric.h
+++ b/src/metrics/metric.h
@@ -8,9 +8,9 @@ namespace microgbt {
     using Vector = std::vector<double>;
 
     /**
-     * A generic metric is defined by:
-     * 1) gradient method: How gradient is computed based on current prediction vector and target vector.
-     * 2) hessian method: How Hessian is computed based on current prediction vector.
+     * A generic metric that is defined by:
+     *  * How the gradient is computed based on the current prediction vector and target vector.
+     *  * How the Hessian is computed based on the current prediction vector.
      *  * Loss evaluation based on current prediction vector and target vector.
      *
      */
@@ -21,7 +21,7 @@ namespace microgbt {
         virtual ~Metric() = default;
 
         /**
-         * Return the gradient of the metric at given predictions vector
+         * Compute the gradient at given predictions vector
          *
         * @param predictions
         * @param labels
         * @return
         */
        virtual Vector gradients(const Vector &scores, const Vector &labels) const = 0;
 
        /**
-        * Return the Hessian vector of the metric at given predictions vector
+        * Return the Hessian vector at given predictions vector
         *
         * @param predictions
         * @param labels
         * @return
         */
-       virtual Vector hessian(const Vector &predictions) const = 0;
+       virtual Vector hessian(const Vector &scores) const = 0;
 
        /**
-        * Evaluates the loss function at given prediction and target vector.
+        * Compute the loss at given predictions.
         *
         * @param predictions
         * @param labels
@@ -48,11 +48,11 @@ namespace microgbt {
        virtual double lossAt(const Vector &scores, const Vector &labels) const = 0;
 
        /**
-        * Transformation used to convert Gradient Boosting Trees scores to prediction (classification or regression)
+        * Transformation required from Gradient Boosting Trees scores to final prediction
         *
-        * @param score Sum of scores over all Gradient Boosting Trees
-        * @return GBT prediction
+        * @param score Sum of scores over all trees (of GBT)
+        * @return
         */
        virtual double scoreToPrediction(double score) const = 0;
    };
-}
\ No newline at end of file
+} // namespace microgbt
\ No newline at end of file
diff --git a/src/metrics/rmse.h b/src/metrics/rmse.h
index 41b54d3..fa3161a 100644
--- a/src/metrics/rmse.h
+++ b/src/metrics/rmse.h
@@ -3,9 +3,8 @@
 
 namespace microgbt {
 
-    /**
-     * Root Mean Square Error (RMSE)
-     */
+    using Vector = std::vector<double>;
+
     class RMSE: public Metric {
 
     public:
@@ -14,6 +13,7 @@ namespace microgbt {
         Vector gradients(const Vector &predictions, const Vector &labels) const override {
             Vector grads(predictions.size());
+
             for (size_t i = 0; i < predictions.size(); i++) {
                 grads[i] = 2* (labels[i] - predictions[i]);
             }
@@ -29,6 +29,7 @@ namespace microgbt {
         double lossAt(const Vector &scores, const Vector &y) const override {
             double loss = 0.0;
+
             size_t n = scores.size();
             for (size_t i = 0; i< n; i ++){
                 loss += pow(y[i] - scores[i], 2.0);
diff --git a/src/python_api.cpp b/src/python_api.cpp
index 3bb4f90..a7f67a3 100644
--- a/src/python_api.cpp
+++ b/src/python_api.cpp
@@ -22,7 +22,9 @@ gbt.def(py::init >())
     .def("best_iteration", &microgbt::GBT::getBestIteration);
 
 // Train API
-gbt.def("train", &microgbt::GBT::trainPython, "Python API for microgbtpy training",
+gbt.def("train", &microgbt::GBT::trainPython,
+        "Python API for microGBT training",
+        py::call_guard(),
         pybind11::arg("train_X"), pybind11::arg("train_y"),
         pybind11::arg("valid_x"), pybind11::arg("valid_y"),
         pybind11::arg("num_iterations"), pybind11::arg("early_stopping_rounds") = 5);
@@ -46,12 +48,8 @@ gbt.def("__repr__",
             repr += std::to_string(a.minSplitGain());
             repr += ",lambda:";
             repr += std::to_string(a.lambda());
-            repr += ",gamma:";
-            repr += std::to_string(a.gamma());
             repr += "]";
             return repr;
         }
 );
-}
-
-
+} // PYBIND11_MODULE
diff --git a/src/trees/split_info.h b/src/trees/split_info.h
index 319eb62..34a05b0 100644
--- a/src/trees/split_info.h
+++ b/src/trees/split_info.h
@@ -89,4 +89,4 @@ namespace microgbt {
             return splitVector;
         }
     };
-}
\ No newline at end of file
+} // namespace microgbt
diff --git a/src/trees/tree.h b/src/trees/tree.h
index a861871..8984053 100644
--- a/src/trees/tree.h
+++ b/src/trees/tree.h
@@ -28,7 +28,7 @@ namespace microgbt {
         double _lambda, _minSplitGain, _minTreeSize;
 
         /** Root of tree */
-        std::shared_ptr<TreeNode> _root;
+        std::shared_ptr<TreeNode> root;
 
     public:
 
@@ -54,9 +54,9 @@ namespace microgbt {
                    const Vector &hessian,
                    double shrinkage) {
 
-            this->_root = std::unique_ptr<TreeNode>(new TreeNode(_lambda, _minSplitGain, _minTreeSize, _maxDepth));
+            this->root = std::unique_ptr<TreeNode>(new TreeNode(_lambda, _minSplitGain, _minTreeSize, _maxDepth));
             int depth = 0;
-            this->_root->build(trainSet, previousPreds, gradient, hessian, shrinkage, depth);
+            this->root->build(trainSet, previousPreds, gradient, hessian, shrinkage, depth);
         }
 
         /**
@@ -66,7 +66,7 @@ namespace microgbt {
          * @return Score of tree
          */
         double score(const Eigen::RowVectorXd &sample) const {
-            return _root->score(sample);
+            return root->score(sample);
         }
     };
-}
\ No newline at end of file
+} // namespace microgbt
\ No newline at end of file
diff --git a/src/trees/treenode.h b/src/trees/treenode.h
index 157d14a..b85313c 100644
--- a/src/trees/treenode.h
+++ b/src/trees/treenode.h
@@ -7,15 +7,18 @@
 #include
 #include
 #include
+#include
 #include "../dataset.h"
 #include "split_info.h"
+#include "../utils.h"
 
 namespace microgbt {
 
     using Vector = std::vector<double>;
 
+
     /**
      * A node of a tree
      */
@@ -24,15 +27,15 @@
         int _maxDepth;
         double _lambda, _minSplitGain, _minTreeSize;
         bool isLeaf = false;
-        std::unique_ptr<TreeNode> _leftSubTree;
-        std::unique_ptr<TreeNode> _rightSubTree;
-        long _splitFeatureIndex;
+        std::unique_ptr<TreeNode> leftSubTree;
+        std::unique_ptr<TreeNode> rightSubTree;
+        long splitFeatureIndex;
 
         /**
          * Numeric value on which a binary tree split took place
         */
-        double _splitNumericValue;
-        double _weight = 0.0;
+        double splitNumericValue;
+        double weight = 0.0;
 
         /**
          * Return sorted indices from a vector
@@ -76,8 +79,6 @@
    public:
 
         TreeNode(double lambda, double minSplitGain, double minTreeSize, int maxDepth){
-            _splitNumericValue = 0.0;
-            _splitFeatureIndex = -1;
             _lambda = lambda;
             _minSplitGain = minSplitGain;
             _maxDepth = maxDepth;
@@ -110,8 +111,10 @@
         * @param lambd Regularization xgboost parameter, see Eqn. 7 in [1]
         * @return
         */
-        inline double calc_split_gain(double G, double H, double G_l, double H_l, double G_r, double H_r) const {
-            return objective(G_l, H_l) + objective(G_r, H_r) - objective(G, H) / 2.0; // TODO: minus gamma ?
+        inline double calc_split_gain(double G, double H, double G_l, double H_l) const {
+            double G_r = G - G_l;
+            double H_r = H - H_l;
+            return objective(G_l, H_l) + objective(G_r, H_r) - objective(G, H) / 2.0; // TODO: minus \gamma
         }
 
         /**
@@ -127,17 +130,18 @@
         */
        inline double calc_leaf_weight(const Vector &gradient, const Vector &hessian) const {
-            return accumulate(gradient.begin(), gradient.end(), 0.0)
-                   / (accumulate(hessian.begin(), hessian.end(), 0.0) + _lambda);
+            return par_simd_accumulate(gradient)
+                   / (par_simd_accumulate(hessian) + _lambda);
        }
 
        /**
         * Returns an optimal binary split for a given feature index of a Dataset.
         *
-        * @param trainSet Training dataset
-        * @param gradient Gradient vector
-        * @param hessian Hessian vector
-        * @param featureId Feature index
+        * @param trainSet
+        * @param previousPreds
+        * @param gradient
+        * @param hessian
+        * @param featureId
         * @return
         */
        SplitInfo optimumGainByFeature(const Dataset &trainSet,
@@ -145,31 +149,38 @@
                                       const Vector &hessian,
                                       int featureId) const {
 
-            double G = accumulate(gradient.begin(), gradient.end(), 0.0);
-            double H = accumulate(hessian.begin(), hessian.end(), 0.0);
-
-            double G_l = 0.0, H_l = 0.0, bestGain = std::numeric_limits<double>::min(), bestSplitNumericValue = 0;
-            size_t bestSortedIndex = 0;
+            double G = par_simd_accumulate(gradient);
+            double H = par_simd_accumulate(hessian);
 
             // Sort the feature by value and return permutation of indices (i.e., argsort)
-            std::vector<size_t> sortedInstanceIds = sortSamplesByFeature(trainSet, featureId);
+            std::vector<size_t> sortedInstanceIds = TreeNode::sortSamplesByFeature(trainSet, featureId);
+
+            // Cumulative sum of gradients and Hessian
+            Vector cum_sum_G(trainSet.nRows());
+            Vector cum_sum_H(trainSet.nRows());
+            double cum_sum_g = 0.0, cum_sum_h = 0.0;
+            for (size_t i = 0 ; i < trainSet.nRows(); i++) {
+                cum_sum_g += gradient[sortedInstanceIds[i]];
+                cum_sum_h += hessian[sortedInstanceIds[i]];
+                cum_sum_G[i] = cum_sum_g;
+                cum_sum_H[i] = cum_sum_h;
+            }
 
             // For each feature, compute split gain and keep the split index with maximum gain
+            Vector gainPerOrderedSampleIndex(trainSet.nRows());
+            #pragma omp parallel for shared(gainPerOrderedSampleIndex) schedule(static)
             for (size_t i = 0 ; i < trainSet.nRows(); i++){
-                G_l += gradient[sortedInstanceIds[i]];
-                H_l += hessian[sortedInstanceIds[i]];
-                double G_r = G - G_l;
-                double H_r = H - H_l;
-                double currentGain = calc_split_gain(G, H, G_l, H_l, G_r, H_r);
-
-                if ( currentGain > bestGain) {
-                    bestGain = currentGain;
-                    bestSplitNumericValue = trainSet.X()(sortedInstanceIds[i], featureId);
-                    bestSortedIndex = i + 1;
-                }
-
+                gainPerOrderedSampleIndex[i] = calc_split_gain(G, H, cum_sum_G[i], cum_sum_H[i]);
             }
 
+            long bestGainIndex =
+                    std::max_element(gainPerOrderedSampleIndex.begin(), gainPerOrderedSampleIndex.end())
+                    - gainPerOrderedSampleIndex.begin();
+            double bestGain = gainPerOrderedSampleIndex[bestGainIndex];
+            double bestSplitNumericValue = trainSet.X()(sortedInstanceIds[bestGainIndex], featureId);
+            size_t bestSortedIndex = bestGainIndex + 1;
+
+            // Return a SplitInfo object
             std::vector<size_t> bestLeftInstances(sortedInstanceIds.begin(), sortedInstanceIds.begin() + bestSortedIndex);
             std::vector<size_t> bestRightInstances(sortedInstanceIds.begin() + bestSortedIndex, sortedInstanceIds.end());
@@ -203,60 +214,76 @@
                    double shrinkage,
                    int depth) {
 
+            size_t numFeatures = trainSet.numFeatures();
+
             // Check if depth is reached
             if (depth > _maxDepth) {
                 this->isLeaf = true;
-                this->_weight = this->calc_leaf_weight(gradient, hessian) * shrinkage;
+                this->weight = this->calc_leaf_weight(gradient, hessian) * shrinkage;
                 return;
             }
 
             // Check if # of sample is too small
             if ( trainSet.nRows() <= _minTreeSize) {
                 this->isLeaf = true;
-                this->_weight = this->calc_leaf_weight(gradient, hessian) * shrinkage;
+                this->weight = this->calc_leaf_weight(gradient, hessian) * shrinkage;
                 return;
             }
 
             // 1) For each tree node, enumerate over all features:
             // 2) For each feature, sorted the instances by feature numeric value
             //    - Compute gain for every feature (column of design matrix)
-            std::vector<SplitInfo> splitInfoPerFeature(trainSet.numFeatures());
-            for (size_t featureId = 0; featureId < trainSet.numFeatures(); featureId++) {
-                splitInfoPerFeature[featureId] = optimumGainByFeature(trainSet, gradient, hessian, featureId);
+            std::vector<SplitInfo> gainPerFeature(numFeatures);
+            #pragma omp parallel for schedule(static)
+            for (size_t featureId = 0; featureId < numFeatures; featureId++) {
+                gainPerFeature[featureId] = optimumGainByFeature(trainSet, gradient, hessian, featureId);
             }
 
             // 3) Use a linear scan to decide the best split along that feature (if categorical perform Mean Target Encoding)
             // 4) Take the best split solution (that maximises gain reduction) over all features
             long bestFeatureId =
-                    std::max_element(splitInfoPerFeature.begin(), splitInfoPerFeature.end()) - splitInfoPerFeature.begin();
-            SplitInfo bestGain = splitInfoPerFeature[bestFeatureId];
+                    std::max_element(gainPerFeature.begin(), gainPerFeature.end()) - gainPerFeature.begin();
+            SplitInfo bestGain = gainPerFeature[bestFeatureId];
 
             // Check if best gain is less than minimum split gain (threshold)
             if (bestGain.bestGain() < this->_minSplitGain) {
                 this->isLeaf = true;
-                this->_weight = this->calc_leaf_weight(gradient, hessian) * shrinkage;
+                this->weight = this->calc_leaf_weight(gradient, hessian) * shrinkage;
                 return;
             }
 
-            this->_splitFeatureIndex = bestFeatureId;
-            this->_splitNumericValue = bestGain.splitValue();
-
-
-            // Recurse on the left subtree
-            this->_leftSubTree = std::unique_ptr<TreeNode>(new TreeNode(_lambda, _minSplitGain, _minTreeSize, _maxDepth));
-            Dataset leftDataset(trainSet, bestGain, SplitInfo::Side::Left);
-            Vector leftGradient = bestGain.split(gradient, SplitInfo::Side::Left);
-            Vector leftHessian = bestGain.split(hessian, SplitInfo::Side::Left);
-            Vector leftPreviousPreds = bestGain.split(previousPreds, SplitInfo::Side::Left);
-            _leftSubTree->build(leftDataset, leftPreviousPreds, leftGradient, leftHessian, shrinkage, depth + 1);
-
-            // Recurse on the right subtree
-            this->_rightSubTree = std::unique_ptr<TreeNode>(new TreeNode(_lambda, _minSplitGain, _minTreeSize, _maxDepth));
-            Dataset rightDataset(trainSet, bestGain, SplitInfo::Side::Right);
-            Vector rightGradient = bestGain.split(gradient, SplitInfo::Side::Right);
-            Vector rightHessian = bestGain.split(hessian, SplitInfo::Side::Right);
-            Vector rightPreviousPreds = bestGain.split(previousPreds, SplitInfo::Side::Right);
-            _rightSubTree->build(rightDataset, rightPreviousPreds, rightGradient, rightHessian, shrinkage, depth + 1);
+            this->splitFeatureIndex = bestFeatureId;
+            this->splitNumericValue = bestGain.splitValue();
+
+
+            #pragma omp parallel sections
+            {
+                // Recurse on the left subtree
+                #pragma omp section
+                {
+                    Dataset leftDataset(trainSet, bestGain, SplitInfo::Side::Left);
+                    Vector leftGradient = bestGain.split(gradient, SplitInfo::Side::Left);
+                    Vector leftHessian = bestGain.split(hessian, SplitInfo::Side::Left);
+                    Vector leftPreviousPreds = bestGain.split(previousPreds, SplitInfo::Side::Left);
+                    this->leftSubTree = std::unique_ptr<TreeNode>(
+                            new TreeNode(_lambda, _minSplitGain, _minTreeSize, _maxDepth));
+                    leftSubTree->build(leftDataset, leftPreviousPreds, leftGradient, leftHessian, shrinkage, depth + 1);
+                }
+
+
+                // Recurse on the right subtree
+                #pragma omp section
+                {
+                    Dataset rightDataset(trainSet, bestGain, SplitInfo::Side::Right);
+                    Vector rightGradient = bestGain.split(gradient, SplitInfo::Side::Right);
+                    Vector rightHessian = bestGain.split(hessian, SplitInfo::Side::Right);
+                    Vector rightPreviousPreds = bestGain.split(previousPreds, SplitInfo::Side::Right);
+
+                    this->rightSubTree = std::unique_ptr<TreeNode>(
+                            new TreeNode(_lambda, _minSplitGain, _minTreeSize, _maxDepth));
+                    rightSubTree->build(rightDataset, rightPreviousPreds, rightGradient, rightHessian, shrinkage, depth + 1);
+                }
+            }
         }
 
         /**
@@ -267,12 +294,12 @@
          */
         double score(const Eigen::RowVectorXd &sample) const {
             if (this->isLeaf) {
-                return this->_weight;
-            } else if (sample[this->_splitFeatureIndex] < this->_splitNumericValue) {
-                return this->_leftSubTree->score(sample);
+                return this->weight;
+            } else if (sample[this->splitFeatureIndex] < this->splitNumericValue) {
+                return this->leftSubTree->score(sample);
             } else {
-                return this->_rightSubTree->score(sample);
+                return this->rightSubTree->score(sample);
             }
         }
     };
-}
+} // namespace microgbt
diff --git a/src/utils.h b/src/utils.h
new file mode 100644
index 0000000..d3b1aca
--- /dev/null
+++ b/src/utils.h
@@ -0,0 +1,19 @@
+#pragma once
+#include <vector>
+
+
+namespace microgbt {
+
+    using Vector = std::vector<double>;
+
+    static double par_simd_accumulate(const Vector& vector) {
+        size_t n = vector.size();
+        double accumulate = 0.0;
+        #pragma omp simd reduction(+: accumulate)
+        for (size_t i = 0; i < n; i ++){
+            accumulate += vector[i];
+        }
+
+        return accumulate;
+    }
+} // namespace microgbt
diff --git a/test/test_dataset.cpp b/test/test_dataset.cpp
index b8851cf..d7f238a 100644
--- a/test/test_dataset.cpp
+++ b/test/test_dataset.cpp
@@ -4,7 +4,7 @@
 
 TEST(Dataset, DefaultConstructor)
 {
-    int m = 2, n = 3;
+    size_t m = 2, n = 3;
     Eigen::MatrixXd A(m, n);
     microgbt::Vector y = {1.0, 2.0, 3.0};
     microgbt::Dataset dataset(A, y);
@@ -16,7 +16,7 @@ TEST(Dataset, DefaultConstructor)
 
 TEST(Dataset, Constructor)
 {
-    int m = 2, n = 3;
+    size_t m = 2, n = 3;
     Eigen::MatrixXd A(m, n);
     microgbt::Vector y = {1.0, 2.0, 3.0};
     microgbt::Dataset dataset(A, y);
diff --git a/test/test_split_info.cpp b/test/test_split_info.cpp
index 55737e1..dbc1538 100644
--- a/test/test_split_info.cpp
+++ b/test/test_split_info.cpp
@@ -3,7 +3,7 @@
 
 TEST(microgbt, SplitInfo)
 {
-    microgbt::SplitInfo splitInfo(0.0, 1.0);
-    ASSERT_NEAR(splitInfo.bestGain(), 0.0, 1.0e-11);
-    ASSERT_NEAR(splitInfo.splitValue(), 1.0, 1.0e-11);
+    microgbt::SplitInfo gain(0.0, 1.0);
+    ASSERT_NEAR(gain.bestGain(), 0.0, 1.0e-11);
+    ASSERT_NEAR(gain.splitValue(), 1.0, 1.0e-11);
 }
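
Side note (illustration only, not part of the patch): the parallelism added above reduces to two OpenMP reduction patterns, the thread-level "parallel for ... reduction(+: ...)" used in GBT::predictFromTrees and LogLoss::lossAt, and the vectorised "omp simd reduction" used by par_simd_accumulate in src/utils.h. The standalone sketch below shows both patterns in isolation; the file name and values are made up, and it assumes only an OpenMP-capable compiler, e.g. g++ -std=c++11 -fopenmp reduction_sketch.cpp.

// reduction_sketch.cpp -- hypothetical example, not part of the repository.
// Build: g++ -std=c++11 -fopenmp reduction_sketch.cpp -o reduction_sketch
#include <cstdio>
#include <vector>

// Mirrors par_simd_accumulate() from src/utils.h: a SIMD-vectorised sum
// over a std::vector<double>.
static double simd_sum(const std::vector<double>& v) {
    double acc = 0.0;
    #pragma omp simd reduction(+: acc)
    for (size_t i = 0; i < v.size(); i++) {
        acc += v[i];
    }
    return acc;
}

int main() {
    const std::vector<double> scores(1000, 0.5);
    const size_t n = scores.size();

    // Mirrors the thread-level pattern in GBT::predictFromTrees() and
    // LogLoss::lossAt(): each thread keeps a private partial sum and
    // OpenMP combines the partial sums when the loop finishes.
    double total = 0.0;
    #pragma omp parallel for reduction(+: total)
    for (size_t i = 0; i < n; i++) {
        total += scores[i];
    }

    std::printf("parallel-for sum = %f, simd sum = %f\n", total, simd_sum(scores));
    return 0;
}

Without -fopenmp both pragmas are simply ignored and the loops run serially, which is also why the CMake change above only adds -Wno-unknown-pragmas when USE_OPENMP is OFF.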