-
Notifications
You must be signed in to change notification settings - Fork 73
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Layer] enhance ConcatLayer algorithms for efficient concatenation an…
…d split This PR renovates reshape/concatenation algorithms to facilitate efficient concatenation and split in ConcatLayer. Previously, dimension 2 (height) was set as a standard axis to operate concatenation. However, this causes an overhead by copying a tensor size of 1 when the concat dimension is 3 (width). The new algorithm consolidates all dimensions to the first and last axes based on the concat dimension, sets the standard axis to be 3, and performs concat and split. **Changes proposed in this PR:** - Revise creating helper dimension logic in finalize(). - Update forwarding() and calcDeriv() workflow to be efficient. - Add descriptions for the new concat algorithm. Signed-off-by: Donghyeon Jeong <[email protected]>
- Loading branch information
Showing
1 changed file
with
127 additions
and
77 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
* @date 27 Oct 2020 | ||
* @see https://github.com/nnstreamer/nntrainer | ||
* @author Jijoong Moon <[email protected]> | ||
* @author Donghyeon Jeong <[email protected]> | ||
* @bug No known bugs except for NYI items | ||
* @brief This is Concat Layer Class for Neural Network | ||
* | ||
|
@@ -70,85 +71,128 @@ void ConcatLayer::finalize(InitLayerContext &context) { | |
context.setOutputDimensions({output_dim}); | ||
|
||
/** | ||
* Setup output_reshape_helper to which output will be reshaped in forwarding | ||
* to facilitate easier processing. | ||
* The following helper shapes facilitate efficient concatenation and split of | ||
* the data. | ||
* | ||
* The helper shapes are created by consolidating all the dimensions before | ||
* the concat dimension to the first axis and all the remaining dimensions to | ||
* the last axis. | ||
* | ||
* @note This is possible since the data starting from the concat dimension to | ||
* the end is always continuous. | ||
* | ||
* @example the following shows how the helper dimension will look with given | ||
* inputs and concat dimension. | ||
* | ||
* | cat_dim 1 | cat_dim 2 | cat_dim 3 | ||
* --------|-----------|-----------|----------- | ||
* input0 | 2:1:2:3 | 1:2:1:3 | 1:2:2:3 | ||
* input1 | 2:3:2:3 | 1:2:3:3 | 1:2:2:1 | ||
* --------|-----------|-----------|----------- | ||
* helper0 | 2:1:1:6 | 2:1:1:3 | 4:1:1:3 | ||
* helper1 | 2:1:1:18 | 2:1:1:9 | 4:1:1:1 | ||
* | ||
* The helper shape consolidates all the dimensions before the axis | ||
* together and all the dimensions after the axis to facilitate | ||
* easier splitting of the data. | ||
*/ | ||
leading_helper_dim = 1; | ||
/// Setup output_reshape_helper (how output should be reshaped) | ||
output_reshape_helper.channel(1); | ||
output_reshape_helper.height(1); | ||
output_reshape_helper.width(1); | ||
for (unsigned int idx = 1; idx < concat_dimension; ++idx) { | ||
leading_helper_dim *= output_dim.getTensorDim(idx); | ||
} | ||
|
||
output_reshape_helper.height(output_dim.getTensorDim(concat_dimension)); | ||
|
||
for (unsigned int idx = concat_dimension + 1; | ||
idx < ml::train::TensorDim::getNumDim(); ++idx) { | ||
for (unsigned int axis = concat_dimension; | ||
axis < ml::train::TensorDim::getNumDim(); ++axis) { | ||
output_reshape_helper.width(output_reshape_helper.width() * | ||
output_dim.getTensorDim(idx)); | ||
output_dim.getTensorDim(axis)); | ||
} | ||
|
||
/** | ||
* Setup input_reshape_helper to which inputs will be reshaped in forwarding | ||
* to facilitate easier processing. | ||
*/ | ||
/// Setup input_reshape_helper (how inputs should be reshaped) | ||
input_reshape_helper.resize(input_dims.size()); | ||
|
||
for (unsigned int idx = 0; idx < input_reshape_helper.size(); idx++) { | ||
input_reshape_helper[idx] = output_reshape_helper; | ||
input_reshape_helper[idx].height( | ||
input_dims[idx].getTensorDim(concat_dimension)); | ||
input_reshape_helper[idx].channel(1); | ||
input_reshape_helper[idx].height(1); | ||
input_reshape_helper[idx].width(1); | ||
|
||
for (unsigned int axis = concat_dimension; | ||
axis < ml::train::TensorDim::getNumDim(); ++axis) { | ||
|
||
input_reshape_helper[idx].width(input_reshape_helper[idx].width() * | ||
input_dims[idx].getTensorDim(axis)); | ||
} | ||
} | ||
|
||
leading_helper_dim = 1; | ||
for (unsigned int idx = 1; idx < concat_dimension; ++idx) { | ||
leading_helper_dim *= output_dim.getTensorDim(idx); | ||
} | ||
|
||
setBatch(input_dims[SINGLE_INOUT_IDX].batch()); | ||
} | ||
|
||
void ConcatLayer::forwarding(RunLayerContext &context, bool training) { | ||
/** | ||
* Forwarding in ConcatLayer works as follows | ||
* | ||
* in1 in2 in3 output | ||
* |---0---| |----3----| |--6--| |---0---||----3----||--6--| | ||
* |---1---| |----4----| |--7--| => |---1---||----4----||--7--| | ||
* |---2---| |----5----| |--8--| |---2---||----5----||--8--| | ||
* | ||
* @note the number of each block indicates the order of copy to output | ||
* | ||
* @todo avoid copy by creating input here as a shared_tensor of the output | ||
* here and then this layer can be in_place as well | ||
*/ | ||
Tensor &output = context.getOutput(SINGLE_INOUT_IDX); | ||
|
||
// Store original input tensor dimensions, then reshape input tensors. | ||
std::vector<Tensor> input_tensors; | ||
std::vector<TensorDim> original_input_dims; | ||
const TensorDim out_dim = output.getDim(); | ||
output.reshape(output_reshape_helper); | ||
unsigned int output_width_offset = 0; | ||
TensorDim::TensorType tensor_type = output.getTensorType(); | ||
|
||
for (unsigned int idx = 0; idx < context.getNumInputs(); idx++) { | ||
Tensor &input = context.getInput(idx); | ||
original_input_dims.push_back(input.getDim()); | ||
input.reshape(input_reshape_helper[idx]); | ||
input_tensors.push_back(input); | ||
} | ||
|
||
// Store the original output tensor dimension, then reshape the output tensor. | ||
Tensor &output = context.getOutput(SINGLE_INOUT_IDX); | ||
const TensorDim original_output_dim = output.getDim(); | ||
output.reshape(output_reshape_helper); | ||
const TensorDim in_dim = input.getDim(); | ||
auto const &irh = input_reshape_helper[idx]; | ||
input.reshape(irh); | ||
unsigned int data_copy_size = irh.width(); | ||
|
||
// Search for an axis and concatenate tensors. | ||
const TensorDim out_dim = output.getDim(); | ||
const TensorDim in_dim = context.getInput(0).getDim(); | ||
|
||
for (int axis = 0; axis < 4; ++axis) { | ||
if (out_dim[axis] != in_dim[axis]) { | ||
/// @todo Currently a new output tensor is created. This can be optimized. | ||
Tensor result = Tensor::cat(input_tensors, axis); | ||
output.copy(result); | ||
break; | ||
/** loop over the dimensions before the concat dimension */ | ||
if (in_dim.getDataType() == TensorDim::DataType::FP32) { | ||
/** copy continous tensor data (reshaped width) */ | ||
for (unsigned int batch = 0; batch < output.batch(); batch++) { | ||
Tensor dest_tensor = Tensor::Map<float>( | ||
output.getAddress<float>(batch, 0, 0, output_width_offset), | ||
data_copy_size * sizeof(float), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
const Tensor source_tensor = | ||
Tensor::Map<float>(input.getAddress<float>(batch, 0, 0, 0), | ||
data_copy_size * sizeof(float), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
dest_tensor.copy(source_tensor); | ||
} | ||
} else if (in_dim.getDataType() == TensorDim::DataType::FP16) { | ||
#ifdef ENABLE_FP16 | ||
/** copy continous tensor data (reshaped width) */ | ||
for (unsigned int batch = 0; batch < output.batch(); batch++) { | ||
Tensor dest_tensor = Tensor::Map<_FP16>( | ||
output.getAddress<_FP16>(batch, 0, 0, output_width_offset), | ||
data_copy_size * sizeof(_FP16), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
const Tensor source_tensor = | ||
Tensor::Map<_FP16>(input.getAddress<_FP16>(batch, 0, 0, 0), | ||
data_copy_size * sizeof(_FP16), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
dest_tensor.copy(source_tensor); | ||
} | ||
#else | ||
throw std::invalid_argument("Error: enable-fp16 is not enabled"); | ||
#endif | ||
} | ||
} | ||
|
||
// Revert the tensors' dimensions back to their original shape. | ||
for (unsigned int idx = 0; idx < context.getNumInputs(); idx++) { | ||
Tensor &in = context.getInput(idx); | ||
in.reshape(original_input_dims[idx]); | ||
output_width_offset += irh.width(); | ||
input.reshape(in_dim); | ||
} | ||
|
||
output.reshape(original_output_dim); | ||
output.reshape(out_dim); | ||
} | ||
|
||
void ConcatLayer::incremental_forwarding(RunLayerContext &context, | ||
|
@@ -199,62 +243,68 @@ void ConcatLayer::incremental_forwarding(RunLayerContext &context, | |
|
||
void ConcatLayer::calcDerivative(RunLayerContext &context) { | ||
/** | ||
* calcDerivative in ConcatLayer works as follows | ||
* | ||
* output in1 in2 in3 | ||
* |---0---||----3----||--6--| |---0---| |----3----| |--6--| | ||
* |---1---||----4----||--7--| => |---1---| |----4----| |--7--| | ||
* |---2---||----5----||--8--| |---2---| |----5----| |--8--| | ||
* | ||
* @note the number of each block indicates the order of copy to inputs | ||
* | ||
* @todo avoid copy by creating input here as a shared_tensor of the output | ||
* here and then this layer can be in_place as well | ||
*/ | ||
|
||
Tensor output = context.getIncomingDerivative(SINGLE_INOUT_IDX); | ||
|
||
output.reshape(output_reshape_helper); | ||
unsigned int output_height_offset = 0; | ||
unsigned int data_copy_size = output_reshape_helper.width(); | ||
unsigned int output_width_offset = 0; | ||
TensorDim::TensorType tensor_type = output.getTensorType(); | ||
|
||
for (unsigned int idx = 0; idx < context.getNumInputs(); idx++) { | ||
Tensor &input = context.getOutgoingDerivative(idx); | ||
const TensorDim in_dim = input.getDim(); | ||
auto const &irh = input_reshape_helper[idx]; | ||
input.reshape(irh); | ||
unsigned int data_copy_size = irh.width(); | ||
|
||
if (in_dim.getDataType() == TensorDim::DataType::FP32) { | ||
/** loop over the dimensions before the concat dimension */ | ||
for (unsigned int batch = 0; batch < output.batch(); batch++) { | ||
/** loop over the concat dimension itself */ | ||
for (unsigned int count = 0; count < irh.height(); count++) { | ||
const Tensor source_tensor = Tensor::Map<float>( | ||
output.getAddress<float>(batch, 0, output_height_offset + count, 0), | ||
data_copy_size * sizeof(float), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
Tensor dest_tensor = | ||
Tensor::Map<float>(input.getAddress<float>(batch, 0, count, 0), | ||
data_copy_size * sizeof(float), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
dest_tensor.copy(source_tensor); | ||
} | ||
/** copy continous data (reshaped width size) in a tensor */ | ||
const Tensor source_tensor = Tensor::Map<float>( | ||
output.getAddress<float>(batch, 0, 0, output_width_offset), | ||
data_copy_size * sizeof(float), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
Tensor dest_tensor = | ||
Tensor::Map<float>(input.getAddress<float>(batch, 0, 0, 0), | ||
data_copy_size * sizeof(float), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
dest_tensor.copy(source_tensor); | ||
} | ||
} else if (in_dim.getDataType() == TensorDim::DataType::FP16) { | ||
#ifdef ENABLE_FP16 | ||
/** loop over the dimensions before the concat dimension */ | ||
for (unsigned int batch = 0; batch < output.batch(); batch++) { | ||
/** loop over the concat dimension itself */ | ||
for (unsigned int count = 0; count < irh.height(); count++) { | ||
const Tensor source_tensor = Tensor::Map<_FP16>( | ||
output.getAddress<_FP16>(batch, 0, output_height_offset + count, 0), | ||
data_copy_size * sizeof(_FP16), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
Tensor dest_tensor = | ||
Tensor::Map<_FP16>(input.getAddress<_FP16>(batch, 0, count, 0), | ||
data_copy_size * sizeof(_FP16), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
dest_tensor.copy(source_tensor); | ||
} | ||
/** copy continous data (reshaped width size) in a tensor */ | ||
const Tensor source_tensor = Tensor::Map<_FP16>( | ||
output.getAddress<_FP16>(batch, 0, 0, output_width_offset), | ||
data_copy_size * sizeof(_FP16), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
Tensor dest_tensor = | ||
Tensor::Map<_FP16>(input.getAddress<_FP16>(batch, 0, 0, 0), | ||
data_copy_size * sizeof(_FP16), | ||
{1, 1, 1, data_copy_size, tensor_type}); | ||
dest_tensor.copy(source_tensor); | ||
} | ||
#else | ||
throw std::invalid_argument("Error: enable-fp16 is not enabled"); | ||
#endif | ||
} | ||
|
||
input.reshape(in_dim); | ||
output_height_offset += irh.height(); | ||
output_width_offset += irh.width(); | ||
} | ||
} | ||
|
||
|