diff --git a/nntrainer/layers/concat_layer.cpp b/nntrainer/layers/concat_layer.cpp
index 5536c4a82d..d482c6fa14 100644
--- a/nntrainer/layers/concat_layer.cpp
+++ b/nntrainer/layers/concat_layer.cpp
@@ -6,6 +6,7 @@
  * @date 27 Oct 2020
  * @see https://github.com/nnstreamer/nntrainer
  * @author Jijoong Moon
+ * @author Donghyeon Jeong
  * @bug No known bugs except for NYI items
  * @brief This is Concat Layer Class for Neural Network
  *
@@ -70,38 +71,57 @@ void ConcatLayer::finalize(InitLayerContext &context) {
   context.setOutputDimensions({output_dim});
 
   /**
-   * Setup output_reshape_helper to which output will be reshaped in forwarding
-   * to facilitate easier processing.
+   * The following helper shapes facilitate efficient concatenation and split
+   * of the data.
+   *
+   * The helper shapes are created by consolidating all the dimensions before
+   * the concat dimension to the first axis and all the remaining dimensions
+   * to the last axis.
+   *
+   * @note This is possible since the data starting from the concat dimension
+   * to the end is always contiguous.
+   *
+   * @example The following shows how the helper dimensions will look with
+   * the given inputs and concat dimension.
+   *
+   *         | cat_dim 1 | cat_dim 2 | cat_dim 3
+   * --------|-----------|-----------|-----------
+   * input0  |  2:1:2:3  |  1:2:1:3  |  1:2:2:3
+   * input1  |  2:3:2:3  |  1:2:3:3  |  1:2:2:1
+   * --------|-----------|-----------|-----------
+   * helper0 |  2:1:1:6  |  2:1:1:3  |  4:1:1:3
+   * helper1 |  2:1:1:18 |  2:1:1:9  |  4:1:1:1
    *
-   * The helper shape consolidates all the dimensions before the axis
-   * together and all the dimensions after the axis to facilitate
-   * easier splitting of the data.
    */
-  leading_helper_dim = 1;
+  /// Setup output_reshape_helper (how output should be reshaped)
   output_reshape_helper.channel(1);
   output_reshape_helper.height(1);
   output_reshape_helper.width(1);
 
-  for (unsigned int idx = 1; idx < concat_dimension; ++idx) {
-    leading_helper_dim *= output_dim.getTensorDim(idx);
-  }
-
-  output_reshape_helper.height(output_dim.getTensorDim(concat_dimension));
-
-  for (unsigned int idx = concat_dimension + 1;
-       idx < ml::train::TensorDim::getNumDim(); ++idx) {
+  for (unsigned int axis = concat_dimension;
+       axis < ml::train::TensorDim::getNumDim(); ++axis) {
     output_reshape_helper.width(output_reshape_helper.width() *
-                                output_dim.getTensorDim(idx));
+                                output_dim.getTensorDim(axis));
   }
 
-  /**
-   * Setup input_reshape_helper to which inputs will be reshaped in forwarding
-   * to facilitate easier processing.
-   */
+  /// Setup input_reshape_helper (how inputs should be reshaped)
   input_reshape_helper.resize(input_dims.size());
+
   for (unsigned int idx = 0; idx < input_reshape_helper.size(); idx++) {
-    input_reshape_helper[idx] = output_reshape_helper;
-    input_reshape_helper[idx].height(
-      input_dims[idx].getTensorDim(concat_dimension));
+    input_reshape_helper[idx].channel(1);
+    input_reshape_helper[idx].height(1);
+    input_reshape_helper[idx].width(1);
+
+    for (unsigned int axis = concat_dimension;
+         axis < ml::train::TensorDim::getNumDim(); ++axis) {
+
+      input_reshape_helper[idx].width(input_reshape_helper[idx].width() *
+                                      input_dims[idx].getTensorDim(axis));
+    }
+  }
+
+  leading_helper_dim = 1;
+  for (unsigned int idx = 1; idx < concat_dimension; ++idx) {
+    leading_helper_dim *= output_dim.getTensorDim(idx);
   }
 
   setBatch(input_dims[SINGLE_INOUT_IDX].batch());
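To make the helper-shape rule above concrete, here is a minimal standalone sketch (not part of the patch; the toy main() and plain std::array are illustrative assumptions) that reproduces the helper0 entry for cat_dim 1 from the @example table:

#include <array>
#include <cstdio>

int main() {
  // input0 = 2:1:2:3 (b:c:h:w) with concat dimension 1, as in the table.
  std::array<unsigned int, 4> dim = {2, 1, 2, 3};
  unsigned int concat_dimension = 1;

  // The helper width consolidates every axis from the concat dimension on.
  unsigned int width = 1;
  for (unsigned int axis = concat_dimension; axis < 4; ++axis)
    width *= dim[axis];

  // Axes between the batch and the concat dimension are folded into the
  // batch, matching leading_helper_dim in the patch.
  unsigned int leading = 1;
  for (unsigned int axis = 1; axis < concat_dimension; ++axis)
    leading *= dim[axis];

  // Helper shape is (batch * leading):1:1:width -> prints 2:1:1:6 here.
  std::printf("%u:1:1:%u\n", dim[0] * leading, width);
  return 0;
}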
@@ -109,46 +129,70 @@ void ConcatLayer::finalize(InitLayerContext &context) {
 
 void ConcatLayer::forwarding(RunLayerContext &context, bool training) {
   /**
+   * Forwarding in ConcatLayer works as follows
+   *
+   *    in1          in2         in3              output
+   * |---0---|  |----3----|  |--6--|     |---0---||----3----||--6--|
+   * |---1---|  |----4----|  |--7--|  => |---1---||----4----||--7--|
+   * |---2---|  |----5----|  |--8--|     |---2---||----5----||--8--|
+   *
+   * @note the number in each block indicates the order of copy to the output
+   *
    * @todo avoid copy by creating input here as a shared_tensor of the output
    * here and then this layer can be in_place as well
    */
+  Tensor &output = context.getOutput(SINGLE_INOUT_IDX);
 
-  // Store original input tensor dimensions, then reshape input tensors.
-  std::vector<Tensor> input_tensors;
-  std::vector<TensorDim> original_input_dims;
+  const TensorDim out_dim = output.getDim();
+  output.reshape(output_reshape_helper);
+  unsigned int output_width_offset = 0;
+  TensorDim::TensorType tensor_type = output.getTensorType();
 
   for (unsigned int idx = 0; idx < context.getNumInputs(); idx++) {
     Tensor &input = context.getInput(idx);
-    original_input_dims.push_back(input.getDim());
-    input.reshape(input_reshape_helper[idx]);
-    input_tensors.push_back(input);
-  }
-
-  // Store the original output tensor dimension, then reshape the output tensor.
-  Tensor &output = context.getOutput(SINGLE_INOUT_IDX);
-  const TensorDim original_output_dim = output.getDim();
-  output.reshape(output_reshape_helper);
+    const TensorDim in_dim = input.getDim();
+    auto const &irh = input_reshape_helper[idx];
+    input.reshape(irh);
+    unsigned int data_copy_size = irh.width();
 
-  // Search for an axis and concatenate tensors.
-  const TensorDim out_dim = output.getDim();
-  const TensorDim in_dim = context.getInput(0).getDim();
-
-  for (int axis = 0; axis < 4; ++axis) {
-    if (out_dim[axis] != in_dim[axis]) {
-      /// @todo Currently a new output tensor is created. This can be optimized.
-      Tensor result = Tensor::cat(input_tensors, axis);
-      output.copy(result);
-      break;
+    /** loop over the dimensions before the concat dimension */
+    if (in_dim.getDataType() == TensorDim::DataType::FP32) {
+      /** copy contiguous tensor data (reshaped width) */
+      for (unsigned int batch = 0; batch < output.batch(); batch++) {
+        Tensor dest_tensor = Tensor::Map(
+          output.getAddress(batch, 0, 0, output_width_offset),
+          data_copy_size * sizeof(float),
+          {1, 1, 1, data_copy_size, tensor_type});
+        const Tensor source_tensor =
+          Tensor::Map(input.getAddress(batch, 0, 0, 0),
+                      data_copy_size * sizeof(float),
+                      {1, 1, 1, data_copy_size, tensor_type});
+        dest_tensor.copy(source_tensor);
+      }
+    } else if (in_dim.getDataType() == TensorDim::DataType::FP16) {
+#ifdef ENABLE_FP16
+      /** copy contiguous tensor data (reshaped width) */
+      for (unsigned int batch = 0; batch < output.batch(); batch++) {
+        Tensor dest_tensor = Tensor::Map<_FP16>(
+          output.getAddress<_FP16>(batch, 0, 0, output_width_offset),
+          data_copy_size * sizeof(_FP16),
+          {1, 1, 1, data_copy_size, tensor_type});
+        const Tensor source_tensor =
+          Tensor::Map<_FP16>(input.getAddress<_FP16>(batch, 0, 0, 0),
+                             data_copy_size * sizeof(_FP16),
+                             {1, 1, 1, data_copy_size, tensor_type});
+        dest_tensor.copy(source_tensor);
+      }
+#else
+      throw std::invalid_argument("Error: enable-fp16 is not enabled");
+#endif
     }
-  }
 
-  // Revert the tensors' dimensions back to their original shape.
-  for (unsigned int idx = 0; idx < context.getNumInputs(); idx++) {
-    Tensor &in = context.getInput(idx);
-    in.reshape(original_input_dims[idx]);
+    output_width_offset += irh.width();
+    input.reshape(in_dim);
   }
 
-  output.reshape(original_output_dim);
+  output.reshape(out_dim);
 }
 
 void ConcatLayer::incremental_forwarding(RunLayerContext &context,
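The forwarding loop above reduces concatenation to one contiguous copy per input per batch, written at a running width offset into the reshaped output. A minimal sketch of the same pattern with plain float buffers (the concat_rows helper is hypothetical, not nntrainer API; it assumes row-major data where out_width is the sum of the input widths):

#include <cstring>
#include <vector>

// Each input i contributes width[i] contiguous floats per batch row;
// the running offset plays the role of output_width_offset above.
void concat_rows(const std::vector<const float *> &in,
                 const std::vector<unsigned int> &width, float *out,
                 unsigned int batch, unsigned int out_width) {
  unsigned int offset = 0;
  for (std::size_t i = 0; i < in.size(); ++i) {
    for (unsigned int b = 0; b < batch; ++b)
      std::memcpy(out + b * out_width + offset, in[i] + b * width[i],
                  width[i] * sizeof(float));
    offset += width[i];
  }
}

This is why a single reshape per tensor suffices: once the trailing axes are folded into the width, the concat region of each input is one contiguous run.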
@@ -199,14 +243,23 @@ void ConcatLayer::incremental_forwarding(RunLayerContext &context,
 
 void ConcatLayer::calcDerivative(RunLayerContext &context) {
   /**
+   * calcDerivative in ConcatLayer works as follows
+   *
+   *            output                  in1         in2         in3
+   * |---0---||----3----||--6--|     |---0---|  |----3----|  |--6--|
+   * |---1---||----4----||--7--|  => |---1---|  |----4----|  |--7--|
+   * |---2---||----5----||--8--|     |---2---|  |----5----|  |--8--|
+   *
+   * @note the number in each block indicates the order of copy to the inputs
+   *
    * @todo avoid copy by creating input here as a shared_tensor of the output
    * here and then this layer can be in_place as well
    */
+
   Tensor output = context.getIncomingDerivative(SINGLE_INOUT_IDX);
   output.reshape(output_reshape_helper);
 
-  unsigned int output_height_offset = 0;
-  unsigned int data_copy_size = output_reshape_helper.width();
+  unsigned int output_width_offset = 0;
   TensorDim::TensorType tensor_type = output.getTensorType();
 
   for (unsigned int idx = 0; idx < context.getNumInputs(); idx++) {
@@ -214,39 +267,36 @@ void ConcatLayer::calcDerivative(RunLayerContext &context) {
     const TensorDim in_dim = input.getDim();
     auto const &irh = input_reshape_helper[idx];
     input.reshape(irh);
+    unsigned int data_copy_size = irh.width();
 
     if (in_dim.getDataType() == TensorDim::DataType::FP32) {
       /** loop over the dimensions before the concat dimension */
       for (unsigned int batch = 0; batch < output.batch(); batch++) {
-        /** loop over the concat dimension itself */
-        for (unsigned int count = 0; count < irh.height(); count++) {
-          const Tensor source_tensor = Tensor::Map(
-            output.getAddress(batch, 0, output_height_offset + count, 0),
-            data_copy_size * sizeof(float),
-            {1, 1, 1, data_copy_size, tensor_type});
-          Tensor dest_tensor =
-            Tensor::Map(input.getAddress(batch, 0, count, 0),
-                        data_copy_size * sizeof(float),
-                        {1, 1, 1, data_copy_size, tensor_type});
-          dest_tensor.copy(source_tensor);
-        }
+        /** copy contiguous data (reshaped width size) in a tensor */
+        const Tensor source_tensor = Tensor::Map(
+          output.getAddress(batch, 0, 0, output_width_offset),
+          data_copy_size * sizeof(float),
+          {1, 1, 1, data_copy_size, tensor_type});
+        Tensor dest_tensor =
+          Tensor::Map(input.getAddress(batch, 0, 0, 0),
+                      data_copy_size * sizeof(float),
+                      {1, 1, 1, data_copy_size, tensor_type});
+        dest_tensor.copy(source_tensor);
       }
     } else if (in_dim.getDataType() == TensorDim::DataType::FP16) {
 #ifdef ENABLE_FP16
       /** loop over the dimensions before the concat dimension */
       for (unsigned int batch = 0; batch < output.batch(); batch++) {
-        /** loop over the concat dimension itself */
-        for (unsigned int count = 0; count < irh.height(); count++) {
-          const Tensor source_tensor = Tensor::Map<_FP16>(
-            output.getAddress<_FP16>(batch, 0, output_height_offset + count, 0),
-            data_copy_size * sizeof(_FP16),
-            {1, 1, 1, data_copy_size, tensor_type});
-          Tensor dest_tensor =
-            Tensor::Map<_FP16>(input.getAddress<_FP16>(batch, 0, count, 0),
-                               data_copy_size * sizeof(_FP16),
-                               {1, 1, 1, data_copy_size, tensor_type});
-          dest_tensor.copy(source_tensor);
-        }
+        /** copy contiguous data (reshaped width size) in a tensor */
+        const Tensor source_tensor = Tensor::Map<_FP16>(
+          output.getAddress<_FP16>(batch, 0, 0, output_width_offset),
+          data_copy_size * sizeof(_FP16),
+          {1, 1, 1, data_copy_size, tensor_type});
+        Tensor dest_tensor =
+          Tensor::Map<_FP16>(input.getAddress<_FP16>(batch, 0, 0, 0),
+                             data_copy_size * sizeof(_FP16),
+                             {1, 1, 1, data_copy_size, tensor_type});
+        dest_tensor.copy(source_tensor);
       }
 #else
       throw std::invalid_argument("Error: enable-fp16 is not enabled");
@@ -254,7 +304,7 @@ void ConcatLayer::calcDerivative(RunLayerContext &context) {
 #endif
     }
 
     input.reshape(in_dim);
-    output_height_offset += irh.height();
+    output_width_offset += irh.width();
   }
 }
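calcDerivative mirrors forwarding exactly: the same width offsets are replayed, but the copy direction is reversed, splitting the incoming derivative back into the per-input buffers. A matching sketch under the same assumptions as concat_rows above (split_rows is likewise hypothetical):

#include <cstring>
#include <vector>

// Inverse of concat_rows: carve width[i] floats per batch row out of the
// concatenated output, advancing the offset like output_width_offset.
void split_rows(const float *out, const std::vector<float *> &in,
                const std::vector<unsigned int> &width, unsigned int batch,
                unsigned int out_width) {
  unsigned int offset = 0;
  for (std::size_t i = 0; i < in.size(); ++i) {
    for (unsigned int b = 0; b < batch; ++b)
      std::memcpy(in[i] + b * width[i], out + b * out_width + offset,
                  width[i] * sizeof(float));
    offset += width[i];
  }
}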