MVE conv int4: interleave im2col #138

Merged: 1 commit, Jun 20, 2024
2 changes: 2 additions & 0 deletions ARM.CMSIS-NN.pdsc
@@ -61,6 +61,7 @@
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_even_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c"/>
@@ -107,6 +108,7 @@
<file category="source" name="Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_s8_to_s16_unordered_with_offset.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_interleaved_t_even_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8_s32.c"/>
46 changes: 44 additions & 2 deletions Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 23 April 2024
* $Revision: V.16.0.0
* $Date: 04 Jun 2024
* $Revision: V.16.1.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -354,6 +354,48 @@ arm_cmsis_nn_status arm_convolve_s4(const cmsis_nn_context *ctx,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief Basic s4 convolution function with a requirement of an even number of kernel elements.
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
* arm_convolve_s4_get_buffer_size will return the buffer_size if required.
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* Range of conv_params->input_offset : [-127, 128]
* Range of conv_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
* spatial filter dimensions. Note that the product HK * WK * C_IN must be even.
* @param[in] filter_data Packed Filter data pointer. Data type: int8 packed with 2x int4
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Optional bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int8
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful or
* <code>ARM_CMSIS_NN_ARG_ERROR</code> if the arguments are incorrect or
* <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> if the function is not implemented for the target (it requires MVE)
*
* @details
* 1. Supported framework: TensorFlow Lite micro
* 2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
*
*/
arm_cmsis_nn_status arm_convolve_even_s4(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data);
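
An illustrative usage sketch follows; it is not part of this patch. The dimensions, quantization data and the buffer-allocation scheme are assumptions for demonstration: the scratch buffer is sized here with arm_convolve_s4_get_buffer_size(), and rhs_cols = 1 * 1 * 8 is even as this function requires.

/* Usage sketch (assumed example, not library code): 1x1 s4 convolution,
 * 1x4x4x8 input, 16 output channels. */
#include "arm_nnfunctions.h"
#include <stdlib.h>

arm_cmsis_nn_status run_even_s4_conv(const int8_t *input,
                                     const int8_t *packed_weights, /* int8 packed with 2x int4 */
                                     const int32_t *bias,
                                     int32_t *per_channel_mult,
                                     int32_t *per_channel_shift,
                                     int8_t *output)
{
    const cmsis_nn_dims input_dims = {.n = 1, .h = 4, .w = 4, .c = 8};
    const cmsis_nn_dims filter_dims = {.n = 16, .h = 1, .w = 1, .c = 8}; /* [C_OUT, HK, WK, C_IN] */
    const cmsis_nn_dims bias_dims = {.n = 1, .h = 1, .w = 1, .c = 16};
    const cmsis_nn_dims output_dims = {.n = 1, .h = 4, .w = 4, .c = 16};

    cmsis_nn_conv_params conv_params = {.input_offset = 0,
                                        .output_offset = 0,
                                        .stride = {.w = 1, .h = 1},
                                        .padding = {.w = 0, .h = 0},
                                        .dilation = {.w = 1, .h = 1},
                                        .activation = {.min = -128, .max = 127}};
    const cmsis_nn_per_channel_quant_params quant_params = {.multiplier = per_channel_mult,
                                                            .shift = per_channel_shift};

    /* Assumption: the scratch-buffer size of the non-interleaved s4 convolution is sufficient. */
    cmsis_nn_context ctx;
    ctx.size = arm_convolve_s4_get_buffer_size(&input_dims, &filter_dims);
    ctx.buf = malloc(ctx.size);
    if (ctx.buf == NULL)
    {
        return ARM_CMSIS_NN_ARG_ERROR;
    }

    const arm_cmsis_nn_status status = arm_convolve_even_s4(&ctx, &conv_params, &quant_params,
                                                            &input_dims, input, &filter_dims,
                                                            packed_weights, &bias_dims, bias,
                                                            &output_dims, output);
    free(ctx.buf);
    return status;
}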

/**
* @brief Basic s8 convolution function
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
53 changes: 51 additions & 2 deletions Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 30 April 2024
* $Revision: V.22.0.0
* $Date: 27 May 2024
* $Revision: V.22.1.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -461,6 +461,55 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s4(const int8_t *lhs,
const int32_t activation_max,
const int32_t lhs_cols_offset);

/**
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
* - LHS input matrix NOT transposed (nt)
* - RHS input matrix transposed (t)
* - RHS is int8 packed with 2x int4
* - LHS is int8
* - The number of LHS/RHS input columns must be even
* - LHS must be interleaved. Compare to arm_nn_mat_mult_nt_t_s4 where LHS is not interleaved.
*
* @note This operation also performs the broadcast bias addition before the requantization
*
* @param[in] lhs Pointer to the LHS input matrix
* @param[in] rhs Pointer to the RHS input matrix
* @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of
* output columns (or RHS input rows)
* @param[out] dst Pointer to the output matrix with "m" rows and "n" columns
* @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization.
* The length of this vector is equal to the number of output columns (or RHS input
* rows)
* @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length
* of this vector is equal to the number of output columns (or RHS input rows)
* @param[in] lhs_rows Number of LHS input rows
* @param[in] rhs_rows Number of RHS input rows
* @param[in] rhs_cols Number of LHS/RHS input columns. Note this must be even.
* @param[in] lhs_offset Offset to be applied to the LHS input value
* @param[in] dst_offset Offset to be applied to the output result
* @param[in] activation_min Minimum value to clamp down the output. Range : int8
* @param[in] activation_max Maximum value to clamp up the output. Range : int8
* @param[in] lhs_cols_offset Column offset between subsequent lhs_rows
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
*/
arm_cmsis_nn_status arm_nn_mat_mult_nt_interleaved_t_even_s4(const int8_t *lhs,
const int8_t *rhs,
const int32_t *bias,
int8_t *dst,
const int32_t *dst_multipliers,
const int32_t *dst_shifts,
const int32_t lhs_rows,
const int32_t rhs_rows,
const int32_t rhs_cols,
const int32_t lhs_offset,
const int32_t dst_offset,
const int32_t activation_min,
const int32_t activation_max,
const int32_t lhs_cols_offset);
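
A scalar sketch of the expected LHS interleave follows; it is an illustrative reconstruction of the MVE vld2q_s8/vstrbq_s8 pair used in arm_convolve_even_s4, not library code. Within each complete 32-byte block of an LHS row, the odd-indexed bytes are stored first, followed by the even-indexed bytes; any rhs_cols % 32 tail bytes keep their natural order.

#include <stdint.h>
#include <string.h>

/* Illustrative scalar equivalent of the interleave this kernel expects on its LHS rows. */
static void interleave_lhs_row(int8_t *row, int32_t rhs_cols)
{
    for (int32_t blk = 0; blk < rhs_cols / 32; blk++)
    {
        int8_t tmp[32];
        int8_t *block = row + 32 * blk;
        for (int32_t i = 0; i < 16; i++)
        {
            tmp[i] = block[2 * i + 1];  /* odd-indexed bytes go first   */
            tmp[16 + i] = block[2 * i]; /* even-indexed bytes go second */
        }
        memcpy(block, tmp, 32);
    }
}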

/**
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
230 changes: 230 additions & 0 deletions Source/ConvolutionFunctions/arm_convolve_even_s4.c
@@ -0,0 +1,230 @@
/*
* SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <[email protected]>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_even_s4.c
* Description: s8 version of convolution using symmetric quantization with 4 bit weights.
*
* $Date: 05 Jun 2024
* $Revision: V.1.0.0
*
* Target : Arm(R) M-Profile Architecture
*
* -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
* @ingroup Public
*/

/**
* @addtogroup NNConv
* @{
*/

/*
* Basic s8 convolution function with int4 packed RHS (weights) and an even number of RHS columns.
*
* Refer to the header file for details.
*
*/
arm_cmsis_nn_status arm_convolve_even_s4(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *packed_filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data)
{
(void)bias_dims;

#if defined(ARM_MATH_MVEI)

if (ctx->buf == NULL)
{
return ARM_CMSIS_NN_ARG_ERROR;
}

int16_t *buffer_a = (int16_t *)ctx->buf;

const int32_t input_batches = input_dims->n;
const uint16_t input_x = input_dims->w;
const uint16_t input_y = input_dims->h;
const uint16_t input_ch = input_dims->c;
const uint16_t kernel_x = filter_dims->w;
const uint16_t kernel_y = filter_dims->h;
const uint16_t output_x = output_dims->w;
const uint16_t output_y = output_dims->h;
const uint16_t output_ch = output_dims->c;

const uint16_t pad_x = conv_params->padding.w;
const uint16_t pad_y = conv_params->padding.h;
const uint16_t stride_x = conv_params->stride.w;
const uint16_t stride_y = conv_params->stride.h;
const int32_t dilation_x = conv_params->dilation.w;
const int32_t dilation_y = conv_params->dilation.h;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
const int32_t rhs_cols = kernel_x * kernel_y * input_ch;
const int32_t input_offset = conv_params->input_offset;

if (rhs_cols & 0x1)
{
return ARM_CMSIS_NN_ARG_ERROR;
}

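/* blk_cnt is the number of complete 32-byte blocks in one im2col column; only these blocks
 * are interleaved below, so any rhs_cols % 32 tail bytes keep their original order. */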
const int32_t blk_cnt = rhs_cols >> 5;

int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;

int i_batch;

for (i_batch = 0; i_batch < input_batches; i_batch++)
{
/* Generate up to four columns from the input tensor for a GEMM computation */
int8_t *im2col_buf = (int8_t *)buffer_a;
const int32_t rhs_rows = output_dims->c;
int8_t *out = output_data;
int32_t lhs_rows = 0;

/* This part implements the im2col function */
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int32_t base_idx_x = stride_x * i_out_x - pad_x;
const int32_t base_idx_y = stride_y * i_out_y - pad_y;

for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
{
for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t k_y = base_idx_y + dilation_y * i_ker_y;
const int32_t k_x = base_idx_x + dilation_x * i_ker_x;

if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x)
{
arm_memset_s8(im2col_buf, (int8_t)-input_offset, sizeof(int8_t) * input_ch);
}
else
{
arm_memcpy_s8(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch);
}
im2col_buf += input_ch;
}
}

/* Reformat the column just written by interleaving it: within each complete 32-byte
block, odd-indexed bytes are stored first, followed by even-indexed bytes */
int8_t *im2col_buf_interleaved = (int8_t *)buffer_a + lhs_rows * rhs_cols;
for (int j = blk_cnt; j > 0; --j)
{
int8x16x2_t x2 = vld2q_s8(im2col_buf_interleaved);

vstrbq_s8(im2col_buf_interleaved, x2.val[1]);
im2col_buf_interleaved += 16;

vstrbq_s8(im2col_buf_interleaved, x2.val[0]);
im2col_buf_interleaved += 16;
}

lhs_rows++;

/* Computation is performed once four columns have been gathered */
if (lhs_rows == 4)
{
arm_nn_mat_mult_nt_interleaved_t_even_s4((int8_t *)buffer_a,
packed_filter_data,
bias_data,
out,
output_mult,
output_shift,
lhs_rows,
rhs_rows,
rhs_cols,
input_offset,
out_offset,
out_activation_min,
out_activation_max,
rhs_cols);

out += lhs_rows * rhs_rows;

lhs_rows = 0;
im2col_buf = (int8_t *)buffer_a;
}
}
}

/* Handle left over columns */
if (lhs_rows != 0)
{
arm_nn_mat_mult_nt_interleaved_t_even_s4((int8_t *)buffer_a,
packed_filter_data,
bias_data,
out,
output_mult,
output_shift,
lhs_rows,
rhs_rows,
rhs_cols,
input_offset,
out_offset,
out_activation_min,
out_activation_max,
rhs_cols);
out += lhs_rows * rhs_rows;
lhs_rows = 0;
im2col_buf = (int8_t *)buffer_a;
}

/* Advance to the next batch */
input_data += (input_x * input_y * input_ch);
output_data += (output_x * output_y * output_ch);
}
#else
(void)ctx;
(void)conv_params;
(void)quant_params;
(void)input_dims;
(void)input_data;
(void)filter_dims;
(void)packed_filter_data;
(void)bias_data;
(void)output_dims;
(void)output_data;

return ARM_CMSIS_NN_NO_IMPL_ERROR;

#endif // #if defined(ARM_MATH_MVEI)

/* Return to application */
return ARM_CMSIS_NN_SUCCESS;
}

/**
* @} end of NNConv group
*/
9 changes: 2 additions & 7 deletions Source/ConvolutionFunctions/arm_convolve_s4.c
@@ -21,8 +21,8 @@
* Title: arm_convolve_s4.c
* Description: s8 version of convolution using symmetric quantization with 4 bit weights.
*
* $Date: 10 April 2024
* $Revision: V.1.1.0
* $Date: 17 May 2024
* $Revision: V.1.2.0
*
* Target : Arm(R) M-Profile Architecture
*
@@ -153,11 +153,6 @@ arm_cmsis_nn_status arm_convolve_s4(const cmsis_nn_context *ctx,
im2col_buf = (int8_t *)buffer_a;
}
}

if (out == NULL)
{
return ARM_CMSIS_NN_NO_IMPL_ERROR;
}
}

/* Handle left over columns */