MVE conv int4: interleave im2col #138

Merged: 1 commit, Jun 20, 2024
2 changes: 2 additions & 0 deletions ARM.CMSIS-NN.pdsc
@@ -61,6 +61,7 @@
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_even_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c"/>
@@ -107,6 +108,7 @@
<file category="source" name="Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_s8_to_s16_unordered_with_offset.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_interleaved_t_even_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8_s32.c"/>
46 changes: 44 additions & 2 deletions Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 23 April 2024
* $Revision: V.16.0.0
* $Date: 04 Jun 2024
* $Revision: V.16.1.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -354,6 +354,48 @@ arm_cmsis_nn_status arm_convolve_s4(const cmsis_nn_context *ctx,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief Basic s4 convolution function with a requirement of an even number of kernel elements.
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
* arm_convolve_s4_get_buffer_size will return the buffer_size if required.
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* Range of conv_params->input_offset : [-127, 128]
* Range of conv_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
* spatial filter dimensions. Note that the product HK * WK * C_IN must be even.
* @param[in] filter_data Packed Filter data pointer. Data type: int8 packed with 2x int4
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Optional bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int8
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful or
* <code>ARM_CMSIS_NN_ARG_ERROR</code> if the arguments are incorrect or
* <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> if the function is not implemented for the target (it requires MVE)
*
* @details
* 1. Supported framework: TensorFlow Lite micro
* 2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
*
*/
arm_cmsis_nn_status arm_convolve_even_s4(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data);
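
An illustrative usage sketch follows; it is not part of this patch. The dimensions, quantization data and the buffer-allocation scheme are assumptions for demonstration: the scratch buffer is sized here with arm_convolve_s4_get_buffer_size(), and rhs_cols = 1 * 1 * 8 is even as this function requires.

/* Usage sketch (assumed example, not library code): 1x1 s4 convolution,
 * 1x4x4x8 input, 16 output channels. */
#include "arm_nnfunctions.h"
#include <stdlib.h>

arm_cmsis_nn_status run_even_s4_conv(const int8_t *input,
                                     const int8_t *packed_weights, /* int8 packed with 2x int4 */
                                     const int32_t *bias,
                                     int32_t *per_channel_mult,
                                     int32_t *per_channel_shift,
                                     int8_t *output)
{
    const cmsis_nn_dims input_dims = {.n = 1, .h = 4, .w = 4, .c = 8};
    const cmsis_nn_dims filter_dims = {.n = 16, .h = 1, .w = 1, .c = 8}; /* [C_OUT, HK, WK, C_IN] */
    const cmsis_nn_dims bias_dims = {.n = 1, .h = 1, .w = 1, .c = 16};
    const cmsis_nn_dims output_dims = {.n = 1, .h = 4, .w = 4, .c = 16};

    cmsis_nn_conv_params conv_params = {.input_offset = 0,
                                        .output_offset = 0,
                                        .stride = {.w = 1, .h = 1},
                                        .padding = {.w = 0, .h = 0},
                                        .dilation = {.w = 1, .h = 1},
                                        .activation = {.min = -128, .max = 127}};
    const cmsis_nn_per_channel_quant_params quant_params = {.multiplier = per_channel_mult,
                                                            .shift = per_channel_shift};

    /* Assumption: the scratch-buffer size of the non-interleaved s4 convolution is sufficient. */
    cmsis_nn_context ctx;
    ctx.size = arm_convolve_s4_get_buffer_size(&input_dims, &filter_dims);
    ctx.buf = malloc(ctx.size);
    if (ctx.buf == NULL)
    {
        return ARM_CMSIS_NN_ARG_ERROR;
    }

    const arm_cmsis_nn_status status = arm_convolve_even_s4(&ctx, &conv_params, &quant_params,
                                                            &input_dims, input, &filter_dims,
                                                            packed_weights, &bias_dims, bias,
                                                            &output_dims, output);
    free(ctx.buf);
    return status;
}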

/**
* @brief Basic s8 convolution function
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
53 changes: 51 additions & 2 deletions Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 30 April 2024
* $Revision: V.22.0.0
* $Date: 27 May 2024
* $Revision: V.22.1.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -461,6 +461,55 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s4(const int8_t *lhs,
const int32_t activation_max,
const int32_t lhs_cols_offset);

/**
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
* - LHS input matrix NOT transposed (nt)
* - RHS input matrix transposed (t)
* - RHS is int8 packed with 2x int4
* - LHS is int8
* - The number of LHS/RHS input columns must be even
* - LHS must be interleaved. Compare to arm_nn_mat_mult_nt_t_s4 where LHS is not interleaved.
*
* @note This operation also performs the broadcast bias addition before the requantization
*
* @param[in] lhs Pointer to the LHS input matrix
* @param[in] rhs Pointer to the RHS input matrix
* @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of
* output columns (or RHS input rows)
* @param[out] dst Pointer to the output matrix with "m" rows and "n" columns
* @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization.
* The length of this vector is equal to the number of output columns (or RHS input
* rows)
* @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length
* of this vector is equal to the number of output columns (or RHS input rows)
* @param[in] lhs_rows Number of LHS input rows
* @param[in] rhs_rows Number of RHS input rows
* @param[in] rhs_cols Number of LHS/RHS input columns. Note this must be even.
* @param[in] lhs_offset Offset to be applied to the LHS input value
* @param[in] dst_offset Offset to be applied to the output result
* @param[in] activation_min Minimum value to clamp down the output. Range : int8
* @param[in] activation_max Maximum value to clamp up the output. Range : int8
* @param[in] lhs_cols_offset Column offset between subsequent lhs_rows
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
*/
arm_cmsis_nn_status arm_nn_mat_mult_nt_interleaved_t_even_s4(const int8_t *lhs,
const int8_t *rhs,
const int32_t *bias,
int8_t *dst,
const int32_t *dst_multipliers,
const int32_t *dst_shifts,
const int32_t lhs_rows,
const int32_t rhs_rows,
const int32_t rhs_cols,
const int32_t lhs_offset,
const int32_t dst_offset,
const int32_t activation_min,
const int32_t activation_max,
const int32_t lhs_cols_offset);
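
A scalar sketch of the expected LHS interleave follows; it is an illustrative reconstruction of the MVE vld2q_s8/vstrbq_s8 pair used in arm_convolve_even_s4, not library code. Within each complete 32-byte block of an LHS row, the odd-indexed bytes are stored first, followed by the even-indexed bytes; any rhs_cols % 32 tail bytes keep their natural order.

#include <stdint.h>
#include <string.h>

/* Illustrative scalar equivalent of the interleave this kernel expects on its LHS rows. */
static void interleave_lhs_row(int8_t *row, int32_t rhs_cols)
{
    for (int32_t blk = 0; blk < rhs_cols / 32; blk++)
    {
        int8_t tmp[32];
        int8_t *block = row + 32 * blk;
        for (int32_t i = 0; i < 16; i++)
        {
            tmp[i] = block[2 * i + 1];  /* odd-indexed bytes go first   */
            tmp[16 + i] = block[2 * i]; /* even-indexed bytes go second */
        }
        memcpy(block, tmp, 32);
    }
}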

/**
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
230 changes: 230 additions & 0 deletions Source/ConvolutionFunctions/arm_convolve_even_s4.c
@@ -0,0 +1,230 @@
/*
* SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <[email protected]>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_even_s4.c
* Description: s8 version of convolution using symmetric quantization with 4 bit weights.
*
* $Date: 05 Jun 2024
* $Revision: V.1.0.0
*
* Target : Arm(R) M-Profile Architecture
*
* -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
* @ingroup Public
*/

/**
* @addtogroup NNConv
* @{
*/

/*
* Basic s8 convolution function with int4 packed RHS (weights) and an even number of RHS columns.
*
* Refer to the header file for details.
*
*/
arm_cmsis_nn_status arm_convolve_even_s4(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *packed_filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data)
{
(void)bias_dims;

#if defined(ARM_MATH_MVEI)

if (ctx->buf == NULL)
{
return ARM_CMSIS_NN_ARG_ERROR;
}

int16_t *buffer_a = (int16_t *)ctx->buf;

const int32_t input_batches = input_dims->n;
const uint16_t input_x = input_dims->w;
const uint16_t input_y = input_dims->h;
const uint16_t input_ch = input_dims->c;
const uint16_t kernel_x = filter_dims->w;
const uint16_t kernel_y = filter_dims->h;
const uint16_t output_x = output_dims->w;
const uint16_t output_y = output_dims->h;
const uint16_t output_ch = output_dims->c;

const uint16_t pad_x = conv_params->padding.w;
const uint16_t pad_y = conv_params->padding.h;
const uint16_t stride_x = conv_params->stride.w;
const uint16_t stride_y = conv_params->stride.h;
const int32_t dilation_x = conv_params->dilation.w;
const int32_t dilation_y = conv_params->dilation.h;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
const int32_t rhs_cols = kernel_x * kernel_y * input_ch;
const int32_t input_offset = conv_params->input_offset;

if (rhs_cols & 0x1)
{
return ARM_CMSIS_NN_ARG_ERROR;
}

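/* blk_cnt is the number of complete 32-byte blocks in one im2col column; only these blocks
 * are interleaved below, so any rhs_cols % 32 tail bytes keep their original order. */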
const int32_t blk_cnt = rhs_cols >> 5;

int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;

int i_batch;

for (i_batch = 0; i_batch < input_batches; i_batch++)
{
/* Generate up to four columns from the input tensor for a GEMM computation */
int8_t *im2col_buf = (int8_t *)buffer_a;
const int32_t rhs_rows = output_dims->c;
int8_t *out = output_data;
int32_t lhs_rows = 0;

/* This part implements the im2col function */
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int32_t base_idx_x = stride_x * i_out_x - pad_x;
const int32_t base_idx_y = stride_y * i_out_y - pad_y;

for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
{
for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t k_y = base_idx_y + dilation_y * i_ker_y;
const int32_t k_x = base_idx_x + dilation_x * i_ker_x;

if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x)
{
arm_memset_s8(im2col_buf, (int8_t)-input_offset, sizeof(int8_t) * input_ch);
}
else
{
arm_memcpy_s8(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch);
}
im2col_buf += input_ch;
}
}

/* Reformat the column just written by interleaving it: within each complete 32-byte
block, odd-indexed bytes are stored first, followed by even-indexed bytes */
int8_t *im2col_buf_interleaved = (int8_t *)buffer_a + lhs_rows * rhs_cols;
for (int j = blk_cnt; j > 0; --j)
{
int8x16x2_t x2 = vld2q_s8(im2col_buf_interleaved);

vstrbq_s8(im2col_buf_interleaved, x2.val[1]);
im2col_buf_interleaved += 16;

vstrbq_s8(im2col_buf_interleaved, x2.val[0]);
im2col_buf_interleaved += 16;
}

lhs_rows++;

/* Computation is performed once four columns have been gathered */
if (lhs_rows == 4)
{
arm_nn_mat_mult_nt_interleaved_t_even_s4((int8_t *)buffer_a,
packed_filter_data,
bias_data,
out,
output_mult,
output_shift,
lhs_rows,
rhs_rows,
rhs_cols,
input_offset,
out_offset,
out_activation_min,
out_activation_max,
rhs_cols);

out += lhs_rows * rhs_rows;

lhs_rows = 0;
im2col_buf = (int8_t *)buffer_a;
}
}
}

/* Handle left over columns */
if (lhs_rows != 0)
{
arm_nn_mat_mult_nt_interleaved_t_even_s4((int8_t *)buffer_a,
packed_filter_data,
bias_data,
out,
output_mult,
output_shift,
lhs_rows,
rhs_rows,
rhs_cols,
input_offset,
out_offset,
out_activation_min,
out_activation_max,
rhs_cols);
out += lhs_rows * rhs_rows;
lhs_rows = 0;
im2col_buf = (int8_t *)buffer_a;
}

/* Advance to the next batch */
input_data += (input_x * input_y * input_ch);
output_data += (output_x * output_y * output_ch);
}
#else
(void)ctx;
(void)conv_params;
(void)quant_params;
(void)input_dims;
(void)input_data;
(void)filter_dims;
(void)packed_filter_data;
(void)bias_data;
(void)output_dims;
(void)output_data;

return ARM_CMSIS_NN_NO_IMPL_ERROR;

#endif // #if defined(ARM_MATH_MVEI)

/* Return to application */
return ARM_CMSIS_NN_SUCCESS;
}

/**
* @} end of NNConv group
*/
9 changes: 2 additions & 7 deletions Source/ConvolutionFunctions/arm_convolve_s4.c
@@ -21,8 +21,8 @@
* Title: arm_convolve_s4.c
* Description: s8 version of convolution using symmetric quantization with 4 bit weights.
*
* $Date: 10 April 2024
* $Revision: V.1.1.0
* $Date: 17 May 2024
* $Revision: V.1.2.0
*
* Target : Arm(R) M-Profile Architecture
*
@@ -153,11 +153,6 @@ arm_cmsis_nn_status arm_convolve_s4(const cmsis_nn_context *ctx,
im2col_buf = (int8_t *)buffer_a;
}
}

if (out == NULL)
{
return ARM_CMSIS_NN_NO_IMPL_ERROR;
}
}

/* Handle left over columns */