Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Int8 and Int16 Batch Matmul support #139

Merged
merged 2 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions ARM.CMSIS-NN.pdsc
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_s8_to_s16_unordered_with_offset.c"/>
Expand All @@ -127,6 +128,8 @@
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s8_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_row_offset_s8_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_batch_matmul_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_batch_matmul_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s4.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s16.c"/>
Expand Down
5 changes: 3 additions & 2 deletions Include/Internal/arm_nn_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nn_compiler.h
* Description: Generic compiler header
*
* $Date: 16 January 2024
* $Revision: V.1.2.2
* $Date: 20 June 2024
* $Revision: V.1.3.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -189,6 +189,7 @@ __STATIC_FORCEINLINE uint8_t CLZ(uint32_t value)
// Common intrinsics
#define SMLABB __smlabb
#define SMLATT __smlatt
#define SMLALD __smlald
#define QADD __qadd
#define QSUB8 __qsub8
#define QSUB16 __qsub16
Expand Down
14 changes: 11 additions & 3 deletions Include/arm_nn_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
* Description: Public header file to contain the CMSIS-NN structs for the
* TensorFlowLite micro compliant functions
*
* $Date: 11 April 2024
* $Revision: V.3.2.0
* $Date: 19 June 2024
* $Revision: V.3.3.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -165,11 +165,19 @@ typedef struct
typedef struct
{
int32_t input_offset; /**< The negative of the zero value for the input tensor */
int32_t filter_offset; /**< The negative of the zero value for the filter tensor. Not used */
int32_t filter_offset; /**< The negative of the zero value for the filter tensor */
int32_t output_offset; /**< The negative of the zero value for the output tensor */
cmsis_nn_activation activation;
} cmsis_nn_fc_params;

/** CMSIS-NN object for Batch Matmul layer parameters */
typedef struct
{
    const bool adj_x; /**< Transpose (adjoint) flag for the LHS input. Currently unused by the batch matmul kernels. */
    const bool adj_y; /**< Transpose (adjoint) flag for the RHS input. Currently unused by the batch matmul kernels. */
    cmsis_nn_fc_params fc_params; /**< Fully-connected style parameters (offsets and activation range) reused by batch matmul */
} cmsis_nn_bmm_params;

/** CMSIS-NN object for SVDF layer parameters */
typedef struct
{
Expand Down
78 changes: 76 additions & 2 deletions Include/arm_nnfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 04 Jun 2024
* $Revision: V.16.1.0
* $Date: 19 June 2024
* $Revision: V.16.2.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -2604,6 +2604,80 @@ arm_cmsis_nn_status arm_lstm_unidirectional_s16(const int16_t *input,
const cmsis_nn_lstm_params *params,
cmsis_nn_lstm_context *buffers);

/**
* @brief Batch matmul function with 8 bit input and output.
ArmRyan marked this conversation as resolved.
Show resolved Hide resolved
*
* @param[in] ctx Temporary scratch buffer
* The caller is expected to clear the buffer, if applicable, for security reasons.
* Optional function arm_fully_connected_s8_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] bmm_params Batch matmul Parameters
* Adjoint flags are currently unused.
* @param[in] quant_params Quantization parameters
* @param[in] input_lhs_dims Input lhs tensor dimensions.
* This should be NHWC where lhs C = rhs C
* @param[in] input_lhs Pointer to input tensor
 * @param[in]   input_rhs_dims          Input rhs tensor dimensions.
* This is expected to be transposed so
* should be NHWC where lhs C = rhs C
* @param[in] input_rhs Pointer to transposed input tensor
* @param[in] output_dims Output tensor dimensions
* @param[out] output Pointer to the output tensor
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite Micro
* 2. Performs row * row matrix multiplication with the RHS transposed.
*
*/
arm_cmsis_nn_status arm_batch_matmul_s8(const cmsis_nn_context *ctx,
const cmsis_nn_bmm_params *bmm_params,
const cmsis_nn_per_tensor_quant_params *quant_params,
const cmsis_nn_dims *input_lhs_dims,
const int8_t *input_lhs,
const cmsis_nn_dims *input_rhs_dims,
const int8_t *input_rhs,
const cmsis_nn_dims *output_dims,
int8_t *output);

/**
* @brief Batch matmul function with 16 bit input and output.
*
* @param[in] ctx Temporary scratch buffer
* The caller is expected to clear the buffer, if applicable, for security reasons.
 *                                      Optional function arm_fully_connected_s16_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] bmm_params Batch matmul Parameters
* Adjoint flags are currently unused.
* @param[in] quant_params Quantization parameters
* @param[in] input_lhs_dims Input lhs tensor dimensions.
* This should be NHWC where LHS.C = RHS.C
* @param[in] input_lhs Pointer to input tensor
 * @param[in]   input_rhs_dims          Input rhs tensor dimensions.
* This is expected to be transposed so
* should be NHWC where LHS.C = RHS.C
* @param[in] input_rhs Pointer to transposed input tensor
* @param[in] output_dims Output tensor dimensions
* @param[out] output Pointer to the output tensor
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite Micro
* 2. Performs row * row matrix multiplication with the RHS transposed.
*
*/
arm_cmsis_nn_status arm_batch_matmul_s16(const cmsis_nn_context *ctx,
const cmsis_nn_bmm_params *bmm_params,
const cmsis_nn_per_tensor_quant_params *quant_params,
const cmsis_nn_dims *input_lhs_dims,
const int16_t *input_lhs,
const cmsis_nn_dims *input_rhs_dims,
const int16_t *input_rhs,
const cmsis_nn_dims *output_dims,
int16_t *output);

#ifdef __cplusplus
}
#endif
Expand Down
34 changes: 31 additions & 3 deletions Include/arm_nnsupportfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 27 May 2024
* $Revision: V.22.1.0
* $Date: 19 June 2024
* $Revision: V.22.2.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -704,7 +704,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
const int32_t rhs_offset);

/**
* @brief s16 Vector by Matrix (transposed) multiplication
* @brief s16 Vector by s8 Matrix (transposed) multiplication
*
* @param[in] lhs Input left-hand side vector
* @param[in] rhs Input right-hand side matrix (transposed)
Expand All @@ -731,6 +731,34 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const int16_t *lhs,
const int32_t activation_min,
const int32_t activation_max);

/**
* @brief s16 Vector by s16 Matrix (transposed) multiplication
*
* @param[in] lhs Input left-hand side vector
* @param[in] rhs Input right-hand side matrix (transposed)
* @param[in] bias Input bias
* @param[out] dst Output vector
* @param[in] dst_multiplier Output multiplier
* @param[in] dst_shift Output shift
* @param[in] rhs_cols Number of columns in the right-hand side input matrix
* @param[in] rhs_rows Number of rows in the right-hand side input matrix
* @param[in] activation_min Minimum value to clamp the output to. Range: int16
* @param[in] activation_max Maximum value to clamp the output to. Range: int16
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
*/
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16_s16(const int16_t *lhs,
const int16_t *rhs,
const int64_t *bias,
int16_t *dst,
const int32_t dst_multiplier,
const int32_t dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max);

/**
* @brief s8 Vector by Matrix (transposed) multiplication with s16 output
*
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Examples are Cortex-M55 or Cortex-M85 configured with MVE.
| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| TransposeConv2D | Yes | No | No | Yes | No | No | Yes | No | No |
| Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| Batch Matmul | Yes | Yes | No | Yes | Yes | No | Yes | Yes | No |
| Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
| Mul | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
| MaxPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
Expand Down
4 changes: 2 additions & 2 deletions Source/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# SPDX-FileCopyrightText: Copyright 2019-2022 Arm Limited and/or its affiliates <[email protected]>
# SPDX-FileCopyrightText: Copyright 2019-2022, 2024 Arm Limited and/or its affiliates <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
#
Expand All @@ -18,7 +18,7 @@

SET(ROOT ${CMSIS_PATH})

# Select which parts of the CMSIS-DSP must be compiled.
# Select which parts of the CMSIS-NN must be compiled.
# There are some dependencies between the parts but they are not tracked
# by this cmake. So, enabling some functions may require to enable some
# other ones.
Expand Down
105 changes: 105 additions & 0 deletions Source/FullyConnectedFunctions/arm_batch_matmul_s16.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
 * SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
 * Title:        arm_batch_matmul_s16.c
* Description: Batch matrix multiplication. Does not perform transposes, see header file for details.
*
* $Date: 19 June 2024
* $Revision: V.1.0.0
*
* Target : Arm(R) M-Profile Architecture
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
* @ingroup Public
*/

/**
* @addtogroup FC
* @{
*/

/*
 * s16 batch matrix multiplication
 * Computes output = input_lhs * input_rhs^T per batch. The RHS is expected
 * to already be transposed (rows of length rhs_cols), and the smaller input
 * is broadcast over the batch (n) and height (h) dimensions when the two
 * inputs disagree there.
 * Refer to header file for details.
 */
arm_cmsis_nn_status arm_batch_matmul_s16(const cmsis_nn_context *ctx,
                                         const cmsis_nn_bmm_params *bmm_params,
                                         const cmsis_nn_per_tensor_quant_params *quant_params,
                                         const cmsis_nn_dims *input_lhs_dims,
                                         const int16_t *input_lhs,
                                         const cmsis_nn_dims *input_rhs_dims,
                                         const int16_t *input_rhs,
                                         const cmsis_nn_dims *output_dims,
                                         int16_t *output)
{
    (void)ctx; /* No scratch buffer is used by this s16 variant. */
    const int32_t output_batch = output_dims->n;
    const int32_t output_height = output_dims->h;
    const int32_t lhs_rows = input_lhs_dims->w; /* Rows per LHS matrix */
    const int32_t rhs_rows = input_rhs_dims->w; /* Rows per (transposed) RHS matrix, i.e. output row width */
    const int32_t rhs_cols = input_rhs_dims->c; /* Shared inner dimension: LHS.C == RHS.C */

    /* Pointer step sizes implementing broadcast semantics: when one input has
       a smaller h (or n) extent than the other, it is rewound (or held in
       place) so its data is reused against every slice of the larger input.
       inner_* is applied after each output height step, outer_* after each
       output batch step. */
    const int32_t inner_lhs_diff = input_lhs_dims->h >= input_rhs_dims->h ? 0 : lhs_rows * rhs_cols;
    const int32_t inner_rhs_diff = input_rhs_dims->h >= input_lhs_dims->h ? rhs_rows * rhs_cols : 0;
    const int32_t outer_lhs_diff = input_lhs_dims->n >= input_rhs_dims->n
                                       ? inner_lhs_diff
                                       : -((lhs_rows * rhs_cols) - inner_lhs_diff) * input_lhs_dims->h;
    const int32_t outer_rhs_diff = input_rhs_dims->n >= input_lhs_dims->n ? (rhs_rows * rhs_cols) - inner_rhs_diff
                                                                          : -inner_rhs_diff * input_rhs_dims->h;

    /* Pre-scale the per-tensor multiplier for the s16 requantization path. */
    const int32_t reduced_multiplier = REDUCE_MULTIPLIER(quant_params->multiplier);

    for (int i_out_batch = 0; i_out_batch < output_batch; i_out_batch++)
    {
        for (int i_out_height = 0; i_out_height < output_height; i_out_height++)
        {
            /* One LHS row times the whole transposed RHS yields one output
               row of rhs_rows elements. Bias is NULL: batch matmul carries
               no bias tensor. */
            for (int j = 0; j < lhs_rows; j++)
            {
                arm_nn_vec_mat_mult_t_s16_s16(input_lhs,
                                              input_rhs,
                                              NULL,
                                              output,
                                              reduced_multiplier,
                                              quant_params->shift,
                                              rhs_cols,
                                              rhs_rows,
                                              bmm_params->fc_params.activation.min,
                                              bmm_params->fc_params.activation.max);
                input_lhs += rhs_cols; /* Advance to next LHS row */
                output += rhs_rows;
            }
            /* Rewind/advance inputs per broadcast rules (see diffs above). */
            input_lhs -= inner_lhs_diff;
            input_rhs += inner_rhs_diff;
        }
        input_lhs += outer_lhs_diff;
        input_rhs += outer_rhs_diff;
    }

    return ARM_CMSIS_NN_SUCCESS;
}

/**
* @} end of Doxygen group
*/
Loading