Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Int8 and Int16 Batch Matmul support #139

Merged
merged 2 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions ARM.CMSIS-NN.pdsc
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_s8_to_s16_unordered_with_offset.c"/>
Expand All @@ -127,6 +128,8 @@
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s8_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_row_offset_s8_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_batch_matmul_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_batch_matmul_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s4.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s16.c"/>
Expand Down
5 changes: 3 additions & 2 deletions Include/Internal/arm_nn_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nn_compiler.h
* Description: Generic compiler header
*
* $Date: 16 January 2024
* $Revision: V.1.2.2
* $Date: 20 June 2024
* $Revision: V.1.3.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -189,6 +189,7 @@ __STATIC_FORCEINLINE uint8_t CLZ(uint32_t value)
// Common intrinsics
#define SMLABB __smlabb
#define SMLATT __smlatt
#define SMLALD __smlald
#define QADD __qadd
#define QSUB8 __qsub8
#define QSUB16 __qsub16
Expand Down
14 changes: 11 additions & 3 deletions Include/arm_nn_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
* Description: Public header file to contain the CMSIS-NN structs for the
* TensorFlowLite micro compliant functions
*
* $Date: 11 April 2024
* $Revision: V.3.2.0
* $Date: 19 June 2024
* $Revision: V.3.3.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -165,11 +165,19 @@ typedef struct
typedef struct
{
int32_t input_offset; /**< The negative of the zero value for the input tensor */
int32_t filter_offset; /**< The negative of the zero value for the filter tensor. Not used */
int32_t filter_offset; /**< The negative of the zero value for the filter tensor */
int32_t output_offset; /**< The negative of the zero value for the output tensor */
cmsis_nn_activation activation;
} cmsis_nn_fc_params;

/** CMSIS-NN object for Batch Matmul layer parameters */
typedef struct
{
    const bool adj_x; /**< Transpose (adjoint) flag for the LHS input. Currently unused by the batch matmul kernels. */
    const bool adj_y; /**< Transpose (adjoint) flag for the RHS input. Currently unused by the batch matmul kernels. */
    cmsis_nn_fc_params fc_params; /**< Fully-connected style parameters (offsets and activation range) reused by batch matmul */
} cmsis_nn_bmm_params;

/** CMSIS-NN object for SVDF layer parameters */
typedef struct
{
Expand Down
78 changes: 76 additions & 2 deletions Include/arm_nnfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 04 Jun 2024
* $Revision: V.16.1.0
* $Date: 19 June 2024
* $Revision: V.16.2.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -2604,6 +2604,80 @@ arm_cmsis_nn_status arm_lstm_unidirectional_s16(const int16_t *input,
const cmsis_nn_lstm_params *params,
cmsis_nn_lstm_context *buffers);

/**
* @brief Batch matmul function with 8 bit input and output.
ArmRyan marked this conversation as resolved.
Show resolved Hide resolved
*
* @param[in] ctx Temporary scratch buffer
* The caller is expected to clear the buffer, if applicable, for security reasons.
* Optional function arm_fully_connected_s8_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] bmm_params Batch matmul Parameters
* Adjoint flags are currently unused.
* @param[in] quant_params Quantization parameters
* @param[in] input_lhs_dims Input lhs tensor dimensions.
* This should be NHWC where lhs C = rhs C
* @param[in] input_lhs Pointer to input tensor
 * @param[in]   input_rhs_dims          Input rhs tensor dimensions.
* This is expected to be transposed so
* should be NHWC where lhs C = rhs C
* @param[in] input_rhs Pointer to transposed input tensor
* @param[in] output_dims Output tensor dimensions
* @param[out] output Pointer to the output tensor
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite Micro
* 2. Performs row * row matrix multiplication with the RHS transposed.
*
*/
arm_cmsis_nn_status arm_batch_matmul_s8(const cmsis_nn_context *ctx,
const cmsis_nn_bmm_params *bmm_params,
const cmsis_nn_per_tensor_quant_params *quant_params,
const cmsis_nn_dims *input_lhs_dims,
const int8_t *input_lhs,
const cmsis_nn_dims *input_rhs_dims,
const int8_t *input_rhs,
const cmsis_nn_dims *output_dims,
int8_t *output);

/**
* @brief Batch matmul function with 16 bit input and output.
*
* @param[in] ctx Temporary scratch buffer
* The caller is expected to clear the buffer, if applicable, for security reasons.
 *                                      Optional function arm_fully_connected_s16_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] bmm_params Batch matmul Parameters
* Adjoint flags are currently unused.
* @param[in] quant_params Quantization parameters
* @param[in] input_lhs_dims Input lhs tensor dimensions.
* This should be NHWC where LHS.C = RHS.C
* @param[in] input_lhs Pointer to input tensor
 * @param[in]   input_rhs_dims          Input rhs tensor dimensions.
* This is expected to be transposed so
* should be NHWC where LHS.C = RHS.C
* @param[in] input_rhs Pointer to transposed input tensor
* @param[in] output_dims Output tensor dimensions
* @param[out] output Pointer to the output tensor
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite Micro
* 2. Performs row * row matrix multiplication with the RHS transposed.
*
*/
arm_cmsis_nn_status arm_batch_matmul_s16(const cmsis_nn_context *ctx,
const cmsis_nn_bmm_params *bmm_params,
const cmsis_nn_per_tensor_quant_params *quant_params,
const cmsis_nn_dims *input_lhs_dims,
const int16_t *input_lhs,
const cmsis_nn_dims *input_rhs_dims,
const int16_t *input_rhs,
const cmsis_nn_dims *output_dims,
int16_t *output);

#ifdef __cplusplus
}
#endif
Expand Down
34 changes: 31 additions & 3 deletions Include/arm_nnsupportfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 27 May 2024
* $Revision: V.22.1.0
* $Date: 19 June 2024
* $Revision: V.22.2.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -704,7 +704,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
const int32_t rhs_offset);

/**
* @brief s16 Vector by Matrix (transposed) multiplication
* @brief s16 Vector by s8 Matrix (transposed) multiplication
*
* @param[in] lhs Input left-hand side vector
* @param[in] rhs Input right-hand side matrix (transposed)
Expand All @@ -731,6 +731,34 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16(const int16_t *lhs,
const int32_t activation_min,
const int32_t activation_max);

/**
* @brief s16 Vector by s16 Matrix (transposed) multiplication
*
* @param[in] lhs Input left-hand side vector
* @param[in] rhs Input right-hand side matrix (transposed)
* @param[in] bias Input bias
* @param[out] dst Output vector
* @param[in] dst_multiplier Output multiplier
* @param[in] dst_shift Output shift
* @param[in] rhs_cols Number of columns in the right-hand side input matrix
* @param[in] rhs_rows Number of rows in the right-hand side input matrix
* @param[in] activation_min Minimum value to clamp the output to. Range: int16
* @param[in] activation_max Maximum value to clamp the output to. Range: int16
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
*/
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16_s16(const int16_t *lhs,
const int16_t *rhs,
const int64_t *bias,
int16_t *dst,
const int32_t dst_multiplier,
const int32_t dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max);

/**
* @brief s8 Vector by Matrix (transposed) multiplication with s16 output
*
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Examples are Cortex-M55 or Cortex-M85 configured with MVE.
| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| TransposeConv2D | Yes | No | No | Yes | No | No | Yes | No | No |
| Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| Batch Matmul | Yes | Yes | No | Yes | Yes | No | Yes | Yes | No |
| Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
| Mul | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
| MaxPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
Expand Down
4 changes: 2 additions & 2 deletions Source/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# SPDX-FileCopyrightText: Copyright 2019-2022 Arm Limited and/or its affiliates <[email protected]>
# SPDX-FileCopyrightText: Copyright 2019-2022, 2024 Arm Limited and/or its affiliates <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
#
Expand All @@ -18,7 +18,7 @@

SET(ROOT ${CMSIS_PATH})

# Select which parts of the CMSIS-DSP must be compiled.
# Select which parts of the CMSIS-NN must be compiled.
# There are some dependencies between the parts but they are not tracked
# by this cmake. So, enabling some functions may require to enable some
# other ones.
Expand Down
105 changes: 105 additions & 0 deletions Source/FullyConnectedFunctions/arm_batch_matmul_s16.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
 * SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
 * Title:        arm_batch_matmul_s16.c
* Description: Batch matrix multiplication. Does not perform transposes, see header file for details.
*
* $Date: 19 June 2024
* $Revision: V.1.0.0
*
* Target : Arm(R) M-Profile Architecture
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
* @ingroup Public
*/

/**
* @addtogroup FC
* @{
*/

/*
 * s16 batch matrix multiplication
 * Computes output = input_lhs * input_rhs^T per batch. The RHS is expected
 * to already be transposed (rows of length rhs_cols), and the smaller input
 * is broadcast over the batch (n) and height (h) dimensions when the two
 * inputs disagree there.
 * Refer to header file for details.
 */
arm_cmsis_nn_status arm_batch_matmul_s16(const cmsis_nn_context *ctx,
                                         const cmsis_nn_bmm_params *bmm_params,
                                         const cmsis_nn_per_tensor_quant_params *quant_params,
                                         const cmsis_nn_dims *input_lhs_dims,
                                         const int16_t *input_lhs,
                                         const cmsis_nn_dims *input_rhs_dims,
                                         const int16_t *input_rhs,
                                         const cmsis_nn_dims *output_dims,
                                         int16_t *output)
{
    (void)ctx; /* No scratch buffer is used by this s16 variant. */
    const int32_t output_batch = output_dims->n;
    const int32_t output_height = output_dims->h;
    const int32_t lhs_rows = input_lhs_dims->w; /* Rows per LHS matrix */
    const int32_t rhs_rows = input_rhs_dims->w; /* Rows per (transposed) RHS matrix, i.e. output row width */
    const int32_t rhs_cols = input_rhs_dims->c; /* Shared inner dimension: LHS.C == RHS.C */

    /* Pointer step sizes implementing broadcast semantics: when one input has
       a smaller h (or n) extent than the other, it is rewound (or held in
       place) so its data is reused against every slice of the larger input.
       inner_* is applied after each output height step, outer_* after each
       output batch step. */
    const int32_t inner_lhs_diff = input_lhs_dims->h >= input_rhs_dims->h ? 0 : lhs_rows * rhs_cols;
    const int32_t inner_rhs_diff = input_rhs_dims->h >= input_lhs_dims->h ? rhs_rows * rhs_cols : 0;
    const int32_t outer_lhs_diff = input_lhs_dims->n >= input_rhs_dims->n
                                       ? inner_lhs_diff
                                       : -((lhs_rows * rhs_cols) - inner_lhs_diff) * input_lhs_dims->h;
    const int32_t outer_rhs_diff = input_rhs_dims->n >= input_lhs_dims->n ? (rhs_rows * rhs_cols) - inner_rhs_diff
                                                                          : -inner_rhs_diff * input_rhs_dims->h;

    /* Pre-scale the per-tensor multiplier for the s16 requantization path. */
    const int32_t reduced_multiplier = REDUCE_MULTIPLIER(quant_params->multiplier);

    for (int i_out_batch = 0; i_out_batch < output_batch; i_out_batch++)
    {
        for (int i_out_height = 0; i_out_height < output_height; i_out_height++)
        {
            /* One LHS row times the whole transposed RHS yields one output
               row of rhs_rows elements. Bias is NULL: batch matmul carries
               no bias tensor. */
            for (int j = 0; j < lhs_rows; j++)
            {
                arm_nn_vec_mat_mult_t_s16_s16(input_lhs,
                                              input_rhs,
                                              NULL,
                                              output,
                                              reduced_multiplier,
                                              quant_params->shift,
                                              rhs_cols,
                                              rhs_rows,
                                              bmm_params->fc_params.activation.min,
                                              bmm_params->fc_params.activation.max);
                input_lhs += rhs_cols; /* Advance to next LHS row */
                output += rhs_rows;
            }
            /* Rewind/advance inputs per broadcast rules (see diffs above). */
            input_lhs -= inner_lhs_diff;
            input_rhs += inner_rhs_diff;
        }
        input_lhs += outer_lhs_diff;
        input_rhs += outer_rhs_diff;
    }

    return ARM_CMSIS_NN_SUCCESS;
}

/**
* @} end of Doxygen group
*/
Loading