From 01410206f0a12113f175a1dda6aba9f0f3a52355 Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Thu, 15 Feb 2024 18:06:58 +0900 Subject: [PATCH 01/11] [ BLAS ] implement elementwise vector multiplication and addition in fp32 - rename ewvm, ewva to ele_mul and ele_add - implement for fp32 case as well - add scalar multiplier alpha, beta parameter **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/blas_interface.cpp | 68 +++++++++++++++++++---------- nntrainer/tensor/blas_interface.h | 55 +++++++++++++++++------ nntrainer/tensor/blas_neon.cpp | 34 ++++++++++++++- nntrainer/tensor/blas_neon.h | 34 ++++++++++++++- nntrainer/tensor/half_tensor.cpp | 5 +-- nntrainer/tensor/tensor.cpp | 4 +- 6 files changed, 154 insertions(+), 46 deletions(-) diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 680485ee1b..fd476e47f2 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -245,25 +245,6 @@ static void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, #endif } -static void ewvm_FP16(const unsigned int N, const _FP16 *X, const _FP16 *Y, - _FP16 *Z) { -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::ewvm(N, X, Y, Z); -#else - for (unsigned int i = 0; i < N; ++i) - Z[i] = X[i] * Y[i]; -#endif -} - -static void ewva_FP16(const unsigned int N, const _FP16 *X, const _FP16 *Y, - _FP16 *Z) { -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::ewva(N, X, Y, Z); -#else - for (unsigned int i = 0; i < N; ++i) - Z[i] = X[i] + Y[i]; -#endif -} void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) { unsigned int incx = abs(incX); @@ -400,12 +381,28 @@ void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, copy_int8_to_fp16(N, X, incX, Y, incY); } -void ewvm(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z) { - ewvm_FP16(N, X, Y, Z); +void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + _FP16 alpha, _FP16 beta) { +#if (defined USE__FP16 && USE_NEON) + nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); +#else + for (unsigned int i = 0; i < N; ++i) { + Z[i] *= beta; + Z[i] = alpha * X[i] * Y[i]; + } +#endif } -void ewva(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z) { - ewva_FP16(N, X, Y, Z); +void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + _FP16 alpha, _FP16 beta) { +#if (defined USE__FP16 && USE_NEON) + nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); +#else + for (unsigned int i = 0; i < N; ++i) { + Z[i] *= beta; + Z[i] = X[i] + alpha * Y[i]; + } +#endif } _FP16 snrm2(const int N, const _FP16 *X, const int incX) { @@ -904,4 +901,29 @@ void inv_sqrt_inplace(const unsigned int N, float *X) { #endif } +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta) { +#ifdef USE_NEON + nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); +#else + for (unsigned int i = 0; i < N; ++i) { + + Z[i] *= beta; + Z[i] = alpha * X[i] * Y[i]; + } +#endif +} + +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta) { +#ifdef USE_NEON + nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); +#else + for (unsigned int i = 0; i < N; ++i) { + Z[i] *= beta; + Z[i] = X[i] + alpha * Y[i]; + } +#endif +} + } // namespace nntrainer diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index 
a7211c3f74..a0e9d017c0 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -154,20 +154,26 @@ void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M, /** * @brief elementwise vector multiplication : Z = X ⊙ Y * @param[in] N length of the vector - * @param[in] X __fp16 * for Vector X - * @param[in] Y __fp16 * for Vector Y - * @param[in] Z __fp16 * for Vector Z - */ -void ewvm(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z); + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + _FP16 alpha = 1.f, _FP16 beta = 0.f); /** * @brief elementwise vector addition : Z = X + Y * @param[in] N length of the vector - * @param[in] X __fp16 * for Vector X - * @param[in] Y __fp16 * for Vector Y - * @param[in] Z __fp16 * for Vector Z - */ -void ewva(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z); + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + _FP16 alpha = 1.f, _FP16 beta = 0.f); /** * @brief isamax function : index of first maxima @@ -351,8 +357,7 @@ unsigned int isamax(const unsigned int N, const float *X, const int incX); * @param[in] Y float * for Vector Y * @param[in] alpha float * for scaling angle (radian) */ -void sine(const unsigned int N, float *X, float *Y, - float alpha = 1.0); +void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0); /** * @brief cosine with neon: Y = cos(alpha * X) @@ -361,8 +366,7 @@ void sine(const unsigned int N, float *X, float *Y, * @param[in] Y float * for Vector Y * @param[in] alpha float * for scaling angle (radian) */ -void cosine(const unsigned int N, float *X, float *Y, - float alpha = 1.0); +void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.0); /** * @brief inversed squared root transformation inplace : X = 1 / sqrt(X) @@ -371,6 +375,29 @@ void cosine(const unsigned int N, float *X, float *Y, * @param X float * for Vector X */ void inv_sqrt_inplace(const unsigned int N, float *X); +/** + * @brief elementwise vector multiplication : Z = X ⊙ Y + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); + +/** + * @brief elementwise vector addition : Z = X + Y + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); } /* namespace nntrainer */ #endif /* __cplusplus */ #endif /* __BLAS_INTERFACE_H__ */ diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index bcc2e7476a..0f922a4cd5 100644 --- 
a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -435,6 +435,36 @@ void inv_sqrt_inplace(const unsigned int N, float *X) { } } +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, float alpha, float beta) { + unsigned int i = 0; + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + float32x4_t z0_3 = vmulq_f32(x0_3, y0_3); + + vst1q_f32(&Z[i], z0_3); + } + while (i < N) { + Z[i] = X[i] * Y[i]; + ++i; + } +} + +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, float alpha, float beta) { + unsigned int i = 0; + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + float32x4_t z0_3 = vaddq_f32(x0_3, y0_3); + + vst1q_f32(&Z[i], z0_3); + } + while (i < N) { + Z[i] = X[i] + Y[i]; + ++i; + } +} + #ifdef ENABLE_FP16 void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows, @@ -2014,7 +2044,7 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M, } } -void ewvm(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z) { +void ele_mul(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, __fp16 alpha, __fp16 beta) { unsigned int i = 0; for (; N - i >= 8; i += 8) { float16x8_t x0_7 = vld1q_f16(&X[i]); @@ -2029,7 +2059,7 @@ void ewvm(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z) { } } -void ewva(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z) { +void ele_add(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, __fp16 alpha, __fp16 beta) { unsigned int i = 0; for (; N - i >= 8; i += 8) { float16x8_t x0_7 = vld1q_f16(&X[i]); diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index 2e30e2f944..116c04e04b 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -98,6 +98,30 @@ void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.0); */ void inv_sqrt_inplace(const unsigned int N, float *X); +/** + * @brief elementwise vector multiplication : Z = X ⊙ Y + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); + +/** + * @brief elementwise vector addition : Z = X + Y + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); + #ifdef ENABLE_FP16 /** * @brief hgemv computation with neon : Y = alpha*A*X + beta*Y @@ -118,16 +142,22 @@ void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows, * @param[in] X __fp16 * for Vector X * @param[in] Y __fp16 * for Vector Y * @param[in] Z __fp16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output */ -void ewvm(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z); +void ele_mul(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, + __fp16 alpha = 1.f, __fp16 beta = 0.f); /** * @brief elementwise vector 
addition with neon : Z = X + Y * @param[in] N length of the vector * @param[in] X __fp16 * for Vector X * @param[in] Y __fp16 * for Vector Y * @param[in] Z __fp16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output */ -void ewva(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z); +void ele_add(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, + __fp16 alpha = 1.f, __fp16 beta = 0.f); /** * @brief transposed hgemv computation with neon diff --git a/nntrainer/tensor/half_tensor.cpp b/nntrainer/tensor/half_tensor.cpp index 470f84489c..d0e25b4bf4 100644 --- a/nntrainer/tensor/half_tensor.cpp +++ b/nntrainer/tensor/half_tensor.cpp @@ -417,7 +417,7 @@ TensorV2 &HalfTensor::multiply(TensorV2 const &m, TensorV2 &output, _FP16 *out_buf) { if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1 && std::fpclassify(beta) == FP_ZERO) { - ewvm(e.buffer_size, buf, m_buf, out_buf); + ele_mul(e.buffer_size, buf, m_buf, out_buf); } else { for (unsigned int i = 0; i < e.buffer_size; ++i) { *out_buf = *buf * *m_buf + static_cast<_FP16>(beta) * *out_buf; @@ -510,8 +510,7 @@ TensorV2 &HalfTensor::add(TensorV2 const &m, TensorV2 &output, float const alpha) const { auto f = [&](const BroadcastInfoV2 &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { - if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && - std::fpclassify(alpha) == FP_ZERO) { + if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && alpha == 0) { ewva(e.buffer_size, buf, m_buf, out_buf); } else { for (unsigned int i = 0; i < e.buffer_size; ++i) { diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index 36014201cb..9cbfe08f22 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -852,7 +852,7 @@ Tensor &Tensor::multiply(Tensor const &m, Tensor &output, _FP16 *out_buf) { if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1 && beta == 0.0) { - ewvm(e.buffer_size, buf, m_buf, out_buf); + ele_mul(e.buffer_size, buf, m_buf, out_buf); } else { for (unsigned int i = 0; i < e.buffer_size; ++i) { *out_buf = *buf * *m_buf + static_cast<_FP16>(beta) * *out_buf; @@ -1095,7 +1095,7 @@ Tensor &Tensor::add(Tensor const &m, Tensor &output, float const alpha) const { _FP16 *out_buf) { if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && alpha == 0) { - ewva(e.buffer_size, buf, m_buf, out_buf); + ele_add(e.buffer_size, buf, m_buf, out_buf); } else { for (unsigned int i = 0; i < e.buffer_size; ++i) { *out_buf = *buf + *m_buf * static_cast<_FP16>(alpha); From 7c2d042d70be03f61ed9deed3a550fdec82ea144 Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Fri, 16 Feb 2024 10:53:40 +0900 Subject: [PATCH 02/11] [ BLAS ] Add scalar multiplier in elementwise operations - It is quite common to use scalar multiplier in elementwise addition and multiplication. - However, in case of multiplier beta, if the output vector Z is set to NaN, it might produce invalid values. **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. 
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/blas_interface.cpp | 28 +++++++--- nntrainer/tensor/blas_interface.h | 4 +- nntrainer/tensor/blas_neon.cpp | 82 ++++++++++++++++++++++------- nntrainer/tensor/blas_neon.h | 4 +- 4 files changed, 86 insertions(+), 32 deletions(-) diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index fd476e47f2..2cb0b24568 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -382,25 +382,37 @@ void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, } void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - _FP16 alpha, _FP16 beta) { + float alpha, float beta) { #if (defined USE__FP16 && USE_NEON) nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); #else - for (unsigned int i = 0; i < N; ++i) { - Z[i] *= beta; - Z[i] = alpha * X[i] * Y[i]; + if (beta != 0.f) { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i] + + static_cast<_FP16>(beta) * Z[i]; + } + } else { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i]; + } } #endif } void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - _FP16 alpha, _FP16 beta) { + float alpha, float beta) { #if (defined USE__FP16 && USE_NEON) nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); #else - for (unsigned int i = 0; i < N; ++i) { - Z[i] *= beta; - Z[i] = X[i] + alpha * Y[i]; + if (beta != 0.f) { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = X[i] + static_cast<_FP16>(alpha) * Y[i] + + static_cast<_FP16>(beta) * Z[i]; + } + } else { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = X[i] + static_cast<_FP16>(alpha) * Y[i]; + } } #endif } diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index a0e9d017c0..fa7afc63ca 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -161,7 +161,7 @@ void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M, * @param[in] beta scalar multiplier for output */ void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - _FP16 alpha = 1.f, _FP16 beta = 0.f); + float alpha = 1.f , float beta = 0.f); /** * @brief elementwise vector addition : Z = X + Y @@ -173,7 +173,7 @@ void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, * @param[in] beta scalar multiplier for output */ void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - _FP16 alpha = 1.f, _FP16 beta = 0.f); + float alpha = 1.f, float beta = 0.f); /** * @brief isamax function : index of first maxima diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index 0f922a4cd5..a9743f8f59 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -435,32 +435,52 @@ void inv_sqrt_inplace(const unsigned int N, float *X) { } } -void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, float alpha, float beta) { +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta) { unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); for (; N - i >= 4; i += 4) { float32x4_t x0_3 = vld1q_f32(&X[i]); float32x4_t y0_3 = vld1q_f32(&Y[i]); - float32x4_t z0_3 = vmulq_f32(x0_3, y0_3); - - vst1q_f32(&Z[i], z0_3); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = 
vmulq_f32(x0_3, y0_3); + if (beta != 0.f) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); } while (i < N) { - Z[i] = X[i] * Y[i]; + if (beta != 1.f) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; + else Z[i] = alpha * X[i] * Y[i]; ++i; } } -void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, float alpha, float beta) { +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta) { unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); for (; N - i >= 4; i += 4) { float32x4_t x0_3 = vld1q_f32(&X[i]); float32x4_t y0_3 = vld1q_f32(&Y[i]); - float32x4_t z0_3 = vaddq_f32(x0_3, y0_3); - - vst1q_f32(&Z[i], z0_3); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = vaddq_f32(x0_3, y0_3); + if (beta != 0.f) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); } while (i < N) { - Z[i] = X[i] + Y[i]; + if (beta != 1.f) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; + else Z[i] = X[i] + alpha * Y[i]; ++i; } } @@ -2044,32 +2064,54 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M, } } -void ele_mul(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, __fp16 alpha, __fp16 beta) { +void ele_mul(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, + float alpha, float beta) { unsigned int i = 0; + float16x8_t alpha_vec = vdupq_n_f16(alpha); + float16x8_t beta_vec = vdupq_n_f16(beta); for (; N - i >= 8; i += 8) { float16x8_t x0_7 = vld1q_f16(&X[i]); float16x8_t y0_7 = vld1q_f16(&Y[i]); - float16x8_t z0_7 = vmulq_f16(x0_7, y0_7); - - vst1q_f16(&Z[i], z0_7); + if (alpha != 1.f) { + y0_7 = vmulq_f16(y0_7, alpha_vec); + } + float16x8_t xy0_7 = vmulq_f16(x0_7, y0_7); + if (beta != 0.f) { + float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); + vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); + } else { + vst1q_f16(&Z[i], xy0_7); + } } while (i < N) { - Z[i] = X[i] * Y[i]; + if (beta != 1.f) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; + else Z[i] = alpha * X[i] * Y[i]; ++i; } } -void ele_add(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, __fp16 alpha, __fp16 beta) { +void ele_add(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, + float alpha, float beta) { unsigned int i = 0; + float16x8_t alpha_vec = vdupq_n_f16(alpha); + float16x8_t beta_vec = vdupq_n_f16(beta); for (; N - i >= 8; i += 8) { float16x8_t x0_7 = vld1q_f16(&X[i]); float16x8_t y0_7 = vld1q_f16(&Y[i]); - float16x8_t z0_7 = vaddq_f16(x0_7, y0_7); - - vst1q_f16(&Z[i], z0_7); + if (alpha != 1.f) { + y0_7 = vmulq_f16(y0_7, alpha_vec); + } + float16x8_t xy0_7 = vaddq_f16(x0_7, y0_7); + if (beta != 0.f) { + float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); + vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); + } else { + vst1q_f16(&Z[i], xy0_7); + } } while (i < N) { - Z[i] = X[i] + Y[i]; + if (beta != 1.f) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; + else Z[i] = X[i] + alpha * Y[i]; ++i; } } diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index 116c04e04b..5460a52625 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -146,7 +146,7 @@ void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows, * @param[in] beta scalar multiplier for output */ void ele_mul(const unsigned N, const 
__fp16 *X, const __fp16 *Y, __fp16 *Z, - __fp16 alpha = 1.f, __fp16 beta = 0.f); + float alpha = 1.f, float beta = 0.f); /** * @brief elementwise vector addition with neon : Z = X + Y * @param[in] N length of the vector @@ -157,7 +157,7 @@ void ele_mul(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, * @param[in] beta scalar multiplier for output */ void ele_add(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, - __fp16 alpha = 1.f, __fp16 beta = 0.f); + float alpha = 1.f, float beta = 0.f); /** * @brief transposed hgemv computation with neon From 57eeff33de7744f77f17d7dc7a9a99f8cab46908 Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Fri, 16 Feb 2024 14:23:05 +0900 Subject: [PATCH 03/11] [ NEON ] Add ele_sub and ele_div function structure - This commit introduces a basic structure of elementwise subtraction and division function structure - Function implementation will be added in a later commit **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/blas_neon.h | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index 5460a52625..b36c6b08ed 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -121,6 +121,30 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, */ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); +/** + * @brief elementwise vector subtraction with neon : Z = X + Y + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); + +/** + * @brief elementwise vector division with neon : Z = X + Y + * @note ZeroDivisionError is not guaranteed in this function + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); #ifdef ENABLE_FP16 /** @@ -159,6 +183,31 @@ void ele_mul(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, void ele_add(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, float alpha = 1.f, float beta = 0.f); +/** + * @brief elementwise vector subtraction with neon : Z = X + Y + * @param[in] N length of the vector + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + * @param[in] Z __fp16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_sub(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, + float alpha = 1.f, float beta = 0.f); + +/** + * @brief elementwise vector division with neon : Z = X + Y + * @note ZeroDivisionError is not guaranteed in this function + * @param[in] N length of the vector + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + * @param[in] Z __fp16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta 
scalar multiplier for output + */ +void ele_div(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, + float alpha = 1.f, float beta = 0.f); + /** * @brief transposed hgemv computation with neon * Y = alpha*transpose(A)*X From 4a3a56c1364a082e0eea43e976421761fd34544f Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Fri, 16 Feb 2024 14:28:14 +0900 Subject: [PATCH 04/11] [ trivial ] Fix formula in function brief of elementwise functions - Latest elementwise functions get alpha, beta as scalar multipliers. - With those parameters, the formulas in the function briefs can be described in a more precise way **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/blas_interface.h | 12 ++++++++---- nntrainer/tensor/blas_neon.h | 22 ++++++++++++++------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index fa7afc63ca..807a7c9567 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -152,7 +152,8 @@ void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M, const unsigned int lda, const _FP16 *X, const int incX, const float beta, _FP16 *Y, const int incY); /** - * @brief elementwise vector multiplication : Z = X ⊙ Y + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z * @param[in] N length of the vector * @param[in] X _FP16 * for Vector X * @param[in] Y _FP16 * for Vector Y * @param[in] Z _FP16 * for Vector Z * @param[in] alpha scalar multiplier for input * @param[in] beta scalar multiplier for output */ void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, float alpha = 1.f , float beta = 0.f); /** - * @brief elementwise vector addition : Z = X + Y + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z * @param[in] N length of the vector * @param[in] X _FP16 * for Vector X * @param[in] Y _FP16 * for Vector Y * @param[in] Z _FP16 * for Vector Z * @param[in] alpha scalar multiplier for input * @param[in] beta scalar multiplier for output */ void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, float alpha = 1.f, float beta = 0.f); /** * @brief isamax function : index of first maxima @@ -376,7 +378,8 @@ void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.0); */ void inv_sqrt_inplace(const unsigned int N, float *X); /** - * @brief elementwise vector multiplication : Z = X ⊙ Y + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z * @param[in] N length of the vector * @param[in] X float * for Vector X * @param[in] Y float * for Vector Y * @param[in] Z float * for Vector Z * @param[in] alpha scalar multiplier for input * @param[in] beta scalar multiplier for output */ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); /** - * @brief elementwise vector addition : Z = X + Y + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z * @param[in] N length of the vector * @param[in] X float * for Vector X * @param[in] Y float * for Vector Y * @param[in] Z float * for Vector Z * @param[in] alpha scalar multiplier for input * @param[in] beta scalar multiplier for output */ diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index b36c6b08ed..4a7af67e4e 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -99,7 +99,7 @@ void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.0); void inv_sqrt_inplace(const unsigned int N, float *X); /** - * @brief elementwise vector multiplication : Z = X ⊙ Y + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + beta * Z * @param[in] N length of the vector * @param[in] X float * for Vector X * @param[in] Y float * for Vector Y * @param[in] Z float * for Vector Z * @param[in] alpha scalar multiplier for input * @param[in] beta scalar multiplier for output */ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); /** - * @brief elementwise vector addition : Z = X + Y + * @brief elementwise vector addition : Z = X + alpha * Y + beta * Z * @param[in] N
length of the vector * @param[in] X float * for Vector X * @param[in] Y float * for Vector Y @@ -122,7 +122,8 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); /** - * @brief elementwise vector subtraction with neon : Z = X + Y + * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + + * beta * Z * @param[in] N length of the vector * @param[in] X float * for Vector X * @param[in] Y float * for Vector Y @@ -134,7 +135,8 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); /** - * @brief elementwise vector division with neon : Z = X + Y + * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta + * * Z * @note ZeroDivisionError is not guaranteed in this function * @param[in] N length of the vector * @param[in] X float * for Vector X @@ -161,7 +163,8 @@ void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows, uint32_t cols, float alpha, float beta); /** - * @brief elementwise vector multiplication with neon : Z = X ⊙ Y + * @brief elementwise vector multiplication with neon : Z = X ⊙ alpha * Y + + * beta * Z * @param[in] N length of the vector * @param[in] X __fp16 * for Vector X * @param[in] Y __fp16 * for Vector Y @@ -172,7 +175,8 @@ void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows, void ele_mul(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, float alpha = 1.f, float beta = 0.f); /** - * @brief elementwise vector addition with neon : Z = X + Y + * @brief elementwise vector addition with neon : Z = X + alpha * Y + beta * + * Z * @param[in] N length of the vector * @param[in] X __fp16 * for Vector X * @param[in] Y __fp16 * for Vector Y @@ -184,7 +188,8 @@ void ele_add(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, float alpha = 1.f, float beta = 0.f); /** - * @brief elementwise vector subtraction with neon : Z = X + Y + * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + + * beta * Z * @param[in] N length of the vector * @param[in] X __fp16 * for Vector X * @param[in] Y __fp16 * for Vector Y @@ -196,7 +201,8 @@ void ele_sub(const unsigned N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, float alpha = 1.f, float beta = 0.f); /** - * @brief elementwise vector division with neon : Z = X + Y + * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta + * * Z * @note ZeroDivisionError is not guaranteed in this function * @param[in] N length of the vector * @param[in] X __fp16 * for Vector X From 80c13ddedba07b70ab9a8b9acccc4ff69e0adf1b Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Fri, 16 Feb 2024 14:37:25 +0900 Subject: [PATCH 05/11] [ NEON ] Implement ele_sub, ele_div functions - Implement elementwise subtraction and division functions based on function structures proposed from the previous commit **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. 
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/blas_neon.cpp | 134 +++++++++++++++++++++++++++++++-- 1 file changed, 126 insertions(+), 8 deletions(-) diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index a9743f8f59..fc6f9ec5d5 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -454,8 +454,10 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 1.f) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; - else Z[i] = alpha * X[i] * Y[i]; + if (beta != 1.f) + Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; + else + Z[i] = alpha * X[i] * Y[i]; ++i; } } @@ -479,8 +481,64 @@ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 1.f) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; - else Z[i] = X[i] + alpha * Y[i]; + if (beta != 1.f) + Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; + else + Z[i] = X[i] + alpha * Y[i]; + ++i; + } +} + +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta) { + unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = vsubq_f32(x0_3, y0_3); + if (beta != 0.f) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); + } + while (i < N) { + if (beta != 1.f) + Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; + else + Z[i] = X[i] - alpha * Y[i]; + ++i; + } +} + +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta) { + unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = vdivq_f32(x0_3, y0_3); + if (beta != 0.f) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); + } + while (i < N) { + if (beta != 1.f) + Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; + else + Z[i] = X[i] / (alpha * Y[i]); ++i; } } @@ -2084,8 +2142,10 @@ void ele_mul(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 1.f) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; - else Z[i] = alpha * X[i] * Y[i]; + if (beta != 1.f) + Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; + else + Z[i] = alpha * X[i] * Y[i]; ++i; } } @@ -2110,8 +2170,66 @@ void ele_add(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 1.f) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; - else Z[i] = X[i] + alpha * Y[i]; + if (beta != 1.f) + Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; + else + Z[i] = X[i] + alpha * Y[i]; + ++i; + } +} + +void ele_sub(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, + float alpha, float beta) { + unsigned int i = 0; + float16x8_t alpha_vec = vdupq_n_f16(alpha); + float16x8_t beta_vec = vdupq_n_f16(beta); + for (; N - i >= 8; i += 8) { + float16x8_t x0_7 = vld1q_f16(&X[i]); + float16x8_t y0_7 = vld1q_f16(&Y[i]); + if (alpha != 1.f) { 
+ y0_7 = vmulq_f16(y0_7, alpha_vec); + } + float16x8_t xy0_7 = vsubq_f16(x0_7, y0_7); + if (beta != 0.f) { + float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); + vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); + } else { + vst1q_f16(&Z[i], xy0_7); + } + } + while (i < N) { + if (beta != 1.f) + Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; + else + Z[i] = X[i] - alpha * Y[i]; + ++i; + } +} + +void ele_div(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, + float alpha, float beta) { + unsigned int i = 0; + float16x8_t alpha_vec = vdupq_n_f16(alpha); + float16x8_t beta_vec = vdupq_n_f16(beta); + for (; N - i >= 8; i += 8) { + float16x8_t x0_7 = vld1q_f16(&X[i]); + float16x8_t y0_7 = vld1q_f16(&Y[i]); + if (alpha != 1.f) { + y0_7 = vmulq_f16(y0_7, alpha_vec); + } + float16x8_t xy0_7 = vdivq_f16(x0_7, y0_7); + if (beta != 0.f) { + float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); + vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); + } else { + vst1q_f16(&Z[i], xy0_7); + } + } + while (i < N) { + if (beta != 1.f) + Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; + else + Z[i] = X[i] / (alpha * Y[i]); + ++i; + } +} From b3a271cdd9e480df571fb49357a8fa84156eeed7 Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Fri, 16 Feb 2024 14:40:10 +0900 Subject: [PATCH 06/11] [ BLAS ] Add ele_sub and ele_div function structure - This commit introduces a basic structure of elementwise subtraction and division function structure in blas_interface - Function implementation will be added in a later commit **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/blas_interface.h | 54 ++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index 807a7c9567..7682ae4182 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -162,7 +162,7 @@ void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M, * @param[in] beta scalar multiplier for output */ void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - float alpha = 1.f , float beta = 0.f); + float alpha = 1.f, float beta = 0.f); /** * @brief elementwise vector addition : Z = X + alpha * Y + beta * * Z @@ -176,6 +176,32 @@ void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, */ void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, float alpha = 1.f, float beta = 0.f); +/** + * @brief elementwise vector subtraction : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f); + +/** + * @brief elementwise vector division : Z = X / (alpha * Y) + beta + * * Z + * @note ZeroDivisionError is not guaranteed in this function + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f); /**
* @brief isamax function : index of first maxima @@ -402,6 +428,32 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, */ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); +/** + * @brief elementwise vector subtraction : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); + +/** + * @brief elementwise vector division : Z = X / (alpha * Y) + beta + * * Z + * @note ZeroDivisionError is not guaranteed in this function + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); } /* namespace nntrainer */ #endif /* __cplusplus */ #endif /* __BLAS_INTERFACE_H__ */ From e9f43306d02d1d59fd9eb541532eaf5449568df6 Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Fri, 16 Feb 2024 14:44:34 +0900 Subject: [PATCH 07/11] [ BLAS ] Implement ele_sub, ele_div functions - Implement elementwise subtraction and division functions based on function structures proposed from the previous commit - with NEON, we can use the SIMD-accelerated functions from blas_neon (a short usage sketch follows below)
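For illustration only (this snippet is not part of the patch; the buffers and values are made up), a minimal sketch of what the scalar fallback of ele_div computes, i.e. Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]:

  // Hedged sketch of ele_div's fallback semantics; illustrative values only.
  #include <cstdio>

  int main() {
    const unsigned int N = 4;
    float X[N] = {2.f, 4.f, 6.f, 8.f};
    float Y[N] = {1.f, 2.f, 3.f, 4.f};
    float Z[N] = {1.f, 1.f, 1.f, 1.f};
    const float alpha = 2.f, beta = 1.f;
    for (unsigned int i = 0; i < N; ++i)
      Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; // every entry becomes 2.0
    std::printf("%.1f %.1f %.1f %.1f\n", Z[0], Z[1], Z[2], Z[3]);
    return 0;
  }

**Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2.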
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/blas_interface.cpp | 73 ++++++++++++++++++++++++++++- nntrainer/tensor/blas_neon.cpp | 16 +++---- 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 2cb0b24568..3ce60ec782 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -417,6 +417,42 @@ void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, #endif } +void ele_sub(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta) { +#if (defined USE__FP16 && USE_NEON) + nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); +#else + if (beta != 0.f) { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = X[i] - static_cast<_FP16>(alpha) * Y[i] + + static_cast<_FP16>(beta) * Z[i]; + } + } else { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = X[i] - static_cast<_FP16>(alpha) * Y[i]; + } + } +#endif +} + +void ele_div(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta) { +#if (defined USE__FP16 && USE_NEON) + nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); +#else + if (beta != 0.f) { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = X[i] / (static_cast<_FP16>(alpha) * Y[i]) + + static_cast<_FP16>(beta) * Z[i]; + } + } else { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = X[i] / (static_cast<_FP16>(alpha) * Y[i]); + } + } +#endif +} + _FP16 snrm2(const int N, const _FP16 *X, const int incX) { return snrm2_FP16(N, X, incX); } @@ -919,7 +955,6 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - Z[i] *= beta; Z[i] = alpha * X[i] * Y[i]; } @@ -938,4 +973,40 @@ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, #endif } +void ele_sub(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta) { +#ifdef USE_NEON + nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); +#else + if (beta != 0.f) { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = X[i] - alpha * Y[i] + + beta * Z[i]; + } + } else { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = X[i] - alpha * Y[i]; + } + } +#endif +} + +void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta) { +#ifdef USE_NEON + nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); +#else + if (beta != 0.f) { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = X[i] / (alpha * Y[i]) + + beta * Z[i]; + } + } else { + for (unsigned int i = 0; i < N; ++i) { + Z[i] = X[i] / (alpha * Y[i]); + } + } +#endif +} + } // namespace nntrainer diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index fc6f9ec5d5..bfe49d8422 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -454,7 +454,7 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 1.f) + if (beta != 0.f) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; else Z[i] = alpha * X[i] * Y[i]; @@ -481,7 +481,7 @@ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 1.f) + if (beta != 0.f) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] + alpha * Y[i]; @@ -508,7 +508,7 @@ void ele_sub(const unsigned N, const float *X, const float *Y, 
float *Z, vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 1.f) + if (beta != 0.f) Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] - alpha * Y[i]; @@ -535,7 +535,7 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 1.f) + if (beta != 0.f) Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; else Z[i] = X[i] / (alpha * Y[i]); @@ -2142,7 +2142,7 @@ void ele_mul(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 1.f) + if (beta != 0.f) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; else Z[i] = alpha * X[i] * Y[i]; @@ -2170,7 +2170,7 @@ void ele_add(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 1.f) + if (beta != 0.f) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] + alpha * Y[i]; @@ -2198,7 +2198,7 @@ void ele_sub(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 1.f) + if (beta != 0.f) Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] - alpha * Y[i]; @@ -2226,7 +2226,7 @@ void ele_div(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 1.f) + if (beta != 0.f) Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; else Z[i] = X[i] / (alpha * Y[i]); From 736be6160894db8786841ce06542050b00c0e83d Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Fri, 16 Feb 2024 14:55:03 +0900 Subject: [PATCH 08/11] [ bugfix/Tensor ] Wrong practice in elementwise vector addition's scalar multiplier - The fast path must compare alpha against its default value 1, not 0 **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/tensor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index 9cbfe08f22..b73ceef716 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -1076,7 +1076,7 @@ Tensor &Tensor::add(Tensor const &m, Tensor &output, float const alpha) const { auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, float *out_buf) { if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && - alpha == 0) { + alpha == 1.f) { std::transform(buf, buf + e.buffer_size, m_buf, out_buf, std::plus()); } else { @@ -1094,7 +1094,7 @@ Tensor &Tensor::add(Tensor const &m, Tensor &output, float const alpha) const { auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && - alpha == 0) { + alpha == 1.f) { ele_add(e.buffer_size, buf, m_buf, out_buf); } else { for (unsigned int i = 0; i < e.buffer_size; ++i) { *out_buf = *buf + *m_buf * static_cast<_FP16>(alpha); From 02d47e2b47c3ba9cdbbbe15b8f5f02a6e1a9ab04 Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Mon, 19 Feb 2024 09:37:22 +0900 Subject: [PATCH 09/11] [ BLAS ] Consider NaN case of output vector - With the default output scalar multiplier beta (= 0), the output vector Z may already hold NaN values, and computing beta * Z[i] would propagate them, so the beta == 0 path must not read Z (a short sketch follows below).
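A minimal standalone sketch of the hazard being avoided (not code from this patch; values are illustrative): multiplying by a zero beta does not sanitize NaN already stored in Z, because 0.f * NaN is NaN.

  // Hedged sketch: why the beta == 0 case must skip reading Z entirely.
  #include <cmath>
  #include <cstdio>

  int main() {
    const float x = 3.f, y = 2.f, z = NAN, alpha = 1.f, beta = 0.f;
    float fused = alpha * x * y + beta * z; // NaN: 0 * NaN == NaN
    float guarded = alpha * x * y;          // 6.0: the beta == 0 path
    std::printf("fused is NaN: %d, guarded: %.1f\n",
                std::isnan(fused), guarded);
    return 0;
  }

**Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2.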
Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/blas_interface.cpp | 73 ++++++++++++----------------- 1 file changed, 29 insertions(+), 44 deletions(-) diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 3ce60ec782..fbcf899ad5 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -386,15 +386,12 @@ void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, #if (defined USE__FP16 && USE_NEON) nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); #else - if (beta != 0.f) { - for (unsigned int i = 0; i < N; ++i) { + for (unsigned int i = 0; i < N; ++i) { + if (beta != 0.f) Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i] + static_cast<_FP16>(beta) * Z[i]; - } - } else { - for (unsigned int i = 0; i < N; ++i) { + else Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i]; - } } #endif } @@ -404,15 +401,12 @@ void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, #if (defined USE__FP16 && USE_NEON) nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); #else - if (beta != 0.f) { - for (unsigned int i = 0; i < N; ++i) { + for (unsigned int i = 0; i < N; ++i) { + if (beta != 0.f) Z[i] = X[i] + static_cast<_FP16>(alpha) * Y[i] + static_cast<_FP16>(beta) * Z[i]; - } - } else { - for (unsigned int i = 0; i < N; ++i) { + else Z[i] = X[i] + static_cast<_FP16>(alpha) * Y[i]; - } } #endif } @@ -422,15 +416,12 @@ void ele_sub(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, #if (defined USE__FP16 && USE_NEON) nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); #else - if (beta != 0.f) { - for (unsigned int i = 0; i < N; ++i) { + for (unsigned int i = 0; i < N; ++i) { + if (beta != 0.f) Z[i] = X[i] - static_cast<_FP16>(alpha) * Y[i] + static_cast<_FP16>(beta) * Z[i]; - } - } else { - for (unsigned int i = 0; i < N; ++i) { + else Z[i] = X[i] - static_cast<_FP16>(alpha) * Y[i]; - } } #endif } @@ -440,15 +431,12 @@ void ele_div(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, #if (defined USE__FP16 && USE_NEON) nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); #else - if (beta != 0.f) { - for (unsigned int i = 0; i < N; ++i) { + for (unsigned int i = 0; i < N; ++i) { + if (beta != 0.f) Z[i] = X[i] / (static_cast<_FP16>(alpha) * Y[i]) + static_cast<_FP16>(beta) * Z[i]; - } - } else { - for (unsigned int i = 0; i < N; ++i) { + else Z[i] = X[i] / (static_cast<_FP16>(alpha) * Y[i]); - } } #endif } @@ -955,8 +943,10 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - Z[i] *= beta; - Z[i] = alpha * X[i] * Y[i]; + if (beta != 0.f) + Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; + else + Z[i] = alpha * X[i] * Y[i]; } #endif } @@ -967,8 +957,10 @@ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - Z[i] *= beta; - Z[i] = X[i] + alpha * Y[i]; + if (beta != 0.f) + Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; + else + Z[i] = X[i] + alpha * Y[i]; } #endif } @@ -978,16 +970,13 @@ void ele_sub(const unsigned int N, const float *X, const float *Y, float *Z, #ifdef USE_NEON nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); #else - if (beta != 0.f) { - for (unsigned int i = 0; i < N; ++i) { - Z[i] = X[i] - alpha * Y[i] + - beta * Z[i]; - } - } else { - for (unsigned int i = 0; i < N; ++i) { + for (unsigned int i = 
0; i < N; ++i) { + if (beta != 0.f) + Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; + else Z[i] = X[i] - alpha * Y[i]; - } } + #endif } @@ -996,15 +985,11 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, #ifdef USE_NEON nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); #else - if (beta != 0.f) { - for (unsigned int i = 0; i < N; ++i) { - Z[i] = X[i] / (alpha * Y[i]) + - beta * Z[i]; - } - } else { - for (unsigned int i = 0; i < N; ++i) { + for (unsigned int i = 0; i < N; ++i) { + if (beta != 0.f) + Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; + else Z[i] = X[i] / (alpha * Y[i]); - } } #endif } From b6d29147539c4912d2be4c6705fad4d37acf9602 Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Mon, 19 Feb 2024 14:20:14 +0900 Subject: [PATCH 10/11] [ BLAS ] Fix beta comparing logic - According to discussions made from #2473, we found a better way of comparing float scalar multiplier using __FLT_MIN__ **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/blas_interface.cpp | 16 +++++++-------- nntrainer/tensor/blas_interface.h | 4 ++-- nntrainer/tensor/blas_neon.cpp | 32 ++++++++++++++--------------- nntrainer/tensor/blas_neon.h | 4 ++-- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index fbcf899ad5..2a190150fb 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -387,7 +387,7 @@ void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i] + static_cast<_FP16>(beta) * Z[i]; else @@ -402,7 +402,7 @@ void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] + static_cast<_FP16>(alpha) * Y[i] + static_cast<_FP16>(beta) * Z[i]; else @@ -417,7 +417,7 @@ void ele_sub(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] - static_cast<_FP16>(alpha) * Y[i] + static_cast<_FP16>(beta) * Z[i]; else @@ -432,7 +432,7 @@ void ele_div(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] / (static_cast<_FP16>(alpha) * Y[i]) + static_cast<_FP16>(beta) * Z[i]; else @@ -943,7 +943,7 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; else Z[i] = alpha * X[i] * Y[i]; @@ -957,7 +957,7 @@ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] + alpha * Y[i]; @@ -971,7 +971,7 @@ void 
ele_sub(const unsigned int N, const float *X, const float *Y, float *Z, nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] - alpha * Y[i]; @@ -986,7 +986,7 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; else Z[i] = X[i] / (alpha * Y[i]); diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index 7682ae4182..3e9ff0fcfc 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -385,7 +385,7 @@ unsigned int isamax(const unsigned int N, const float *X, const int incX); * @param[in] Y float * for Vector Y * @param[in] alpha float * for scaling angle (radian) */ -void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0); +void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); /** * @brief cosine with neon: Y = cos(alpha * X) @@ -394,7 +394,7 @@ void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0); * @param[in] Y float * for Vector Y * @param[in] alpha float * for scaling angle (radian) */ -void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.0); +void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); /** * @brief inversed squared root transformation inplace : X = 1 / sqrt(X) diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index bfe49d8422..4dc9708d56 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -447,14 +447,14 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, y0_3 = vmulq_f32(y0_3, alpha_vec); } float32x4_t xy0_3 = vmulq_f32(x0_3, y0_3); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); } else vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; else Z[i] = alpha * X[i] * Y[i]; @@ -474,14 +474,14 @@ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, y0_3 = vmulq_f32(y0_3, alpha_vec); } float32x4_t xy0_3 = vaddq_f32(x0_3, y0_3); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); } else vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] + alpha * Y[i]; @@ -501,14 +501,14 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, y0_3 = vmulq_f32(y0_3, alpha_vec); } float32x4_t xy0_3 = vsubq_f32(x0_3, y0_3); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); } else vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] - alpha * Y[i]; @@ -528,14 +528,14 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, y0_3 = vmulq_f32(y0_3, alpha_vec); } float32x4_t xy0_3 = vdivq_f32(x0_3, y0_3); - if (beta != 0.f) { + 
if (std::abs(beta) > __FLT_MIN__) { float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); } else vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; else Z[i] = X[i] / (alpha * Y[i]); @@ -2134,7 +2134,7 @@ void ele_mul(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, y0_7 = vmulq_f16(y0_7, alpha_vec); } float16x8_t xy0_7 = vmulq_f16(x0_7, y0_7); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); } else { @@ -2142,7 +2142,7 @@ void ele_mul(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; else Z[i] = alpha * X[i] * Y[i]; @@ -2162,7 +2162,7 @@ void ele_add(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, y0_7 = vmulq_f16(y0_7, alpha_vec); } float16x8_t xy0_7 = vaddq_f16(x0_7, y0_7); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); } else { @@ -2170,7 +2170,7 @@ void ele_add(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] + alpha * Y[i]; @@ -2190,7 +2190,7 @@ void ele_sub(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, y0_7 = vmulq_f16(y0_7, alpha_vec); } float16x8_t xy0_7 = vsubq_f16(x0_7, y0_7); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); } else { @@ -2198,7 +2198,7 @@ void ele_sub(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] - alpha * Y[i]; @@ -2218,7 +2218,7 @@ void ele_div(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, y0_7 = vmulq_f16(y0_7, alpha_vec); } float16x8_t xy0_7 = vdivq_f16(x0_7, y0_7); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); } else { @@ -2226,7 +2226,7 @@ void ele_div(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; else Z[i] = X[i] / (alpha * Y[i]); diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index 4a7af67e4e..92a48124f1 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -79,7 +79,7 @@ void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y); * @param[in] Y float * for Vector Y * @param[in] alpha float * for scaling angle (radian) */ -void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0); +void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); /** * @brief cosine with neon: Y = cos(alpha * X) @@ -88,7 +88,7 @@ void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0); * @param[in] Y float * for Vector Y * @param[in] alpha float * for scaling angle (radian) */ -void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.0); 
+void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); /** * @brief inversed squared root transformation with neon : X = 1 / sqrt(X) From 9a3f5e4c9b292fbf8fd06ef264788771ae68565a Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Fri, 23 Feb 2024 10:51:10 +0900 Subject: [PATCH 11/11] [ TensorV2 ] Apply changes made from ele-wise SIMD operations - Like commit #7363546, the alpha condition guarding ewva should compare against 1, not 0 (a short sketch of the intended fast path follows below). - Change function names: ew* -> ele_*
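For illustration only (the buffers below are made up, not from the patch), a minimal sketch of why the fast path may only fire when alpha == 1: with its default multipliers, ele_add computes a plain elementwise sum Z[i] = X[i] + 1 * Y[i].

  // Hedged sketch of the alpha == 1 fast-path semantics; illustrative only.
  #include <cassert>

  int main() {
    const unsigned int n = 3;
    float x[n] = {1.f, 2.f, 3.f}, y[n] = {4.f, 5.f, 6.f}, z[n] = {};
    const float alpha = 1.f; // only this value matches the plain-sum fast path
    for (unsigned int i = 0; i < n; ++i)
      z[i] = x[i] + alpha * y[i]; // same result the ele_add fast path produces
    assert(z[0] == 5.f && z[1] == 7.f && z[2] == 9.f);
    return 0;
  }

**Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/half_tensor.cpp | 4 ++-- nntrainer/tensor/tensor.cpp | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/nntrainer/tensor/half_tensor.cpp b/nntrainer/tensor/half_tensor.cpp index d0e25b4bf4..f352177bda 100644 --- a/nntrainer/tensor/half_tensor.cpp +++ b/nntrainer/tensor/half_tensor.cpp @@ -510,8 +510,8 @@ TensorV2 &HalfTensor::add(TensorV2 const &m, TensorV2 &output, float const alpha) const { auto f = [&](const BroadcastInfoV2 &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { - if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && alpha == 0) { - ewva(e.buffer_size, buf, m_buf, out_buf); + if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && alpha == 1) { + ele_add(e.buffer_size, buf, m_buf, out_buf); } else { for (unsigned int i = 0; i < e.buffer_size; ++i) { *out_buf = *buf + *m_buf * static_cast<_FP16>(alpha); diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index b73ceef716..a63f3216e5 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -127,8 +127,7 @@ class SrcSharedTensor { SrcSharedTensor() : src(nullptr), off(0) {} SrcSharedTensor(const Tensor *tensor, size_t offset) : - src(tensor), - off(offset) {} /** * @brief Get the allocated src tensor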