[ BLAS ] Implement elementwise operations #2474

Merged
merged 11 commits into from
Feb 23, 2024
136 changes: 113 additions & 23 deletions nntrainer/tensor/blas_interface.cpp
@@ -245,25 +245,6 @@ static void copy_int8_to_fp16(const unsigned int N, const uint8_t *X,
#endif
}

static void ewvm_FP16(const unsigned int N, const _FP16 *X, const _FP16 *Y,
_FP16 *Z) {
#if (defined USE__FP16 && USE_NEON)
nntrainer::neon::ewvm(N, X, Y, Z);
#else
for (unsigned int i = 0; i < N; ++i)
Z[i] = X[i] * Y[i];
#endif
}

static void ewva_FP16(const unsigned int N, const _FP16 *X, const _FP16 *Y,
_FP16 *Z) {
#if (defined USE__FP16 && USE_NEON)
nntrainer::neon::ewva(N, X, Y, Z);
#else
for (unsigned int i = 0; i < N; ++i)
Z[i] = X[i] + Y[i];
#endif
}
void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) {
unsigned int incx = abs(incX);

@@ -400,12 +381,64 @@ void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
copy_int8_to_fp16(N, X, incX, Y, incY);
}

void ewvm(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z) {
ewvm_FP16(N, X, Y, Z);
void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
Contributor
Suggested change
void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
void elementwise_multiply(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,

one suggestion. how about renaming it to something clearer?

Member Author

I asked many contributors for their opinions offline, and a shorter function name was clearly preferred.
ele_* sounds clear enough to me.

float alpha, float beta) {
#if (defined USE__FP16 && USE_NEON)
nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (std::abs(beta) > __FLT_MIN__)
Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i] +
static_cast<_FP16>(beta) * Z[i];
else
Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i];
Comment on lines +390 to +394
Contributor

any reason to differentiate beta == 0 case?

Suggested change
if (std::abs(beta) > __FLT_MIN__)
Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i] +
static_cast<_FP16>(beta) * Z[i];
else
Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i];
Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i] + static_cast<_FP16>(beta) * Z[i];

Member Author
@skykongkong8 Feb 21, 2024

what if Z[i] != Z[i] ? (NaN in Z[i], or uninitialized Z)

Contributor

regardless of beta, wouldn't it cause an error anyway?

Contributor
@djeong20 Feb 21, 2024

if beta is zero, Z = X * Y + beta * Z would be Z = X * Y.
if not, Z = X * Y + beta * Z.

for the case where NaN is in Z[i] or uninitialized Z, it would cause an error.

Member Author

isn't NaN * 0 = NaN ?

Contributor

yeah, what I mean is that having the beta != 0 condition just to avoid NaN or uninitialized-value errors seems offbeat.

Contributor

maybe we could come up with a different way to handle these cases (e.g., check if tensor is initialized when using beta).

Member Author

Sounds more reasonable.
will try to fix it while resolving conflicts
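
A minimal standalone sketch (not part of the PR) of the behavior under discussion: with IEEE-754 floats, `beta * Z[i]` stays NaN even when `beta == 0`, so the branch avoids ever reading `Z`:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const float x = 2.f, y = 3.f;
  const float alpha = 1.f, beta = 0.f;
  float z = std::nanf(""); // stand-in for an uninitialized / NaN output slot

  // Unconditional form: beta * NaN is still NaN, so the NaN propagates.
  const float unconditional = alpha * x * y + beta * z;

  // Branched form as in the PR: Z is never read when beta == 0.
  const float branched = (std::abs(beta) > __FLT_MIN__)
                           ? alpha * x * y + beta * z
                           : alpha * x * y;

  std::printf("unconditional = %f, branched = %f\n", unconditional, branched);
  // prints: unconditional = nan, branched = 6.000000
}
```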

}
#endif
}

void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
float alpha, float beta) {
#if (defined USE__FP16 && USE_NEON)
nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] + static_cast<_FP16>(alpha) * Y[i] +
static_cast<_FP16>(beta) * Z[i];
else
Z[i] = X[i] + static_cast<_FP16>(alpha) * Y[i];
}
#endif
}

void ele_sub(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
float alpha, float beta) {
#if (defined USE__FP16 && USE_NEON)
nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] - static_cast<_FP16>(alpha) * Y[i] +
static_cast<_FP16>(beta) * Z[i];
else
Z[i] = X[i] - static_cast<_FP16>(alpha) * Y[i];
}
#endif
}

void ewva(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z) {
ewva_FP16(N, X, Y, Z);
void ele_div(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
float alpha, float beta) {
#if (defined USE__FP16 && USE_NEON)
nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] / (static_cast<_FP16>(alpha) * Y[i]) +
static_cast<_FP16>(beta) * Z[i];
else
Z[i] = X[i] / (static_cast<_FP16>(alpha) * Y[i]);
}
#endif
}

_FP16 snrm2(const int N, const _FP16 *X, const int incX) {
@@ -904,4 +937,61 @@ void inv_sqrt_inplace(const unsigned int N, float *X) {
#endif
}

void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
float alpha, float beta) {
#ifdef USE_NEON
nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (std::abs(beta) > __FLT_MIN__)
Z[i] = alpha * X[i] * Y[i] + beta * Z[i];
else
Z[i] = alpha * X[i] * Y[i];
}
#endif
}

void ele_add(const unsigned int N, const float *X, const float *Y, float *Z,
float alpha, float beta) {
#ifdef USE_NEON
nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] + alpha * Y[i] + beta * Z[i];
else
Z[i] = X[i] + alpha * Y[i];
}
#endif
}

void ele_sub(const unsigned int N, const float *X, const float *Y, float *Z,
float alpha, float beta) {
#ifdef USE_NEON
nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] - alpha * Y[i] + beta * Z[i];
else
Z[i] = X[i] - alpha * Y[i];
}

#endif
}

void ele_div(const unsigned int N, const float *X, const float *Y, float *Z,
float alpha, float beta) {
#ifdef USE_NEON
nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i];
else
Z[i] = X[i] / (alpha * Y[i]);
}
#endif
}

} // namespace nntrainer
113 changes: 98 additions & 15 deletions nntrainer/tensor/blas_interface.h
@@ -152,22 +152,56 @@ void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
const unsigned int lda, const _FP16 *X, const int incX,
const float beta, _FP16 *Y, const int incY);
/**
* @brief elementwise vector multiplication : Z = X ⊙ Y
* @brief elementwise vector multiplication : Z = X ⊙ alpha * Y +
* beta * Z
* @param[in] N length of the vector
* @param[in] X __fp16 * for Vector X
* @param[in] Y __fp16 * for Vector Y
* @param[in] Z __fp16 * for Vector Z
*/
void ewvm(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z);
* @param[in] X _FP16 * for Vector X
* @param[in] Y _FP16 * for Vector Y
* @param[in] Z _FP16 * for Vector Z
* @param[in] alpha scalar multiplier for input
Contributor

quick question. it seems like scalars are added only for Y and Z. wouldn't there be cases where X also needs a scalar?

Contributor

also, why is scalar added in the first place?

Member Author
@skykongkong8 Feb 21, 2024

  1. Good point. Usually, strict BLAS functions only take $X$ and $Y$, not $Z$, so it would go like: $X = X * \alpha (op) Y * \beta$. But the current function usage in NNTrainer requires all of $X, Y, Z$. I thought of adding a new scalar multiplier $\gamma$, but there is no such case for now; we can add it whenever we need it.
  2. In the tensor operations, we differentiate whether to use std::transform or ele_* by stride and alpha / beta. For broader use of this function, the scalar multiplier is necessary.

Contributor

thanks for clarification :)
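
To make the alpha/beta semantics concrete, a hypothetical caller sketch (the buffer names and the wrapper function are illustrative, not from the PR; only the `ele_*` signatures come from this header):

```cpp
#include "blas_interface.h" // nntrainer

// Assumes x, y, z each point to n valid floats, and that z is initialized
// whenever a non-zero beta reads it.
void example(unsigned int n, const float *x, const float *y, float *z) {
  nntrainer::ele_mul(n, x, y, z);           // z = x * y       (alpha=1, beta=0)
  nntrainer::ele_add(n, x, y, z, 0.5f);     // z = x + 0.5 * y
  nntrainer::ele_sub(n, x, y, z, 1.f, 1.f); // z = x - y + z   (accumulates)
  nntrainer::ele_div(n, x, y, z, 2.f);      // z = x / (2 * y)
}
```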

* @param[in] beta scalar multiplier for output
*/
void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
float alpha = 1.f, float beta = 0.f);

/**
* @brief elementwise vector addition : Z = X + Y
* @brief elementwise vector addition : Z = X + alpha * Y + beta *
* Z
* @param[in] N length of the vector
* @param[in] X __fp16 * for Vector X
* @param[in] Y __fp16 * for Vector Y
* @param[in] Z __fp16 * for Vector Z
* @param[in] X _FP16 * for Vector X
* @param[in] Y _FP16 * for Vector Y
* @param[in] Z _FP16 * for Vector Z
* @param[in] alpha scalar multiplier for input
* @param[in] beta scalar multiplier for output
*/
void ewva(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z);
void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
float alpha = 1.f, float beta = 0.f);
/**
* @brief elementwise vector subtraction with neon : Z = X - alpha * Y +
* beta * Z
* @param[in] N length of the vector
* @param[in] X _FP16 * for Vector X
* @param[in] Y _FP16 * for Vector Y
* @param[in] Z _FP16 * for Vector Z
* @param[in] alpha scalar multiplier for input
* @param[in] beta scalar multiplier for output
*/
void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
float alpha = 1.f, float beta = 0.f);

/**
* @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta
* * Z
* @note ZeroDivisionError is not guaranteed in this function
* @param[in] N length of the vector
* @param[in] X _FP16 * for Vector X
* @param[in] Y _FP16 * for Vector Y
* @param[in] Z _FP16 * for Vector Z
* @param[in] alpha scalar multiplier for input
* @param[in] beta scalar multiplier for output
*/
void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
float alpha = 1.f, float beta = 0.f);
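
Regarding the @note above, a small sketch of what an unguarded zero in Y produces (this assumes typical IEEE-754 hardware and is not part of the PR):

```cpp
#include <cstdio>

int main() {
  const float x = 1.f, y = 0.f;
  // On IEEE-754 platforms float division by zero does not trap: it yields
  // +/-inf (or NaN for 0/0), and ele_div performs no divisor checks.
  std::printf("%f %f\n", x / y, 0.f / y); // prints: inf nan
}
```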

/**
* @brief isamax function : index of first maxima
Expand Down Expand Up @@ -351,8 +385,7 @@ unsigned int isamax(const unsigned int N, const float *X, const int incX);
* @param[in] Y float * for Vector Y
* @param[in] alpha float * for scaling angle (radian)
*/
void sine(const unsigned int N, float *X, float *Y,
float alpha = 1.0);
void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f);

/**
* @brief cosine with neon: Y = cos(alpha * X)
@@ -361,8 +394,7 @@ void sine(const unsigned int N, float *X, float *Y,
* @param[in] Y float * for Vector Y
* @param[in] alpha float * for scaling angle (radian)
*/
void cosine(const unsigned int N, float *X, float *Y,
float alpha = 1.0);
void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f);

/**
* @brief inversed squared root transformation inplace : X = 1 / sqrt(X)
@@ -371,6 +403,57 @@ void cosine(const unsigned int N, float *X, float *Y,
* @param X float * for Vector X
*/
void inv_sqrt_inplace(const unsigned int N, float *X);
/**
* @brief elementwise vector multiplication : Z = X ⊙ alpha * Y +
* beta * Z
* @param[in] N length of the vector
* @param[in] X float * for Vector X
* @param[in] Y float * for Vector Y
* @param[in] Z float * for Vector Z
* @param[in] alpha scalar multiplier for input
* @param[in] beta scalar multiplier for output
*/
void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
float alpha = 1.f, float beta = 0.f);

/**
* @brief elementwise vector addition : Z = X + alpha * Y + beta *
* Z
* @param[in] N length of the vector
* @param[in] X float * for Vector X
* @param[in] Y float * for Vector Y
* @param[in] Z float * for Vector Z
* @param[in] alpha scalar multiplier for input
* @param[in] beta scalar multiplier for output
*/
void ele_add(const unsigned int N, const float *X, const float *Y, float *Z,
float alpha = 1.f, float beta = 0.f);
/**
* @brief elementwise vector subtraction with neon : Z = X - alpha * Y +
* beta * Z
* @param[in] N length of the vector
* @param[in] X float * for Vector X
* @param[in] Y float * for Vector Y
* @param[in] Z float * for Vector Z
* @param[in] alpha scalar multiplier for input
* @param[in] beta scalar multiplier for output
*/
void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
float alpha = 1.f, float beta = 0.f);

/**
* @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta
* * Z
* @note ZeroDivisionError is not guaranteed in this function
* @param[in] N length of the vector
* @param[in] X float * for Vector X
* @param[in] Y float * for Vector Y
* @param[in] Z float * for Vector Z
* @param[in] alpha scalar multiplier for input
* @param[in] beta scalar multiplier for output
*/
void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
float alpha = 1.f, float beta = 0.f);
} /* namespace nntrainer */
#endif /* __cplusplus */
#endif /* __BLAS_INTERFACE_H__ */