From 3d68b89ed919718fc8a3a0752dd8f23a1a94b580 Mon Sep 17 00:00:00 2001
From: Donghyeon Jeong
Date: Tue, 20 Feb 2024 17:59:08 +0900
Subject: [PATCH] [AVX] Added VCVT (between floating-point and integer) AVX
 support for x86 architecture

This pull request adds AVX-accelerated VCVT (vector convert) support for the
x86 architecture, enabling efficient conversion between single-precision and
half-precision floating-point data types using the AVX instruction set.

**Changes proposed in this PR:**
- Added new VCVT routines to the blas_interface
- Fixed incorrect meson.build logic for checking the gcc version and AVX support
- Added an enable-avx option to enable AVX hardware acceleration

**Self-evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Donghyeon Jeong
---
 meson.build                         |  13 ++--
 meson_options.txt                   |   1 +
 nntrainer/tensor/blas_avx.cpp       | 117 ++++++++++++++++++++++++++++
 nntrainer/tensor/blas_avx.h         |  46 +++++++++++
 nntrainer/tensor/blas_interface.cpp |  18 +++++
 nntrainer/tensor/meson.build        |   3 +
 6 files changed, 193 insertions(+), 5 deletions(-)
 create mode 100644 nntrainer/tensor/blas_avx.cpp
 create mode 100644 nntrainer/tensor/blas_avx.h

diff --git a/meson.build b/meson.build
index 7c7b78b552..a16361e4a8 100644
--- a/meson.build
+++ b/meson.build
@@ -80,13 +80,16 @@ if get_option('enable-fp16')
   elif arch == 'arm'
     error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
   else
-    has_avx512fp16 = cc.has_argument('-mavx512fp16')
-    if (has_avx512fp16)
-      # add_project_arguments(['-mavx512fp16'], language: ['c','cpp'])
-      message ('Float16 for x86_64 enabled. Modern gcc-x64 genrally supports float16 with _Float16. -mavx512fp16 added for hardware acceleration')
+    if cc.version().version_compare('>=12.1.0')
+      message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
       extra_defines += '-DENABLE_FP16=1'
+      if get_option('enable-avx')
+        extra_defines += '-DUSE_AVX=1'
+        add_project_arguments(['-march=native'], language: ['c','cpp'])
+        message('-march=native added for AVX hardware acceleration.')
+      endif
     else
-      warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for AVX512 FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
+      warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
     endif
   endif
 endif
diff --git a/meson_options.txt b/meson_options.txt
index df1002d04b..293c9690b1 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -39,6 +39,7 @@ option('enable-fp16', type: 'boolean', value: false)
 option('enable-cublas', type: 'boolean', value: false)
 option('enable-openmp', type: 'boolean', value: true)
 option('enable-neon', type: 'boolean', value: false)
+option('enable-avx', type: 'boolean', value: false)

 # ml-api dependency (to enable, install capi-inference from github.com/nnstreamer/api )
 # To inter-operate with nnstreamer and ML-API packages, you need to enable this.
diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp
new file mode 100644
index 0000000000..ce59583d6f
--- /dev/null
+++ b/nntrainer/tensor/blas_avx.cpp
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2023 Donghyeon Jeong
+ *
+ * @file   blas_avx.cpp
+ * @date   20 Feb 2024
+ * @see    https://github.com/nnstreamer/nntrainer
+ * @author Donghyeon Jeong
+ * @bug    No known bugs except for NYI items
+ * @brief  This is a source for AVX implementation
+ *
+ */
+
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <immintrin.h>
+
+#include <blas_avx.h>
+
+namespace nntrainer::avx {
+
+void vcvt_f16_f32(size_t N, const void *input, float *output) {
+  assert(N != 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  unsigned int idx = 0;
+  const _Float16 *data = (const _Float16 *)input;
+
+  // 16 half-precision floating point values to single-precision values
+  for (; N - idx >= 16; idx += 16) {
+    const __m256 vec0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)data));
+    const __m256 vec1 =
+      _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(data + 8)));
+    data += 16;
+
+    _mm256_storeu_ps(output, vec0);
+    _mm256_storeu_ps(output + 8, vec1);
+    output += 16;
+  }
+  // 8 half-precision floating point values to single-precision values
+  for (; N - idx >= 8; idx += 8) {
+    const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)data));
+    data += 8;
+
+    _mm256_storeu_ps(output, vec);
+    output += 8;
+  }
+  // remaining half-precision floating point values to single-precision values
+  if (N - idx > 0 && N - idx < 8) {
+    const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)data));
+    __m128 s_vec = _mm256_castps256_ps128(vec);
+    if ((N - idx) & 4) {
+      _mm_storeu_ps(output, s_vec);
+      s_vec = _mm256_extractf128_ps(vec, 1);
+      output += 4;
+    }
+    if ((N - idx) & 2) {
+      _mm_storel_pi((__m64 *)output, s_vec);
+      s_vec = _mm_movehl_ps(s_vec, s_vec);
+      output += 2;
+    }
+    if ((N - idx) & 1) {
+      _mm_store_ss(output, s_vec);
+    }
+  }
+}
+
+void vcvt_f32_f16(size_t N, const float *input, void *output) {
+  assert(N != 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  unsigned int idx = 0;
+  _Float16 *out_data = (_Float16 *)output;
+
+  // 16 single-precision floating point values to half-precision values
+  for (; N - idx >= 16; idx += 16) {
+    const __m256 vec0 = _mm256_loadu_ps(input);
+    const __m256 vec1 = _mm256_loadu_ps(input + 8);
+    input += 16;
+
+    _mm_storeu_si128((__m128i *)out_data,
+                     _mm256_cvtps_ph(vec0, _MM_FROUND_TO_NEAREST_INT));
+    _mm_storeu_si128((__m128i *)(out_data + 8),
+                     _mm256_cvtps_ph(vec1, _MM_FROUND_TO_NEAREST_INT));
+    out_data += 16;
+  }
+  // 8 single-precision floating point values to half-precision values
+  for (; N - idx >= 8; idx += 8) {
+    const __m256 vec = _mm256_loadu_ps(input);
+    input += 8;
+
+    _mm_storeu_si128((__m128i *)out_data,
+                     _mm256_cvtps_ph(vec, _MM_FROUND_TO_NEAREST_INT));
+    out_data += 8;
+  }
+  // 4 single-precision floating point values to half-precision values
+  for (; N - idx >= 4; idx += 4) {
+    const __m128 vec = _mm_loadu_ps(input);
+    input += 4;
+
+    _mm_storeu_si64((__m128i *)out_data,
+                    _mm_cvtps_ph(vec, _MM_FROUND_TO_NEAREST_INT));
+    out_data += 4;
+  }
+  // remaining single-precision floating point values to half-precision values
+  while (idx < N) {
+    *out_data = static_cast<_Float16>(*input);
+    ++out_data;
+    ++input;
+    ++idx;
+  }
+}
+
+} // namespace nntrainer::avx
diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h
new file mode 100644
index 0000000000..ab1270a208
--- /dev/null
+++ b/nntrainer/tensor/blas_avx.h
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2023 Donghyeon Jeong
+ *
+ * @file   blas_avx.h
+ * @date   20 Feb 2024
+ * @see    https://github.com/nnstreamer/nntrainer
+ * @author Donghyeon Jeong
+ * @bug    No known bugs except for NYI items
+ * @brief  This is a header for AVX implementation
+ *
+ */
+
+#ifndef __BLAS_AVX2_H_
+#define __BLAS_AVX2_H_
+#ifdef __cplusplus
+
+#include <cstddef>
+#include <cstdint>
+
+namespace nntrainer::avx {
+
+/**
+ * @brief Converts half-precision floating point values to single-precision
+ * floating point values.
+ *
+ * @param[in] N number of elements in input vector
+ * @param[in] input vector containing 16-bit floating point values
+ * @param[out] output vector containing single-precision floating point values
+ */
+void vcvt_f16_f32(size_t N, const void *input, float *output);
+
+/**
+ * @brief Converts single-precision floating point values to half-precision
+ * floating point values.
+ *
+ * @param[in] N number of elements in input vector
+ * @param[in] input vector containing single-precision floating point values
+ * @param[out] output vector containing 16-bit floating point values
+ */
+void vcvt_f32_f16(size_t N, const float *input, void *output);
+
+} // namespace nntrainer::avx
+
+#endif /* __cplusplus */
+#endif /* __BLAS_AVX2_H_ */
diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 2a190150fb..2c050b84f5 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -19,6 +19,10 @@
 #include <blas_neon.h>
 #endif

+#if USE_AVX
+#include <blas_avx.h>
+#endif
+
 #include <cmath>

 #define sgemv_loop(ci, cj, cM, cN) \
@@ -182,6 +186,13 @@ static void copy_float32_to_float16(const unsigned int N, const float *X,
     for (unsigned int i = 0; i < N; ++i)
       Y[i * incy] = X[i * incx];
   }
+#elif USE_AVX
+  if (incX == 1 && incY == 1) {
+    nntrainer::avx::vcvt_f32_f16(N, X, Y);
+  } else {
+    for (unsigned int i = 0; i < N; ++i)
+      Y[i * incy] = static_cast<_FP16>(X[i * incx]);
+  }
 #else
   for (unsigned int i = 0; i < N; ++i)
     Y[i * incy] = static_cast<_FP16>(X[i * incx]);
@@ -200,6 +211,13 @@ static void copy_float16_to_float32(const unsigned int N, const _FP16 *X,
     for (unsigned int i = 0; i < N; ++i)
       Y[i * incy] = X[i * incx];
   }
+#elif USE_AVX
+  if (incX == 1 && incY == 1) {
+    nntrainer::avx::vcvt_f16_f32(N, X, Y);
+  } else {
+    for (unsigned int i = 0; i < N; ++i)
+      Y[i * incy] = static_cast<float>(X[i * incx]);
+  }
 #else
   for (unsigned int i = 0; i < N; ++i)
     Y[i * incy] = static_cast<float>(X[i * incx]);
diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build
index 499eac24aa..7e8c000c6b 100644
--- a/nntrainer/tensor/meson.build
+++ b/nntrainer/tensor/meson.build
@@ -43,6 +43,9 @@ if get_option('enable-fp16')
       tensor_sources += 'blas_neon.cpp'
       tensor_headers += 'blas_neon.h'
     endif
+  elif get_option('enable-avx')
+    tensor_sources += 'blas_avx.cpp'
+    tensor_headers += 'blas_avx.h'
   endif
 endif
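
Not part of the patch itself: a minimal round-trip sketch of the new API, assuming a build configured with `-Denable-fp16=true -Denable-avx=true` on x86_64 with GCC 12+ (the combination this patch wires up). Buffer names and sizes are illustrative. The fp16 buffer is padded to a multiple of 8 because vcvt_f16_f32() loads a full 128-bit lane (8 halves) even in its remainder path.

```cpp
// Hypothetical test driver, not part of this patch. Requires GCC 12+ for
// _Float16 and a CPU with F16C/AVX; the meson change above builds the
// library with -march=native so the intrinsics are available.
#include <cstddef>
#include <cstdio>
#include <vector>

#include <blas_avx.h> // header added by this patch

int main() {
  const std::size_t N = 21; // not a multiple of 16/8/4, so tail paths run too

  std::vector<float> src(N), round_trip(N);
  // Pad to a multiple of 8: vcvt_f16_f32() reads 8 halves per 128-bit load,
  // including in its remainder path.
  std::vector<_Float16> half((N + 7) / 8 * 8);

  for (std::size_t i = 0; i < N; ++i)
    src[i] = 0.5f * static_cast<float>(i);

  nntrainer::avx::vcvt_f32_f16(N, src.data(), half.data());        // fp32 -> fp16
  nntrainer::avx::vcvt_f16_f32(N, half.data(), round_trip.data()); // fp16 -> fp32

  for (std::size_t i = 0; i < N; ++i)
    std::printf("%3zu: %g -> %g\n", i, src[i], round_trip[i]);
}
```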
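Because every fp16 value is exactly representable in fp32, and _mm256_cvtps_ph with _MM_FROUND_TO_NEAREST_INT rounds to nearest-even (the default rounding of a plain cast), a scalar reference suffices to unit-test both vectorized paths. The helpers below are hypothetical, not in the patch; they mirror the #else fallback loops in blas_interface.cpp for the unit-stride case.

```cpp
// Hypothetical scalar references for testing the nntrainer::avx::vcvt_* paths.
#include <cstddef>

void vcvt_f32_f16_ref(std::size_t N, const float *in, _Float16 *out) {
  for (std::size_t i = 0; i < N; ++i)
    out[i] = static_cast<_Float16>(in[i]); // round-to-nearest-even by default
}

void vcvt_f16_f32_ref(std::size_t N, const _Float16 *in, float *out) {
  for (std::size_t i = 0; i < N; ++i)
    out[i] = static_cast<float>(in[i]); // widening conversion: always exact
}
```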