From 3d68b89ed919718fc8a3a0752dd8f23a1a94b580 Mon Sep 17 00:00:00 2001
From: Donghyeon Jeong
Date: Tue, 20 Feb 2024 17:59:08 +0900
Subject: [PATCH] [AVX] Added VCVT (between floating-point and integer) AVX
 support for x86 architecture

This pull request adds AVX-accelerated VCVT (vector convert) support for the
x86 architecture, enabling efficient conversion between single-precision and
half-precision floating-point data types using the AVX instruction set.

**Changes proposed in this PR:**
- Added new VCVT routines to the blas_interface
- Fixed incorrect meson.build logic for checking the gcc version and AVX support
- Added an enable-avx option to enable AVX hardware acceleration

**Self-evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Donghyeon Jeong
---
 meson.build                         |  13 ++--
 meson_options.txt                   |   1 +
 nntrainer/tensor/blas_avx.cpp       | 117 ++++++++++++++++++++++++++++
 nntrainer/tensor/blas_avx.h         |  46 +++++++++++
 nntrainer/tensor/blas_interface.cpp |  18 +++++
 nntrainer/tensor/meson.build        |   3 +
 6 files changed, 193 insertions(+), 5 deletions(-)
 create mode 100644 nntrainer/tensor/blas_avx.cpp
 create mode 100644 nntrainer/tensor/blas_avx.h

diff --git a/meson.build b/meson.build
index 7c7b78b552..a16361e4a8 100644
--- a/meson.build
+++ b/meson.build
@@ -80,13 +80,16 @@ if get_option('enable-fp16')
   elif arch == 'arm'
     error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
   else
-    has_avx512fp16 = cc.has_argument('-mavx512fp16')
-    if (has_avx512fp16)
-      # add_project_arguments(['-mavx512fp16'], language: ['c','cpp'])
-      message ('Float16 for x86_64 enabled. Modern gcc-x64 genrally supports float16 with _Float16. -mavx512fp16 added for hardware acceleration')
+    if cc.version().version_compare('>=12.1.0')
+      message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
       extra_defines += '-DENABLE_FP16=1'
+      if get_option('enable-avx')
+        extra_defines += '-DUSE_AVX=1'
+        add_project_arguments(['-march=native'], language: ['c','cpp'])
+        message('-march=native added for AVX hardware acceleration.')
+      endif
     else
-      warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for AVX512 FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
+      warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
     endif
   endif
 endif
diff --git a/meson_options.txt b/meson_options.txt
index df1002d04b..293c9690b1 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -39,6 +39,7 @@ option('enable-fp16', type: 'boolean', value: false)
 option('enable-cublas', type: 'boolean', value: false)
 option('enable-openmp', type: 'boolean', value: true)
 option('enable-neon', type: 'boolean', value: false)
+option('enable-avx', type: 'boolean', value: false)

 # ml-api dependency (to enable, install capi-inference from github.com/nnstreamer/api )
 # To inter-operate with nnstreamer and ML-API packages, you need to enable this.
diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp
new file mode 100644
index 0000000000..ce59583d6f
--- /dev/null
+++ b/nntrainer/tensor/blas_avx.cpp
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2023 Donghyeon Jeong
+ *
+ * @file   blas_avx.cpp
+ * @date   20 Feb 2024
+ * @see    https://github.com/nnstreamer/nntrainer
+ * @author Donghyeon Jeong
+ * @bug    No known bugs except for NYI items
+ * @brief  This is a source for AVX implementation
+ *
+ */
+
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <immintrin.h>
+
+#include <blas_avx.h>
+
+namespace nntrainer::avx {
+
+void vcvt_f16_f32(size_t N, const void *input, float *output) {
+  assert(N != 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  unsigned int idx = 0;
+  const _Float16 *data = (const _Float16 *)input;
+
+  // 16 half-precision floating point values to single-precision values
+  for (; N - idx >= 16; idx += 16) {
+    const __m256 vec0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)data));
+    const __m256 vec1 =
+      _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(data + 8)));
+    data += 16;
+
+    _mm256_storeu_ps(output, vec0);
+    _mm256_storeu_ps(output + 8, vec1);
+    output += 16;
+  }
+  // 8 half-precision floating point values to single-precision values
+  for (; N - idx >= 8; idx += 8) {
+    const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)data));
+    data += 8;
+
+    _mm256_storeu_ps(output, vec);
+    output += 8;
+  }
+  // remaining half-precision floating point values to single-precision values
+  if (N - idx > 0 && N - idx < 8) {
+    const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)data));
+    __m128 s_vec = _mm256_castps256_ps128(vec);
+    if ((N - idx) & 4) {
+      _mm_storeu_ps(output, s_vec);
+      s_vec = _mm256_extractf128_ps(vec, 1);
+      output += 4;
+    }
+    if ((N - idx) & 2) {
+      _mm_storel_pi((__m64 *)output, s_vec);
+      s_vec = _mm_movehl_ps(s_vec, s_vec);
+      output += 2;
+    }
+    if ((N - idx) & 1) {
+      _mm_store_ss(output, s_vec);
+    }
+  }
+}
+
+void vcvt_f32_f16(size_t N, const float *input, void *output) {
+  assert(N != 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  unsigned int idx = 0;
+  _Float16 *out_data = (_Float16 *)output;
+
+  // 16 single-precision floating point values to half-precision values
+  for (; N - idx >= 16; idx += 16) {
+    const __m256 vec0 = _mm256_loadu_ps(input);
+    const __m256 vec1 = _mm256_loadu_ps(input + 8);
+    input += 16;
+
+    _mm_storeu_si128((__m128i *)out_data,
+                     _mm256_cvtps_ph(vec0, _MM_FROUND_TO_NEAREST_INT));
+    _mm_storeu_si128((__m128i *)(out_data + 8),
+                     _mm256_cvtps_ph(vec1, _MM_FROUND_TO_NEAREST_INT));
+    out_data += 16;
+  }
+  // 8 single-precision floating point values to half-precision values
+  for (; N - idx >= 8; idx += 8) {
+    const __m256 vec = _mm256_loadu_ps(input);
+    input += 8;
+
+    _mm_storeu_si128((__m128i *)out_data,
+                     _mm256_cvtps_ph(vec, _MM_FROUND_TO_NEAREST_INT));
+    out_data += 8;
+  }
+  // 4 single-precision floating point values to half-precision values
+  for (; N - idx >= 4; idx += 4) {
+    const __m128 vec = _mm_loadu_ps(input);
+    input += 4;
+
+    _mm_storeu_si64((__m128i *)out_data,
+                    _mm_cvtps_ph(vec, _MM_FROUND_TO_NEAREST_INT));
+    out_data += 4;
+  }
+  // remaining single-precision floating point values to half-precision values
+  while (idx < N) {
+    *out_data = static_cast<_Float16>(*input);
+    ++out_data;
+    ++input;
+    ++idx;
+  }
+}
+
+} // namespace nntrainer::avx
diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h
new file mode 100644
index 0000000000..ab1270a208
--- /dev/null
+++ b/nntrainer/tensor/blas_avx.h
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2023 Donghyeon Jeong
+ *
+ * @file   blas_avx.h
+ * @date   20 Feb 2024
+ * @see    https://github.com/nnstreamer/nntrainer
+ * @author Donghyeon Jeong
+ * @bug    No known bugs except for NYI items
+ * @brief  This is a header for AVX implementation
+ *
+ */
+
+#ifndef __BLAS_AVX2_H_
+#define __BLAS_AVX2_H_
+#ifdef __cplusplus
+
+#include <cstddef>
+#include <cstdint>
+
+namespace nntrainer::avx {
+
+/**
+ * @brief Converts half-precision floating point values to single-precision
+ * floating point values.
+ *
+ * @param[in] N number of elements in input vector
+ * @param[in] input vector containing 16-bit floating point values
+ * @param[out] output vector containing single-precision floating point values
+ */
+void vcvt_f16_f32(size_t N, const void *input, float *output);
+
+/**
+ * @brief Converts single-precision floating point values to half-precision
+ * floating point values.
+ *
+ * @param[in] N number of elements in input vector
+ * @param[in] input vector containing single-precision floating point values
+ * @param[out] output vector containing 16-bit floating point values
+ */
+void vcvt_f32_f16(size_t N, const float *input, void *output);
+
+} // namespace nntrainer::avx
+
+#endif /* __cplusplus */
+#endif /* __BLAS_AVX2_H_ */
diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 2a190150fb..2c050b84f5 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -19,6 +19,10 @@
 #include <blas_neon.h>
 #endif

+#if USE_AVX
+#include <blas_avx.h>
+#endif
+
 #include <cmath>

 #define sgemv_loop(ci, cj, cM, cN) \
@@ -182,6 +186,13 @@ static void copy_float32_to_float16(const unsigned int N, const float *X,
     for (unsigned int i = 0; i < N; ++i)
       Y[i * incy] = X[i * incx];
   }
+#elif USE_AVX
+  if (incX == 1 && incY == 1) {
+    nntrainer::avx::vcvt_f32_f16(N, X, Y);
+  } else {
+    for (unsigned int i = 0; i < N; ++i)
+      Y[i * incy] = static_cast<_FP16>(X[i * incx]);
+  }
 #else
   for (unsigned int i = 0; i < N; ++i)
     Y[i * incy] = static_cast<_FP16>(X[i * incx]);
@@ -200,6 +211,13 @@ static void copy_float16_to_float32(const unsigned int N, const _FP16 *X,
     for (unsigned int i = 0; i < N; ++i)
       Y[i * incy] = X[i * incx];
   }
+#elif USE_AVX
+  if (incX == 1 && incY == 1) {
+    nntrainer::avx::vcvt_f16_f32(N, X, Y);
+  } else {
+    for (unsigned int i = 0; i < N; ++i)
+      Y[i * incy] = static_cast<float>(X[i * incx]);
+  }
 #else
   for (unsigned int i = 0; i < N; ++i)
     Y[i * incy] = static_cast<float>(X[i * incx]);
diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build
index 499eac24aa..7e8c000c6b 100644
--- a/nntrainer/tensor/meson.build
+++ b/nntrainer/tensor/meson.build
@@ -43,6 +43,9 @@ if get_option('enable-fp16')
       tensor_sources += 'blas_neon.cpp'
       tensor_headers += 'blas_neon.h'
     endif
+  elif get_option('enable-avx')
+    tensor_sources += 'blas_avx.cpp'
+    tensor_headers += 'blas_avx.h'
   endif
 endif
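
Not part of the patch itself: a minimal round-trip sketch of the new API, assuming a build configured with `-Denable-fp16=true -Denable-avx=true` on x86_64 with GCC 12+ (the combination this patch wires up). Buffer names and sizes are illustrative. The fp16 buffer is padded to a multiple of 8 because vcvt_f16_f32() loads a full 128-bit lane (8 halves) even in its remainder path.

```cpp
// Hypothetical test driver, not part of this patch. Requires GCC 12+ for
// _Float16 and a CPU with F16C/AVX; the meson change above builds the
// library with -march=native so the intrinsics are available.
#include <cstddef>
#include <cstdio>
#include <vector>

#include <blas_avx.h> // header added by this patch

int main() {
  const std::size_t N = 21; // not a multiple of 16/8/4, so tail paths run too

  std::vector<float> src(N), round_trip(N);
  // Pad to a multiple of 8: vcvt_f16_f32() reads 8 halves per 128-bit load,
  // including in its remainder path.
  std::vector<_Float16> half((N + 7) / 8 * 8);

  for (std::size_t i = 0; i < N; ++i)
    src[i] = 0.5f * static_cast<float>(i);

  nntrainer::avx::vcvt_f32_f16(N, src.data(), half.data());        // fp32 -> fp16
  nntrainer::avx::vcvt_f16_f32(N, half.data(), round_trip.data()); // fp16 -> fp32

  for (std::size_t i = 0; i < N; ++i)
    std::printf("%3zu: %g -> %g\n", i, src[i], round_trip[i]);
}
```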
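Because every fp16 value is exactly representable in fp32, and _mm256_cvtps_ph with _MM_FROUND_TO_NEAREST_INT rounds to nearest-even (the default rounding of a plain cast), a scalar reference suffices to unit-test both vectorized paths. The helpers below are hypothetical, not in the patch; they mirror the #else fallback loops in blas_interface.cpp for the unit-stride case.

```cpp
// Hypothetical scalar references for testing the nntrainer::avx::vcvt_* paths.
#include <cstddef>

void vcvt_f32_f16_ref(std::size_t N, const float *in, _Float16 *out) {
  for (std::size_t i = 0; i < N; ++i)
    out[i] = static_cast<_Float16>(in[i]); // round-to-nearest-even by default
}

void vcvt_f16_f32_ref(std::size_t N, const _Float16 *in, float *out) {
  for (std::size_t i = 0; i < N; ++i)
    out[i] = static_cast<float>(in[i]); // widening conversion: always exact
}
```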