[AVX] Added VCVT (between half-precision and single-precision floating-point) AVX support for x86 architecture

This pull request adds VCVT (between half-precision and single-precision floating-point) AVX support for the x86 architecture.
This feature allows efficient conversion between single-precision floating-point and half-precision floating-point data types using the AVX instruction set.

**Changes proposed in this PR:**
- Added new VCVT instructions to the blas_interface
- Fixed incorrect meson.build logic for checking the GCC version and AVX support
- Added an enable-avx option to enable AVX hardware acceleration

**Self-evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test:   [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Donghyeon Jeong <[email protected]>
djeong20 authored and myungjoo committed Feb 27, 2024
1 parent 789f07b commit 3d68b89
Showing 6 changed files with 193 additions and 5 deletions.
13 changes: 8 additions & 5 deletions meson.build
@@ -80,13 +80,16 @@ if get_option('enable-fp16')
   elif arch == 'arm'
     error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
   else
-    has_avx512fp16 = cc.has_argument('-mavx512fp16')
-    if (has_avx512fp16)
-      # add_project_arguments(['-mavx512fp16'], language: ['c','cpp'])
-      message ('Float16 for x86_64 enabled. Modern gcc-x64 genrally supports float16 with _Float16. -mavx512fp16 added for hardware acceleration')
+    if cc.version().version_compare('>=12.1.0')
+      message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
       extra_defines += '-DENABLE_FP16=1'
+      if get_option('enable-avx')
+        extra_defines += '-DUSE_AVX=1'
+        add_project_arguments(['-march=native'], language: ['c','cpp'])
+        message('-march=native added for AVX hardware acceleration.')
+      endif
     else
-      warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for AVX512 FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
+      warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
     endif
   endif
 endif
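Editor's note on the gate above: from GCC 12.1, x86_64 builds treat _Float16 as a native arithmetic type, so the old -mavx512fp16 probe is not needed merely to compile fp16 code; -march=native then lets the compiler use the host's F16C/AVX conversion instructions. With the new option, such a build would be configured along the lines of `meson setup build -Denable-fp16=true -Denable-avx=true`. A minimal sketch (not part of this commit) of the _Float16 support being tested for:

// Minimal sketch, not part of this commit: what the GCC >= 12.1 gate checks for.
// Build (assumption): g++-12 -march=native demo.cpp
#include <cstdio>

int main() {
  _Float16 h = static_cast<_Float16>(0.1f); // native half-precision type on GCC 12+/x86_64
  float f = static_cast<float>(h);          // widening back to float
  std::printf("%.8f\n", f);                 // ~0.09997559: fp16 keeps only ~11 significand bits
  return 0;
}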
1 change: 1 addition & 0 deletions meson_options.txt
@@ -39,6 +39,7 @@ option('enable-fp16', type: 'boolean', value: false)
 option('enable-cublas', type: 'boolean', value: false)
 option('enable-openmp', type: 'boolean', value: true)
 option('enable-neon', type: 'boolean', value: false)
+option('enable-avx', type: 'boolean', value: false)
 
 # ml-api dependency (to enable, install capi-inference from github.com/nnstreamer/api )
 # To inter-operate with nnstreamer and ML-API packages, you need to enable this.
117 changes: 117 additions & 0 deletions nntrainer/tensor/blas_avx.cpp
@@ -0,0 +1,117 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2023 Donghyeon Jeong <[email protected]>
*
* @file blas_avx.cpp
* @date 20 Feb 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Donghyeon Jeong <[email protected]>
* @bug No known bugs except for NYI items
* @brief This is a source for AVX implementation
*
*/

#include <cassert>
#include <chrono>
#include <cstdint>
#include <immintrin.h>

#include <blas_avx.h>

namespace nntrainer::avx {

void vcvt_f16_f32(size_t N, const void *input, float *output) {
  assert(N != 0);
  assert(input != NULL);
  assert(output != NULL);

  size_t idx = 0;
  const _Float16 *data = (const _Float16 *)input;

  // 16 half-precision floating point values to single-precision values
  for (; N - idx >= 16; idx += 16) {
    const __m256 vec0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)data));
    const __m256 vec1 =
      _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(data + 8)));
    data += 16;

    _mm256_storeu_ps(output, vec0);
    _mm256_storeu_ps(output + 8, vec1);
    output += 16;
  }
  // 8 half-precision floating point values to single-precision values
  for (; N - idx >= 8; idx += 8) {
    const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)data));
    data += 8;

    _mm256_storeu_ps(output, vec);
    output += 8;
  }
  // remaining half-precision floating point values to single-precision values.
  // The leftovers are first copied into a zero-padded buffer: a full 128-bit
  // load straight from `data` would read past the end of the input when fewer
  // than 8 values remain.
  if (N - idx > 0) {
    _Float16 rem[8] = {};
    for (size_t i = 0; i < N - idx; ++i)
      rem[i] = data[i];
    const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)rem));
    __m128 s_vec = _mm256_castps256_ps128(vec);
    if ((N - idx) & 4) {
      _mm_storeu_ps(output, s_vec);
      s_vec = _mm256_extractf128_ps(vec, 1);
      output += 4;
    }
    if ((N - idx) & 2) {
      _mm_storel_pi((__m64 *)output, s_vec);
      s_vec = _mm_movehl_ps(s_vec, s_vec);
      output += 2;
    }
    if ((N - idx) & 1) {
      _mm_store_ss(output, s_vec);
    }
  }
}

void vcvt_f32_f16(size_t N, const float *input, void *output) {
  assert(N != 0);
  assert(input != NULL);
  assert(output != NULL);

  size_t idx = 0;
  _Float16 *out_data = (_Float16 *)output;

  // 16 single-precision floating point values to half-precision values
  for (; N - idx >= 16; idx += 16) {
    const __m256 vec0 = _mm256_loadu_ps(input);
    const __m256 vec1 = _mm256_loadu_ps(input + 8);
    input += 16;

    _mm_storeu_si128((__m128i *)out_data,
                     _mm256_cvtps_ph(vec0, _MM_FROUND_TO_NEAREST_INT));
    _mm_storeu_si128((__m128i *)(out_data + 8),
                     _mm256_cvtps_ph(vec1, _MM_FROUND_TO_NEAREST_INT));
    out_data += 16;
  }
  // 8 single-precision floating point values to half-precision values
  for (; N - idx >= 8; idx += 8) {
    const __m256 vec = _mm256_loadu_ps(input);
    input += 8;

    _mm_storeu_si128((__m128i *)out_data,
                     _mm256_cvtps_ph(vec, _MM_FROUND_TO_NEAREST_INT));
    out_data += 8;
  }
  // 4 single-precision floating point values to half-precision values
  for (; N - idx >= 4; idx += 4) {
    const __m128 vec = _mm_loadu_ps(input);
    input += 4;

    _mm_storeu_si64((__m128i *)out_data,
                    _mm_cvtps_ph(vec, _MM_FROUND_TO_NEAREST_INT));
    out_data += 4;
  }
  // remaining single-precision floating point values to half-precision values
  while (idx < N) {
    *out_data = static_cast<_Float16>(*input);
    ++out_data;
    ++input;
    ++idx;
  }
}

} // namespace nntrainer::avx
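Reviewer note: both converters above are built on the F16C instruction pair behind _mm256_cvtph_ps / _mm256_cvtps_ph, which moves 8 values per instruction. A standalone sketch of just that pair (an illustration, not part of this commit; assumes an F16C-capable CPU and a flag such as -mf16c or -march=native):

// Standalone sketch (not part of this commit) of the F16C intrinsics the
// implementation above is built on: 8 halfs <-> 8 floats per instruction.
#include <cstdio>
#include <immintrin.h>

int main() {
  const float src[8] = {0.f, 0.25f, 0.5f, 1.f, 2.f, 4.f, 8.f, 16.f};
  __m256 v = _mm256_loadu_ps(src);

  // Narrow 8 floats to 8 halfs (8 x 16-bit lanes in a __m128i), rounding to
  // nearest even -- the same mode vcvt_f32_f16 uses.
  __m128i h = _mm256_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT);

  // Widen them back; exact here since every input is representable in fp16.
  __m256 w = _mm256_cvtph_ps(h);

  float dst[8];
  _mm256_storeu_ps(dst, w);
  std::printf("%g %g\n", dst[1], dst[7]); // 0.25 16
  return 0;
}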
46 changes: 46 additions & 0 deletions nntrainer/tensor/blas_avx.h
@@ -0,0 +1,46 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2023 Donghyeon Jeong <[email protected]>
*
* @file blas_avx.h
* @date 20 Feb 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Donghyeon Jeong <[email protected]>
* @bug No known bugs except for NYI items
* @brief This is a header for AVX implementation
*
*/

#ifndef __BLAS_AVX_H_
#define __BLAS_AVX_H_
#ifdef __cplusplus

#include <cmath>
#include <immintrin.h>

namespace nntrainer::avx {

/**
* @brief Converts half-precision floating point values to single-precision
* floating point values.
*
* @param[in] N number of elements in input vector
* @param[in] input vector containing 16-bit floating point values
* @param[out] output vector containing single-precision floating point values
*/
void vcvt_f16_f32(size_t N, const void *input, float *output);

/**
* @brief Converts single-precision floating point values to half-precision
* floating point values.
*
* @param[in] N number of elements in input vector
* @param[in] input vector containing single-precision floating point values
* @param[out] output vector containing 16-bit floating point values
*/
void vcvt_f32_f16(size_t N, const float *input, void *output);

} // namespace nntrainer::avx

#endif /* __cplusplus */
#endif /* __BLAS_AVX_H_ */
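For reference, a hedged usage sketch of the two declarations above (assumes a GCC 12+ toolchain with -march=native and blas_avx.h on the include path; the odd length is deliberate so the 16-wide, 8-wide, and remainder paths all run):

// Usage sketch, not part of this commit.
#include <cstddef>
#include <cstdio>
#include <vector>

#include <blas_avx.h>

int main() {
  const size_t n = 19;
  std::vector<_Float16> half(n);
  for (size_t i = 0; i < n; ++i)
    half[i] = static_cast<_Float16>(i) / static_cast<_Float16>(4); // 0, 0.25, 0.5, ...

  // fp16 -> fp32, then back again
  std::vector<float> single(n);
  nntrainer::avx::vcvt_f16_f32(n, half.data(), single.data());

  std::vector<_Float16> round_trip(n);
  nntrainer::avx::vcvt_f32_f16(n, single.data(), round_trip.data());

  // quarters are exactly representable in fp16, so the round trip is lossless
  std::printf("%g -> %g\n", single[18], static_cast<float>(round_trip[18])); // 4.5 -> 4.5
  return 0;
}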
18 changes: 18 additions & 0 deletions nntrainer/tensor/blas_interface.cpp
@@ -19,6 +19,10 @@
 #include <blas_neon.h>
 #endif
 
+#if USE_AVX
+#include <blas_avx.h>
+#endif
+
 #include <cmath>
 
 #define sgemv_loop(ci, cj, cM, cN) \
@@ -182,6 +186,13 @@ static void copy_float32_to_float16(const unsigned int N, const float *X,
     for (unsigned int i = 0; i < N; ++i)
       Y[i * incy] = X[i * incx];
   }
+#elif USE_AVX
+  if (incX == 1 && incY == 1) {
+    nntrainer::avx::vcvt_f32_f16(N, X, Y);
+  } else {
+    for (unsigned int i = 0; i < N; ++i)
+      Y[i * incy] = static_cast<_FP16>(X[i * incx]);
+  }
 #else
   for (unsigned int i = 0; i < N; ++i)
     Y[i * incy] = static_cast<_FP16>(X[i * incx]);
@@ -200,6 +211,13 @@ static void copy_float16_to_float32(const unsigned int N, const _FP16 *X,
     for (unsigned int i = 0; i < N; ++i)
       Y[i * incy] = X[i * incx];
   }
+#elif USE_AVX
+  if (incX == 1 && incY == 1) {
+    nntrainer::avx::vcvt_f16_f32(N, X, Y);
+  } else {
+    for (unsigned int i = 0; i < N; ++i)
+      Y[i * incy] = static_cast<float>(X[i * incx]);
+  }
 #else
   for (unsigned int i = 0; i < N; ++i)
     Y[i * incy] = static_cast<float>(X[i * incx]);
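Both hunks above use the same dispatch: unit-stride data takes the bulk AVX converter, while strided data falls back to the scalar loop. A simplified sketch of the pattern (the identifiers copy_f32_to_f16 and fp16 are illustrative stand-ins, not nntrainer's actual ones):

// Simplified dispatch sketch, not part of this commit.
#include <cstddef>

// declaration as in blas_avx.h
namespace nntrainer::avx {
void vcvt_f32_f16(size_t N, const float *input, void *output);
}

using fp16 = _Float16; // stand-in for nntrainer's _FP16 typedef (assumption)

static void copy_f32_to_f16(size_t N, const float *X, size_t incX, fp16 *Y,
                            size_t incY) {
  if (incX == 1 && incY == 1) {
    nntrainer::avx::vcvt_f32_f16(N, X, Y); // contiguous: one bulk SIMD pass
  } else {
    for (size_t i = 0; i < N; ++i)         // strided: scalar element-wise copy
      Y[i * incY] = static_cast<fp16>(X[i * incX]);
  }
}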
3 changes: 3 additions & 0 deletions nntrainer/tensor/meson.build
@@ -43,6 +43,9 @@ if get_option('enable-fp16')
       tensor_sources += 'blas_neon.cpp'
       tensor_headers += 'blas_neon.h'
     endif
+  elif get_option('enable-avx')
+    tensor_sources += 'blas_avx.cpp'
+    tensor_headers += 'blas_avx.h'
   endif
 endif

