[AVX] Added VCVT (between floating-point and integer) AVX support for x86 architecture #2480

Merged: 1 commit, Feb 27, 2024
meson.build (13 changes: 8 additions & 5 deletions)
@@ -80,13 +80,16 @@ if get_option('enable-fp16')
  elif arch == 'arm'
    error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
  else
-   has_avx512fp16 = cc.has_argument('-mavx512fp16')
-   if (has_avx512fp16)
-     # add_project_arguments(['-mavx512fp16'], language: ['c','cpp'])
-     message ('Float16 for x86_64 enabled. Modern gcc-x64 genrally supports float16 with _Float16. -mavx512fp16 added for hardware acceleration')
+   if cc.version().version_compare('>=12.1.0')
+     message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
      extra_defines += '-DENABLE_FP16=1'
+     if get_option('enable-avx')
+       extra_defines += '-DUSE_AVX=1'
+       add_project_arguments(['-march=native'], language: ['c','cpp'])
+       message('-march=native added for AVX hardware acceleration.')
+     endif
    else
-     warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for AVX512 FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
+     warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
    endif
  endif
endif
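Note: with this change the AVX path is opt-in and nested inside the FP16 path, so both options must be enabled together. An illustrative configure line (hypothetical; exact flags depend on your environment) would be: meson setup build -Denable-fp16=true -Denable-avx=true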
meson_options.txt (1 change: 1 addition)
@@ -39,6 +39,7 @@ option('enable-fp16', type: 'boolean', value: false)
option('enable-cublas', type: 'boolean', value: false)
option('enable-openmp', type: 'boolean', value: true)
option('enable-neon', type: 'boolean', value: false)
+option('enable-avx', type: 'boolean', value: false)

# ml-api dependency (to enable, install capi-inference from github.com/nnstreamer/api )
# To inter-operate with nnstreamer and ML-API packages, you need to enable this.
nntrainer/tensor/blas_avx.cpp (new file: 117 additions)
@@ -0,0 +1,117 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2023 Donghyeon Jeong <[email protected]>
*
* @file blas_avx.cpp
* @date 20 Feb 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Donghyeon Jeong <[email protected]>
* @bug No known bugs except for NYI items
* @brief This is a source for AVX implementation
*
*/

#include <cassert>
#include <chrono>
#include <cstdint>
#include <immintrin.h>

#include <blas_avx.h>

namespace nntrainer::avx {

void vcvt_f16_f32(size_t N, const void *input, float *output) {
assert(N != 0);
assert(input != NULL);
assert(output != NULL);

unsigned int idx = 0;
const _Float16 *data = (const _Float16 *)input;

// 16 half-precision floating point values to single-precision values
for (; N - idx >= 16; idx += 16) {
const __m256 vec0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)data));
const __m256 vec1 =
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(data + 8)));
data += 16;

_mm256_storeu_ps(output, vec0);
_mm256_storeu_ps(output + 8, vec1);
output += 16;
}
// 8 half-precision floating point values to single-precision values
for (; N - idx >= 8; idx += 8) {
const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)data));
data += 8;

_mm256_storeu_ps(output, vec);
output += 8;
}
// remaining half-precision floating point values to single-precision values
if (N - idx > 0 && N - idx < 8) {
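    // note: this loads a full 128-bit vector (8 values) even when fewer than
    // 8 elements remain, so the input buffer must be readable past element N-1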
const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)data));
__m128 s_vec = _mm256_castps256_ps128(vec);
if ((N - idx) & 4) {
_mm_storeu_ps(output, s_vec);
s_vec = _mm256_extractf128_ps(vec, 1);
output += 4;
}
if ((N - idx) & 2) {
_mm_storel_pi((__m64 *)output, s_vec);
s_vec = _mm_movehl_ps(s_vec, s_vec);
output += 2;
}
if ((N - idx) & 1) {
_mm_store_ss(output, s_vec);
}
}
}

void vcvt_f32_f16(size_t N, const float *input, void *output) {
assert(N != 0);
assert(input != NULL);
assert(output != NULL);

unsigned int idx = 0;
_Float16 *out_data = (_Float16 *)output;

// 16 single-precision floating point values to half-precision values
for (; N - idx >= 16; idx += 16) {
const __m256 vec0 = _mm256_loadu_ps(input);
const __m256 vec1 = _mm256_loadu_ps(input + 8);
input += 16;

_mm_storeu_si128((__m128i *)out_data,
_mm256_cvtps_ph(vec0, _MM_FROUND_TO_NEAREST_INT));
_mm_storeu_si128((__m128i *)(out_data + 8),
_mm256_cvtps_ph(vec1, _MM_FROUND_TO_NEAREST_INT));
out_data += 16;
}
// 8 single-precision floating point values to half-precision values
for (; N - idx >= 8; idx += 8) {
const __m256 vec = _mm256_loadu_ps(input);
input += 8;

_mm_storeu_si128((__m128i *)out_data,
_mm256_cvtps_ph(vec, _MM_FROUND_TO_NEAREST_INT));
out_data += 8;
}
// 4 single-precision floating point values to half-precision values
for (; N - idx >= 4; idx += 4) {
const __m128 vec = _mm_loadu_ps(input);
input += 4;

_mm_storeu_si64((__m128i *)out_data,
_mm_cvtps_ph(vec, _MM_FROUND_TO_NEAREST_INT));
out_data += 4;
}
// remaining single-precision floating point values to half-precision values
while (idx < N) {
*out_data = static_cast<_Float16>(*input);
++out_data;
++input;
++idx;
}
}

} // namespace nntrainer::avx
nntrainer/tensor/blas_avx.h (new file: 46 additions)
@@ -0,0 +1,46 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2023 Donghyeon Jeong <[email protected]>
*
* @file blas_avx.h
* @date 20 Feb 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Donghyeon Jeong <[email protected]>
* @bug No known bugs except for NYI items
* @brief This is a header for AVX implementation
*
*/

#ifndef __BLAS_AVX2_H_
#define __BLAS_AVX2_H_
#ifdef __cplusplus

#include <cmath>
#include <cstddef> // for size_t used in the declarations below
#include <immintrin.h>

namespace nntrainer::avx {

/**
* @brief Converts half-precision floating point values to single-precision
* floating point values.
*
* @param[in] N number of elements in input vector
* @param[in] input vector containing 16-bit floating point values
* @param[out] output vector containing single-precision floating point values.
*/
void vcvt_f16_f32(size_t N, const void *input, float *output);

/**
* @brief Converts single-precision floating point values to half-precision
* floating point values.
*
* @param[in] N number of elements in input vector
* @param[in] input vector containing single-precision floating point values
* @param[out] output vector containing 16-bit floating point values
*/
void vcvt_f32_f16(size_t N, const float *input, void *output);

} // namespace nntrainer::avx

#endif /* __cplusplus */
#endif /* __BLAS_AVX2_H_ */
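For reference, a minimal usage sketch of the two converters declared above (a hypothetical test program, not part of this PR; it assumes GCC 12+ on x86_64 built with -march=native, as the meson.build change adds):

// Hypothetical round-trip check; only the nntrainer::avx function names come
// from this PR, everything else is illustrative.
#include <cstdio>
#include <vector>

#include <blas_avx.h>

int main() {
  const size_t N = 19; // not a multiple of 8, to exercise the tail paths
  // Pad the fp16 buffer: vcvt_f16_f32 loads full 128-bit vectors in its tail
  // handling, so its input must stay readable slightly past N elements.
  std::vector<float> src(N), dst(N);
  std::vector<_Float16> half(((N + 7) / 8) * 8);

  for (size_t i = 0; i < N; ++i)
    src[i] = 0.5f * static_cast<float>(i); // exactly representable in fp16

  nntrainer::avx::vcvt_f32_f16(N, src.data(), half.data());
  nntrainer::avx::vcvt_f16_f32(N, half.data(), dst.data());

  for (size_t i = 0; i < N; ++i)
    if (src[i] != dst[i])
      std::printf("mismatch at %zu: %f vs %f\n", i, src[i], dst[i]);
  return 0;
}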
nntrainer/tensor/blas_interface.cpp (18 additions)
@@ -19,6 +19,10 @@
#include <blas_neon.h>
#endif

+#if USE_AVX
+#include <blas_avx.h>
+#endif
+
#include <cmath>

#define sgemv_loop(ci, cj, cM, cN) \
@@ -182,6 +186,13 @@ static void copy_float32_to_float16(const unsigned int N, const float *X,
    for (unsigned int i = 0; i < N; ++i)
      Y[i * incy] = X[i * incx];
  }
+#elif USE_AVX
+  if (incX == 1 && incY == 1) {
+    nntrainer::avx::vcvt_f32_f16(N, X, Y);
+  } else {
+    for (unsigned int i = 0; i < N; ++i)
+      Y[i * incy] = static_cast<_FP16>(X[i * incx]);
+  }
#else
  for (unsigned int i = 0; i < N; ++i)
    Y[i * incy] = static_cast<_FP16>(X[i * incx]);
Member:

This is not for this PR or this month, but for later versions (you are accruing technical debt here):

- maintain the same code in general cpp files across NEON/AVX/None.
- centralize architectural dependencies to a corresponding header and its implementation only.

You may have a single header for all three cases and choose a corresponding implementation (.c/.cpp file) at build time, too.

Member:

Try the better approach from the next implementation (and fix this one when you are ready).

Member:

You mean something like:

// new file : blas_raw.cpp
...
void foo( ... ) {
  ...
  } else {
    for (unsigned int i = 0; i < N; ++i)
      Y[i * incy] = static_cast<_FP16>(X[i * incx]);
  }
  ...
}
...

and use it for all of blas_interface.cpp, blas_neon.cpp, and blas_avx.cpp?

Contributor Author:

Thank you for bringing this up! I will keep it in mind for future improvement.

Member (@myungjoo, Feb 23, 2024):

Or, you may have a base class as a set of operators/functions/methods for general CPU operations compatible with ALL systems via virtual functions, and a derived class as a set of operators/functions/methods for architecture-dependent (SIMD) operations overriding the virtual functions.

Then, if you don't have an implementation in the derived class (SIMD), the fallback method in the base class will be automatically chosen. If you do have an implementation in the derived class and the class is chosen at run time/build time, it will be automatically chosen WITHOUT having #if/#endif or even general if statements. You can control that in the base class initializer or the "get_instance" method of the singleton design.

Because you still have a tight deadline for the release, don't start refactoring yet. Try to refactor when you have completed the imminent release or when you start writing a new class or function.
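For reference, a minimal sketch of the dispatch design described above (all class and method names here are hypothetical, not code from this PR or its tree):

// Sketch only, assuming hypothetical names. Generic fallbacks live in the
// base class; a SIMD-capable derived class overrides what it accelerates;
// get_instance() picks one implementation, so callers need no #if/#endif.
#include <cstddef>

class BlasBackend {
public:
  virtual ~BlasBackend() = default;
  // generic fallback, compatible with all systems
  virtual void vcvt_f32_f16(size_t N, const float *in, void *out) const {
    _Float16 *o = static_cast<_Float16 *>(out);
    for (size_t i = 0; i < N; ++i)
      o[i] = static_cast<_Float16>(in[i]);
  }
  static const BlasBackend &get_instance();
};

class BlasBackendAvx : public BlasBackend {
public:
  void vcvt_f32_f16(size_t N, const float *in, void *out) const override {
    // in-tree this would forward to nntrainer::avx::vcvt_f32_f16(N, in, out);
    // the base version stands in here to keep the sketch self-contained
    BlasBackend::vcvt_f32_f16(N, in, out);
  }
};

const BlasBackend &BlasBackend::get_instance() {
#ifdef USE_AVX
  static const BlasBackendAvx backend; // selected at build time (or via cpuid)
#else
  static const BlasBackend backend;
#endif
  return backend;
}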

Member:

Such refactoring needs planning and design before implementation. Don't just start without enough discussion; I don't want this refactoring effort to hinder the imminent release.

Anyway, for your reference: #2482

Member:

The concept suggestion: https://github.com/myungjoo/nntrainer/tree/suggestion/refactoring/archdep (this is not an actual PR)

Contributor Author:

Both approaches seem great. I'll also research how other frameworks manage this. Let's keep the discussion going.

@@ -200,6 +211,13 @@ static void copy_float16_to_float32(const unsigned int N, const _FP16 *X,
    for (unsigned int i = 0; i < N; ++i)
      Y[i * incy] = X[i * incx];
  }
+#elif USE_AVX
+  if (incX == 1 && incY == 1) {
+    nntrainer::avx::vcvt_f16_f32(N, X, Y);
+  } else {
+    for (unsigned int i = 0; i < N; ++i)
+      Y[i * incy] = static_cast<float>(X[i * incx]);
+  }
#else
  for (unsigned int i = 0; i < N; ++i)
    Y[i * incy] = static_cast<float>(X[i * incx]);
nntrainer/tensor/meson.build (3 additions)
@@ -43,6 +43,9 @@ if get_option('enable-fp16')
      tensor_sources += 'blas_neon.cpp'
      tensor_headers += 'blas_neon.h'
    endif
+  elif get_option('enable-avx')
+    tensor_sources += 'blas_avx.cpp'
+    tensor_headers += 'blas_avx.h'
  endif
endif
