Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Elbrus (e2k) architecture support #700

Open
wants to merge 14 commits into
base: wip/e2k
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,16 @@ just use search and replace, manual changes are required pretty
infrequently.

For best performance, in addition to `-O3` (or whatever your compiler's
equivalent is), you should enable OpenMP 4 SIMD support by defining
`SIMDE_ENABLE_OPENMP` before including any SIMDe headers, and
enabling OpenMP support in your compiler. GCC and ICC both support a
flag to enable only OpenMP SIMD support instead of full OpenMP (the OpenMP
SIMD support doesn't require the OpenMP run-time library); for GCC the
flag is `-fopenmp-simd` (requires GCC version 4.9 or later), for ICC
the flag is `-qopenmp-simd`. SIMDe also supports
equivalent is), you should enable OpenMP 4 SIMD support in your compiler.
GCC and ICC both support a flag to enable only OpenMP SIMD support instead
of full OpenMP (the OpenMP SIMD support doesn't require the OpenMP run-time
library); for GCC the flag is `-fopenmp-simd` (requires GCC version 4.9
or later), for ICC the flag is `-qopenmp-simd`. Some compilers have this
support implicitly enabled (the example is MCST Elbrus Compiler, or LCC
for short). If for some reason you need to disable OpenMP support,
you need to define `SIMDE_DISABLE_OPENMP` before including any of SIMDe
headers, and probably disable its support in compiler, if it is implicitly
enabled (for LCC, there is `-fno-openmp` flag). SIMDe also supports
using [Cilk Plus](https://www.cilkplus.org/), [GCC loop-specific
pragmas](https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html),
or [clang pragma loop hint
Expand Down
9 changes: 0 additions & 9 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,6 @@ project('SIMDe', 'c', 'cpp',
cc = meson.get_compiler('c')
cxx = meson.get_compiler('cpp')

foreach additional_arg : [ '-Wno-reduced-alignment' ]
if cc.has_argument(additional_arg)
add_global_arguments(additional_arg, language : 'c')
endif
if cxx.has_argument(additional_arg)
add_global_arguments(additional_arg, language : 'cpp')
endif
endforeach

simde_neon_families = [
'aba',
'abd',
Expand Down
2 changes: 1 addition & 1 deletion simde/arm/neon/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ SIMDE_ARM_NEON_TYPE_FLOAT_DEFINE_(64, 2, SIMDE_ALIGN_16_)
#define SIMDE_ARM_NEON_NEED_PORTABLE_F64X1XN
#define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2XN
#endif
#elif defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
#elif defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) || defined(SIMDE_ARCH_E2K)
#define SIMDE_ARM_NEON_NEED_PORTABLE_F32
#define SIMDE_ARM_NEON_NEED_PORTABLE_F64

Expand Down
19 changes: 18 additions & 1 deletion simde/simde-align.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
#define SIMDE_ALIGN_OF(Type) alignof(Type)
#elif \
HEDLEY_GCC_VERSION_CHECK(2,95,0) || \
HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \
Expand Down Expand Up @@ -264,6 +265,7 @@
#if \
HEDLEY_HAS_ATTRIBUTE(aligned) || \
HEDLEY_GCC_VERSION_CHECK(2,95,0) || \
HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
HEDLEY_CRAY_VERSION_CHECK(8,4,0) || \
HEDLEY_IBM_VERSION_CHECK(11,1,0) || \
HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
Expand Down Expand Up @@ -314,7 +316,8 @@
*/
#if \
HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) || \
HEDLEY_GCC_VERSION_CHECK(4,7,0)
HEDLEY_GCC_VERSION_CHECK(4,7,0) || \
HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) \
HEDLEY_REINTERPRET_CAST(__typeof__(Pointer), __builtin_assume_aligned(HEDLEY_CONST_CAST(void*, HEDLEY_REINTERPRET_CAST(const void*, Pointer)), Alignment))
#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0)
Expand Down Expand Up @@ -446,4 +449,18 @@
*/
#define SIMDE_ALIGN_ASSUME_CAST(Type, Pointer) SIMDE_ALIGN_ASSUME_LIKE(SIMDE_ALIGN_CAST(Type, Pointer), Type)

/* In some circumstances we need to define types as packed,
* because platform alignment is fixed when variable is placed
* on the stack (but this alignment is variable otherwise).
* so there's SIMDE_ALIGN_REDUCE_STRUCT macros that allows this
* for LCC compiler on Elbrus architecture.
*/
#if defined (SIMDE_BUG_LCC_STACK_ALIGNMENT_CAP)
#define SIMDE_ALIGN_REDUCE_STRUCT __attribute__((packed, aligned(16)))
#define SIMDE_ALIGN_REDUCE_ARRAY __attribute__((aligned(16)))
#else
#define SIMDE_ALIGN_REDUCE_STRUCT
#define SIMDE_ALIGN_REDUCE_ARRAY
#endif

#endif /* !defined(SIMDE_ALIGN_H) */
25 changes: 20 additions & 5 deletions simde/simde-arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,13 +182,28 @@
<https://en.wikipedia.org/wiki/Elbrus-8S> */
#if defined(__e2k__)
#define SIMDE_ARCH_E2K
#define SIMDE_SKIP_EXTENDED_E2K_VECTOR_OPS /* Discard features unsupported by compiler */
#endif

/* Discard features unsupported by Elbrus compiler.
For lcc > 1.25.10, it may be based on a version. */
#if defined(__LCC__)
#define SIMDE_SKIP_EXTENDED_E2K_VECTOR_OPS
#define SIMDE_BUG_LCC_TOO_STRICT_VECTOR_SHIFTS_AND_COMPARES
#define SIMDE_BUG_LCC_XOP_MISSING
#define SIMDE_BUG_LCC_WARNING_ON_SHIFTS
#define SIMDE_BUG_LCC_FMA_WRONG_RESULT
#define SIMDE_BUG_LCC_AVX_NO_LOAD_STORE_U2
#define SIMDE_BUG_LCC_STACK_ALIGNMENT_CAP

/* Some native functions on E2K with instruction set < v6
are declared as deprecated due to inefficiency.
Still they are more efficient than SIMDe implementation.
So we're using them, and switching off these deprecation warnings. */
#define SIMDE_BUG_PCLMUL_XOP_DEPRECATED
#define SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS _Pragma("diag_suppress 1215,1444")
#define SIMDE_LCC_REVERT_DEPRECATED_WARNINGS _Pragma("diag_default 1215,1444")
#else
#define SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS
#define SIMDE_LCC_REVERT_DEPRECATED_WARNINGS
#endif

makise-homura marked this conversation as resolved.
Show resolved Hide resolved
makise-homura marked this conversation as resolved.
Show resolved Hide resolved
/* HP/PA / PA-RISC
Expand Down Expand Up @@ -267,7 +282,7 @@
# if defined(__SSE4_2__)
# define SIMDE_ARCH_X86_SSE4_2 1
# endif
# if defined(__XOP__) && !defined(__LCC__) /* LCC incorrectly defines __XOP__ */
# if defined(__XOP__)
# define SIMDE_ARCH_X86_XOP 1
# endif
# if defined(__AVX__)
Expand All @@ -285,7 +300,7 @@
# if defined(__AVX2__)
# define SIMDE_ARCH_X86_AVX2 1
# endif
# if defined(__FMA__) && !defined(__LCC__) /* LCC incorrectly defines __FMA__ */
# if defined(__FMA__)
# define SIMDE_ARCH_X86_FMA 1
# if !defined(SIMDE_ARCH_X86_AVX)
# define SIMDE_ARCH_X86_AVX 1
Expand Down Expand Up @@ -315,7 +330,7 @@
# if defined(__GFNI__)
# define SIMDE_ARCH_X86_GFNI 1
# endif
# if defined(__PCLMUL__) && !defined(SIMDE_ARCH_E2K) /* E2K has inefficient implementation of PCLMUL */
# if defined(__PCLMUL__)
# define SIMDE_ARCH_X86_PCLMUL 1
# endif
# if defined(__VPCLMULQDQ__)
Expand Down
12 changes: 9 additions & 3 deletions simde/simde-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,14 @@
# endif
#endif

#if !defined(SIMDE_ENABLE_OPENMP) && ((defined(_OPENMP) && (_OPENMP >= 201307L)) || (defined(_OPENMP_SIMD) && (_OPENMP_SIMD >= 201307L)))
# define SIMDE_ENABLE_OPENMP
#if !defined(SIMDE_DISABLE_OPENMP)
#if !defined(SIMDE_ENABLE_OPENMP) && ((defined(_OPENMP) && (_OPENMP >= 201307L)) || (defined(_OPENMP_SIMD) && (_OPENMP_SIMD >= 201307L)))
# define SIMDE_ENABLE_OPENMP
#endif
#else
#if defined(SIMDE_ENABLE_OPENMP)
# undef SIMDE_ENABLE_OPENMP
#endif
#endif

#if !defined(SIMDE_ENABLE_CILKPLUS) && (defined(__cilk) || defined(HEDLEY_INTEL_VERSION))
Expand Down Expand Up @@ -322,7 +328,7 @@
# define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l))
# define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
# define SIMDE_VECTORIZE_ALIGNED(a)
#elif HEDLEY_GCC_VERSION_CHECK(4,9,0) && !defined(__LCC__)
#elif HEDLEY_GCC_VERSION_CHECK(4,9,0)
# define SIMDE_VECTORIZE HEDLEY_PRAGMA(GCC ivdep)
# define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE
# define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
Expand Down
35 changes: 24 additions & 11 deletions simde/x86/avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ typedef union {
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2];
#endif
#endif
} simde__m256_private;
} SIMDE_ALIGN_REDUCE_STRUCT simde__m256_private;

typedef union {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
Expand Down Expand Up @@ -149,7 +149,7 @@ typedef union {
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2];
#endif
#endif
} simde__m256d_private;
} SIMDE_ALIGN_REDUCE_STRUCT simde__m256d_private;

typedef union {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
Expand Down Expand Up @@ -207,7 +207,7 @@ typedef union {
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2];
#endif
#endif
} simde__m256i_private;
} SIMDE_ALIGN_REDUCE_STRUCT simde__m256i_private;

#if defined(SIMDE_X86_AVX_NATIVE)
typedef __m256 simde__m256;
Expand Down Expand Up @@ -3935,7 +3935,7 @@ simde_mm256_loadu_si256 (void const * mem_addr) {
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_loadu2_m128 (const float hiaddr[HEDLEY_ARRAY_PARAM(4)], const float loaddr[HEDLEY_ARRAY_PARAM(4)]) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341)
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_LCC_AVX_NO_LOAD_STORE_U2)
return _mm256_loadu2_m128(hiaddr, loaddr);
#else
return
Expand All @@ -3951,7 +3951,7 @@ simde_mm256_loadu2_m128 (const float hiaddr[HEDLEY_ARRAY_PARAM(4)], const float
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_loadu2_m128d (const double hiaddr[HEDLEY_ARRAY_PARAM(2)], const double loaddr[HEDLEY_ARRAY_PARAM(2)]) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341)
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_LCC_AVX_NO_LOAD_STORE_U2)
return _mm256_loadu2_m128d(hiaddr, loaddr);
#else
return
Expand All @@ -3967,7 +3967,7 @@ simde_mm256_loadu2_m128d (const double hiaddr[HEDLEY_ARRAY_PARAM(2)], const doub
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_loadu2_m128i (const simde__m128i* hiaddr, const simde__m128i* loaddr) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341)
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_LCC_AVX_NO_LOAD_STORE_U2)
return _mm256_loadu2_m128i(hiaddr, loaddr);
#else
return
Expand Down Expand Up @@ -4547,8 +4547,21 @@ simde_mm256_permute_ps (simde__m256 a, const int imm8)
return simde__m256_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
#if defined(SIMDE_BUG_LCC_STACK_ALIGNMENT_CAP)
/* Patched implementation from e2kbuiltin.h */
# define simde_mm256_permute_ps(a, imm8) ({ \
type_union_256 __attribute__((aligned(16))) __s, __dst; \
__s.__v8sf = (__v8sf)(a); \
SELECT_CONST_32F (__s.l.l1, __s.l.l0, __dst.l.l0, (imm8)); \
SELECT_CONST_32F (__s.l.l1, __s.l.l0, __dst.l.l1, (imm8) >> 4); \
SELECT_CONST_32F (__s.l.l3, __s.l.l2, __dst.l.l2, (imm8)); \
SELECT_CONST_32F (__s.l.l3, __s.l.l2, __dst.l.l3, (imm8) >> 4); \
(__m256)(__dst.__v8sf); \
})
#else
# define simde_mm256_permute_ps(a, imm8) _mm256_permute_ps(a, imm8)
#endif
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_permute_ps
#define _mm256_permute_ps(a, imm8) simde_mm256_permute_ps(a, imm8)
Expand Down Expand Up @@ -5023,9 +5036,9 @@ simde_mm256_shuffle_ps (simde__m256 a, simde__m256 b, const int imm8)

return simde__m256_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_LCC_STACK_ALIGNMENT_CAP)
#define simde_mm256_shuffle_ps(a, b, imm8) _mm256_shuffle_ps(a, b, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) || defined(SIMDE_BUG_LCC_STACK_ALIGNMENT_CAP)
#define simde_mm256_shuffle_ps(a, b, imm8) \
simde_mm256_set_m128( \
simde_mm_shuffle_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), (imm8)), \
Expand Down Expand Up @@ -5230,7 +5243,7 @@ simde_mm256_storeu_si256 (void* mem_addr, simde__m256i a) {
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_storeu2_m128 (simde_float32 hi_addr[4], simde_float32 lo_addr[4], simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341)
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_LCC_AVX_NO_LOAD_STORE_U2)
_mm256_storeu2_m128(hi_addr, lo_addr, a);
#else
simde_mm_storeu_ps(lo_addr, simde_mm256_castps256_ps128(a));
Expand All @@ -5245,7 +5258,7 @@ simde_mm256_storeu2_m128 (simde_float32 hi_addr[4], simde_float32 lo_addr[4], si
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_storeu2_m128d (simde_float64 hi_addr[2], simde_float64 lo_addr[2], simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341)
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_LCC_AVX_NO_LOAD_STORE_U2)
_mm256_storeu2_m128d(hi_addr, lo_addr, a);
#else
simde_mm_storeu_pd(lo_addr, simde_mm256_castpd256_pd128(a));
Expand All @@ -5260,7 +5273,7 @@ simde_mm256_storeu2_m128d (simde_float64 hi_addr[2], simde_float64 lo_addr[2], s
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_storeu2_m128i (simde__m128i* hi_addr, simde__m128i* lo_addr, simde__m256i a) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341)
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_LCC_AVX_NO_LOAD_STORE_U2)
_mm256_storeu2_m128i(hi_addr, lo_addr, a);
#else
simde_mm_storeu_si128(lo_addr, simde_mm256_castsi256_si128(a));
Expand Down
Loading