neon/ext: add _mm_alignr_{,e}pi8 implementations
This could be good for compilers which don't support
__builtin_shufflevector or __builtin_shuffle (like MSVC).
nemequ committed Feb 5, 2021
1 parent 12069d7 commit 6d28f04
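
For reference, vextq_u8(a, b, n) returns the 16 bytes starting at offset n of the concatenation of a and b, which is exactly what SSSE3's PALIGNR (_mm_alignr_epi8) computes, just with the operand order reversed. Below is a minimal standalone sketch of that equivalence using plain SSSE3 intrinsics; it is illustration only, not code from this commit (the 64-bit d-register paths use the analogous _mm_alignr_pi8 on __m64 values).

```c
/* Illustration only (not part of the commit): shows why
 * _mm_alignr_epi8(b, a, n) matches vextq_u8(a, b, n).
 * Build with SSSE3 enabled, e.g. `cc -mssse3 example.c`. */
#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h> /* SSSE3: _mm_alignr_epi8 */

int main(void) {
  uint8_t a[16], b[16], r[16];
  for (int i = 0; i < 16; i++) {
    a[i] = (uint8_t) i;        /* 0 .. 15  */
    b[i] = (uint8_t) (i + 16); /* 16 .. 31 */
  }

  __m128i va = _mm_loadu_si128((const __m128i *) a);
  __m128i vb = _mm_loadu_si128((const __m128i *) b);

  /* vextq_u8(a, b, 3) yields { a[3..15], b[0..2] }.  PALIGNR concatenates
   * its first operand above its second and shifts the pair right by the
   * byte count, so the operands are passed in the opposite order. */
  __m128i vr = _mm_alignr_epi8(vb, va, 3);
  _mm_storeu_si128((__m128i *) r, vr);

  for (int i = 0; i < 16; i++)
    printf("%u ", r[i]); /* 3 4 5 ... 15 16 17 18 */
  printf("\n");
  return 0;
}
```

Compiled with SSSE3 enabled, this prints 3 through 18, matching vextq_u8(a, b, 3).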
Showing 2 changed files with 63 additions and 20 deletions.
simde/arm/neon/ext.h: 80 changes (60 additions, 20 deletions)
@@ -54,7 +54,9 @@ simde_vext_f32(simde_float32x2_t a, simde_float32x2_t b, const int n)
return simde_float32x2_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_f32(a, b, n) _mm_alignr_pi8(b, a, n * sizeof(simde_float32))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_f32(a, b, n) (__extension__ ({ \
simde_float32x2_t simde_vext_f32_r; \
if (!__builtin_constant_p(n)) { \
@@ -94,7 +96,9 @@ simde_vext_f64(simde_float64x1_t a, simde_float64x1_t b, const int n)
return simde_float64x1_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_f64(a, b, n) _mm_alignr_pi8(b, a, n * sizeof(simde_float64))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_f64(a, b, n) (__extension__ ({ \
simde_float64x1_t simde_vext_f64_r; \
if (!__builtin_constant_p(n)) { \
@@ -135,7 +139,9 @@ simde_vext_s8(simde_int8x8_t a, simde_int8x8_t b, const int n)
return simde_int8x8_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_s8(a, b, n) _mm_alignr_pi8(b, a, n * sizeof(int8_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_s8(a, b, n) (__extension__ ({ \
simde_int8x8_t simde_vext_s8_r; \
if (!__builtin_constant_p(n)) { \
@@ -179,7 +185,9 @@ simde_vext_s16(simde_int16x4_t a, simde_int16x4_t b, const int n)
return simde_int16x4_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_s16(a, b, n) _mm_alignr_pi8(b, a, n * sizeof(int16_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_s16(a, b, n) (__extension__ ({ \
simde_int16x4_t simde_vext_s16_r; \
if (!__builtin_constant_p(n)) { \
@@ -221,7 +229,9 @@ simde_vext_s32(simde_int32x2_t a, simde_int32x2_t b, const int n)
return simde_int32x2_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_s32(a, b, n) _mm_alignr_pi8(b, a, n * sizeof(int32_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_s32(a, b, n) (__extension__ ({ \
simde_int32x2_t simde_vext_s32_r; \
if (!__builtin_constant_p(n)) { \
@@ -261,7 +271,9 @@ simde_vext_s64(simde_int64x1_t a, simde_int64x1_t b, const int n)
return simde_int64x1_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_s64(a, b, n) _mm_alignr_pi8(b, a, n * sizeof(int64_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_s64(a, b, n) (__extension__ ({ \
simde_int64x1_t simde_vext_s64_r; \
if (!__builtin_constant_p(n)) { \
@@ -302,7 +314,9 @@ simde_vext_u8(simde_uint8x8_t a, simde_uint8x8_t b, const int n)
return simde_uint8x8_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_u8(a, b, n) _mm_alignr_pi8(b, a, n * sizeof(uint8_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_u8(a, b, n) (__extension__ ({ \
simde_uint8x8_t simde_vext_u8_r; \
if (!__builtin_constant_p(n)) { \
@@ -346,7 +360,9 @@ simde_vext_u16(simde_uint16x4_t a, simde_uint16x4_t b, const int n)
return simde_uint16x4_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_u16(a, b, n) _mm_alignr_pi8(b, a, n * sizeof(uint16_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_u16(a, b, n) (__extension__ ({ \
simde_uint16x4_t simde_vext_u16_r; \
if (!__builtin_constant_p(n)) { \
@@ -388,7 +404,9 @@ simde_vext_u32(simde_uint32x2_t a, simde_uint32x2_t b, const int n)
return simde_uint32x2_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_u32(a, b, n) _mm_alignr_pi8(b, a, n * sizeof(uint32_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_u32(a, b, n) (__extension__ ({ \
simde_uint32x2_t simde_vext_u32_r; \
if (!__builtin_constant_p(n)) { \
@@ -428,7 +446,9 @@ simde_vext_u64(simde_uint64x1_t a, simde_uint64x1_t b, const int n)
return simde_uint64x1_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_u64(a, b, n) _mm_alignr_pi8(b, a, n * sizeof(uint64_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_u64(a, b, n) (__extension__ ({ \
simde_uint64x1_t simde_vext_u64_r; \
if (!__builtin_constant_p(n)) { \
@@ -469,7 +489,9 @@ simde_vextq_f32(simde_float32x4_t a, simde_float32x4_t b, const int n)
return simde_float32x4_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_f32(a, b, n) _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), n * sizeof(simde_float32)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_f32(a, b, n) (__extension__ ({ \
simde_float32x4_t simde_vextq_f32_r; \
if (!__builtin_constant_p(n)) { \
@@ -511,7 +533,9 @@ simde_vextq_f64(simde_float64x2_t a, simde_float64x2_t b, const int n)
return simde_float64x2_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_f64(a, b, n) _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(b), _mm_castpd_si128(a), n * sizeof(simde_float64)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_f64(a, b, n) (__extension__ ({ \
simde_float64x2_t simde_vextq_f64_r; \
if (!__builtin_constant_p(n)) { \
@@ -552,7 +576,9 @@ simde_vextq_s8(simde_int8x16_t a, simde_int8x16_t b, const int n)
return simde_int8x16_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_s8(a, b, n) _mm_alignr_epi8(b, a, n * sizeof(int8_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_s8(a, b, n) (__extension__ ({ \
simde_int8x16_t simde_vextq_s8_r; \
if (!__builtin_constant_p(n)) { \
@@ -600,7 +626,9 @@ simde_vextq_s16(simde_int16x8_t a, simde_int16x8_t b, const int n)
return simde_int16x8_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_s16(a, b, n) _mm_alignr_epi8(b, a, n * sizeof(int16_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_s16(a, b, n) (__extension__ ({ \
simde_int16x8_t simde_vextq_s16_r; \
if (!__builtin_constant_p(n)) { \
@@ -644,7 +672,9 @@ simde_vextq_s32(simde_int32x4_t a, simde_int32x4_t b, const int n)
return simde_int32x4_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_s32(a, b, n) _mm_alignr_epi8(b, a, n * sizeof(int32_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_s32(a, b, n) (__extension__ ({ \
simde_int32x4_t simde_vextq_s32_r; \
if (!__builtin_constant_p(n)) { \
@@ -686,7 +716,9 @@ simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n)
return simde_int64x2_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_s64(a, b, n) _mm_alignr_epi8(b, a, n * sizeof(int64_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_s64(a, b, n) (__extension__ ({ \
simde_int64x2_t simde_vextq_s64_r; \
if (!__builtin_constant_p(n)) { \
@@ -727,7 +759,9 @@ simde_vextq_u8(simde_uint8x16_t a, simde_uint8x16_t b, const int n)
return simde_uint8x16_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_u8(a, b, n) _mm_alignr_epi8(b, a, n * sizeof(uint8_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_u8(a, b, n) (__extension__ ({ \
simde_uint8x16_t simde_vextq_u8_r; \
if (!__builtin_constant_p(n)) { \
@@ -775,7 +809,9 @@ simde_vextq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int n)
return simde_uint16x8_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_u16(a, b, n) _mm_alignr_epi8(b, a, n * sizeof(uint16_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_u16(a, b, n) (__extension__ ({ \
simde_uint16x8_t simde_vextq_u16_r; \
if (!__builtin_constant_p(n)) { \
@@ -819,7 +855,9 @@ simde_vextq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int n)
return simde_uint32x4_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_u32(a, b, n) _mm_alignr_epi8(b, a, n * sizeof(uint32_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_u32(a, b, n) (__extension__ ({ \
simde_uint32x4_t simde_vextq_u32_r; \
if (!__builtin_constant_p(n)) { \
@@ -861,7 +899,9 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n)
return simde_uint64x2_from_private(r_);
#endif
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_u64(a, b, n) _mm_alignr_epi8(b, a, n * sizeof(uint64_t))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_u64(a, b, n) (__extension__ ({ \
simde_uint64x2_t simde_vextq_u64_r; \
if (!__builtin_constant_p(n)) { \
simde/simde-common.h: 3 changes (3 additions, 0 deletions)
@@ -781,6 +781,9 @@ HEDLEY_DIAGNOSTIC_POP
# if !HEDLEY_GCC_VERSION_CHECK(5,0,0)
# define SIMDE_BUG_GCC_BAD_MM_SRA_EPI32 /* TODO: find relevant bug or commit */
# endif
# if !HEDLEY_GCC_VERSION_CHECK(6,0,0)
# define SIMDE_BUG_GCC_SIZEOF_IMMEDIATE
# endif
# if !HEDLEY_GCC_VERSION_CHECK(4,6,0)
# define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */
# endif
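
The simde-common.h hunk defines SIMDE_BUG_GCC_SIZEOF_IMMEDIATE for GCC releases before 6.0, and the new _mm_alignr_* fast paths above are skipped whenever it is set. The name suggests those GCC versions reject an expression such as n * sizeof(int16_t) as the integer constant that the intrinsic's immediate operand requires. A hypothetical sketch of the guarded construct follows; the macro and function names are made up for illustration and are not from the commit.

```c
/* Hypothetical sketch, not from the commit: the shape of call the new
 * SIMDE_BUG_GCC_SIZEOF_IMMEDIATE guard keeps away from older GCC.  The
 * third argument to _mm_alignr_epi8 must be an integer constant
 * expression; the concern is whether `(n) * sizeof(int16_t)` still
 * qualifies once `n` arrives through a macro parameter. */
#include <tmmintrin.h> /* SSSE3 */

#define MY_EXT_S16(a, b, n) _mm_alignr_epi8((b), (a), (n) * sizeof(int16_t))

__m128i shift_lanes_by_three(__m128i a, __m128i b) {
  return MY_EXT_S16(a, b, 3); /* _mm_alignr_epi8(b, a, 3 * sizeof(int16_t)) */
}
```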
