diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp
index 709497ec2..c8038334a 100644
--- a/include/xsimd/arch/common/xsimd_common_memory.hpp
+++ b/include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -223,7 +223,8 @@ namespace xsimd
         template <size_t I, class A, class T>
         XSIMD_INLINE typename batch<std::complex<T>, A>::value_type get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<common>) noexcept
         {
-            alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
+            using value_type = typename batch<std::complex<T>, A>::value_type;
+            alignas(A::alignment()) value_type buffer[batch<std::complex<T>, A>::size];
             self.store_aligned(&buffer[0]);
             return buffer[I];
         }
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 441371643..841e904a6 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -748,6 +748,80 @@ namespace xsimd
             return self - batch<T, A>(mask.data);
         }
 
+        // first (must precede get for two-phase lookup)
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_cvtss_f32(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_cvtsd_f64(self);
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFFFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, sse4_2> low = _mm256_castsi256_si128(self);
+                return first(low, sse4_2 {});
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        // get
+        template <size_t I, class A>
+        XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
+            constexpr size_t elements_per_lane = batch<float, sse4_1>::size;
+            constexpr size_t lane = I / elements_per_lane;
+            constexpr size_t sub_index = I % elements_per_lane;
+            const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
+            return kernel::get(batch<float, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
+        }
+
+        template <size_t I, class A>
+        XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
+            constexpr size_t elements_per_lane = batch<double, sse4_1>::size;
+            constexpr size_t lane = I / elements_per_lane;
+            constexpr size_t sub_index = I % elements_per_lane;
+            const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
+            return kernel::get(batch<double, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
+        }
+
+        template <size_t I, class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
+            constexpr size_t elements_per_lane = batch<T, sse4_1>::size;
+            constexpr size_t lane = I / elements_per_lane;
+            constexpr size_t sub_index = I % elements_per_lane;
+            const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
+            return kernel::get(batch<T, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
+        }
+
         // insert
         template <class A, class T, size_t I, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
@@ -2015,46 +2089,6 @@ namespace xsimd
             return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
         }
 
-        // first
-        template <class A>
-        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
-        {
-            return _mm256_cvtss_f32(self);
-        }
-
-        template <class A>
-        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
-        {
-            return _mm256_cvtsd_f64(self);
-        }
-
-        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
-        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
-        {
-            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-            {
-                return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-            {
-                return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFFFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-            {
-                return static_cast<T>(_mm256_cvtsi256_si32(self));
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-            {
-                batch<T, sse4_2> low = _mm256_castsi256_si128(self);
-                return first(low, sse4_2 {});
-            }
-            else
-            {
-                assert(false && "unsupported arch/op combination");
-                return {};
-            }
-        }
-
         // widen
         template <class A, class T>
         XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index fe8d33d99..8dd085ce3 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -1346,6 +1346,97 @@ namespace xsimd
             }
         }
 
+        // first (must precede get for two-phase lookup)
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_cvtss_f32(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_cvtsd_f64(self);
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, sse4_2> low = _mm512_castsi512_si128(self);
+                return first(low, sse4_2 {});
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        // get: use valignd/valignq to rotate lane I into position 0 in a single op.
+        template <size_t I, class A>
+        XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, avx512f {});
+            }
+            const auto rotated = _mm512_alignr_epi32(_mm512_castps_si512(self), _mm512_castps_si512(self), I);
+            return _mm_cvtss_f32(_mm512_castps512_ps128(_mm512_castsi512_ps(rotated)));
+        }
+
+        template <size_t I, class A>
+        XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, avx512f {});
+            }
+            const auto rotated = _mm512_alignr_epi64(_mm512_castpd_si512(self), _mm512_castpd_si512(self), I);
+            return _mm_cvtsd_f64(_mm512_castpd512_pd128(_mm512_castsi512_pd(rotated)));
+        }
+
+        template <size_t I, class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, avx512f {});
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                const auto rotated = _mm512_alignr_epi32(self, self, I);
+                return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                const auto rotated = _mm512_alignr_epi64(self, self, I);
+                return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
+            }
+            else
+            {
+                // 8/16-bit lanes have no sub-dword rotate in AVX-512F; delegate to AVX halves.
+                constexpr size_t elements_per_lane = batch<T, avx>::size;
+                constexpr size_t lane = I / elements_per_lane;
+                constexpr size_t sub_index = I % elements_per_lane;
+                const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
+                return kernel::get(batch<T, avx>(half), ::xsimd::index<sub_index> {}, avx {});
+            }
+        }
+
         // insert
         template <class A, size_t I>
         XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<avx512f>) noexcept
@@ -2753,46 +2844,6 @@ namespace xsimd
                                           2));
         }
 
-        // first
-        template <class A>
-        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
-        {
-            return _mm512_cvtss_f32(self);
-        }
-
-        template <class A>
-        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
-        {
-            return _mm512_cvtsd_f64(self);
-        }
-
-        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
-        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
-        {
-            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-            {
-                batch<T, sse4_2> low = _mm512_castsi512_si128(self);
-                return first(low, sse4_2 {});
-            }
-            else
-            {
-                assert(false && "unsupported arch/op combination");
-                return {};
-            }
-        }
-
         // widen
         template <class A, class T>
         XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx512f>) noexcept
diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp
index 4af19a650..8d33d5a9e 100644
--- a/include/xsimd/arch/xsimd_neon.hpp
+++ b/include/xsimd/arch/xsimd_neon.hpp
@@ -2742,6 +2742,61 @@ namespace xsimd
             return vshrq_n_s64(x, shift);
         }
 
+        // get
+        template <size_t I, class A>
+        XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_f32(self, I);
+        }
+
+        template <size_t I, class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u8(self, I);
+        }
+
+        template <size_t I, class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s8(self, I);
+        }
+
+        template <size_t I, class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u16(self, I);
+        }
+
+        template <size_t I, class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s16(self, I);
+        }
+
+        template <size_t I, class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u32(self, I);
+        }
+
+        template <size_t I, class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s32(self, I);
+        }
+
+        template <size_t I, class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u64(self, I);
+        }
+
+        template <size_t I, class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s64(self, I);
+        }
+
         // first
         template <class A>
         XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<neon>) noexcept
diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp
index 602b4b207..39b6a2edf 100644
--- a/include/xsimd/arch/xsimd_neon64.hpp
+++ b/include/xsimd/arch/xsimd_neon64.hpp
@@ -31,6 +31,13 @@ namespace xsimd
     {
         using namespace types;
 
+        // get
+        template <size_t I, class A>
+        XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<neon64>) noexcept
+        {
+            return vgetq_lane_f64(self, I);
+        }
+
         // first
         template <class A>
         XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<neon64>) noexcept
diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp
index 7439c8037..20d1a38e4 100644
--- a/include/xsimd/arch/xsimd_rvv.hpp
+++ b/include/xsimd/arch/xsimd_rvv.hpp
@@ -1369,6 +1369,27 @@ namespace xsimd
            return std::complex<T> { detail::rvvmv_lane0(tmpr), detail::rvvmv_lane0(tmpi) };
        }
 
+        // get (compile-time index): skip the slidedown when I == 0; lane 0 maps straight to the scalar move.
+        template <size_t I, class A, class T, detail::rvv_enable_all_t<T> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& arg, index<I>, requires_arch<rvv>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return detail::rvvmv_lane0(arg);
+            }
+            return get(arg, I, rvv {});
+        }
+
+        template <size_t I, class A, class T, detail::rvv_enable_all_t<T> = 0>
+        XSIMD_INLINE std::complex<T> get(batch<std::complex<T>, A> const& arg, index<I>, requires_arch<rvv>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return std::complex<T> { detail::rvvmv_lane0(arg.real()), detail::rvvmv_lane0(arg.imag()) };
+            }
+            return get(arg, I, rvv {});
+        }
+
         // all
         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index 7e00b2b74..18e73f63d 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -2276,6 +2276,57 @@ namespace xsimd
             }
         }
 
+        // get (must appear after first and swizzle so it can delegate through the xsimd API)
+        namespace detail
+        {
+            // broadcast lane index I across a batch_constant matching batch<T, A>::size
+            template <size_t I, class T, class A, size_t... Is>
+            XSIMD_INLINE auto broadcast_lane_index(std::index_sequence<Is...>) noexcept
+                -> batch_constant<as_unsigned_integer_t<T>, A, static_cast<as_unsigned_integer_t<T>>(Is * 0 + I)...>
+            {
+                return {};
+            }
+
+            template <size_t I, class T, class A>
+            XSIMD_INLINE auto broadcast_lane_index() noexcept
+                -> decltype(broadcast_lane_index<I, T, A>(std::make_index_sequence<batch<T, A>::size> {}))
+            {
+                return {};
+            }
+        }
+
+        template <size_t I, class A, class T>
+        XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value && sizeof(T) <= 2, T>::type
+        get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, A {});
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_extract_epi16(self, I));
+            }
+            else
+            {
+                // SSE2 has no pextrb; byte-lane shift + movd is the shortest path for I>0.
+                return static_cast<T>(_mm_cvtsi128_si32(_mm_srli_si128(self, I)) & 0xFF);
+            }
+        }
+
+        template <size_t I, class A, class T>
+        XSIMD_INLINE typename std::enable_if<(std::is_integral<T>::value && sizeof(T) >= 4) || std::is_floating_point<T>::value, T>::type
+        get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, A {});
+            }
+            else
+            {
+                return first(swizzle(self, detail::broadcast_lane_index<I, T, A>(), A {}), A {});
+            }
+        }
     }
 }
diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp
index bb3a6ca2c..f1af688c3 100644
--- a/include/xsimd/arch/xsimd_sse4_1.hpp
+++ b/include/xsimd/arch/xsimd_sse4_1.hpp
@@ -105,6 +105,41 @@ namespace xsimd
             return _mm_floor_pd(self);
         }
 
+        // get
+        template <size_t I, class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<sse4_1>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, sse2 {});
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm_extract_epi8(self, I));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_extract_epi16(self, I));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm_extract_epi32(self, I));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+#if defined(__x86_64__)
+                return static_cast<T>(_mm_extract_epi64(self, I));
+#else
+                return get(self, ::xsimd::index<I> {}, sse2 {});
+#endif
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
         // insert
         template <class A, class T, size_t I, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index 5e6b3a209..b88f49033 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -1109,6 +1109,37 @@ namespace xsimd
         return x > y;
     }
 
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Extract the scalar element at compile-time index \c I from batch \c b.
+     * @param b the batch to extract from.
+     * @return the scalar element at index \c I.
+     */
+    template <size_t I, class T, class A>
+    XSIMD_INLINE T get(batch<T, A> const& b) noexcept
+    {
+        static_assert(I < batch<T, A>::size, "index out of bounds");
+        detail::static_check_supported_config<T, A>();
+        return kernel::get(b, index<I> {}, A {});
+    }
+
+    template <size_t I, class T, class A>
+    XSIMD_INLINE bool get(batch_bool<T, A> const& b) noexcept
+    {
+        static_assert(I < batch_bool<T, A>::size, "index out of bounds");
+        detail::static_check_supported_config<T, A>();
+        return kernel::get(b, index<I> {}, A {});
+    }
+
+    template <size_t I, class T, class A>
+    XSIMD_INLINE typename batch<std::complex<T>, A>::value_type get(batch<std::complex<T>, A> const& b) noexcept
+    {
+        static_assert(I < batch<std::complex<T>, A>::size, "index out of bounds");
+        detail::static_check_supported_config<T, A>();
+        return kernel::get(b, index<I> {}, A {});
+    }
+
     /**
      * @ingroup batch_reducers
      *
diff --git a/test/test_batch.cpp b/test/test_batch.cpp
index 5cf47f3d7..403bf00df 100644
--- a/test/test_batch.cpp
+++ b/test/test_batch.cpp
@@ -158,6 +158,21 @@ struct batch_test
         CHECK_EQ(res.first(), lhs[0]);
     }
 
+    template <size_t... Is>
+    void test_get_impl(batch_type const& res, std::index_sequence<Is...>) const
+    {
+        array_type extracted = { xsimd::get<Is>(res)... };
+        CHECK_EQ(extracted, lhs);
+        CHECK_BATCH_EQ(batch_type::load_unaligned(extracted.data()), res);
+    }
+
+    void test_get() const
+    {
+        batch_type res = batch_lhs();
+        CHECK_EQ(xsimd::get<0>(res), res.first());
+        test_get_impl(res, std::make_index_sequence<size> {});
+    }
+
     void test_arithmetic() const
     {
         // +batch
@@ -986,6 +1001,11 @@ TEST_CASE_TEMPLATE("[batch]", B, BATCH_TYPES)
         Test.test_first_element();
     }
 
+    SUBCASE("get")
+    {
+        Test.test_get();
+    }
+
     SUBCASE("arithmetic")
     {
         Test.test_arithmetic();
diff --git a/test/test_batch_complex.cpp b/test/test_batch_complex.cpp
index e06ad83fb..8e4787c6c 100644
--- a/test/test_batch_complex.cpp
+++ b/test/test_batch_complex.cpp
@@ -182,6 +182,21 @@ struct batch_complex_test
         CHECK_EQ(res.first(), lhs[0]);
     }
 
+    template <size_t... Is>
+    void test_get_impl(batch_type const& res, std::index_sequence<Is...>) const
+    {
+        array_type extracted = { xsimd::get<Is>(res)... };
+        CHECK_EQ(extracted, lhs);
+        CHECK_BATCH_EQ(batch_type::load_unaligned(extracted.data()), res);
+    }
+
+    void test_get() const
+    {
+        batch_type res = batch_lhs();
+        CHECK_EQ(xsimd::get<0>(res), res.first());
+        test_get_impl(res, std::make_index_sequence<size> {});
+    }
+
     void test_arithmetic() const
     {
         // +batch
@@ -689,6 +704,8 @@ TEST_CASE_TEMPLATE("[xsimd complex batches]", B, BATCH_COMPLEX_TYPES)
     SUBCASE("first element") { Test.test_first_element(); }
 
+    SUBCASE("get") { Test.test_get(); }
+
     SUBCASE("arithmetic") { Test.test_arithmetic(); }
     SUBCASE("computed_assignment") { Test.test_computed_assignment(); }
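
A minimal usage sketch of the xsimd::get<I> API introduced by this patch (not part of the diff). It only relies on the signature added in xsimd_api.hpp and assumes a build where the default architecture is one of the targets patched above.

    #include <xsimd/xsimd.hpp>
    #include <array>
    #include <iostream>

    int main()
    {
        using b_type = xsimd::batch<float>; // batch for the default architecture
        std::array<float, b_type::size> data {};
        for (std::size_t i = 0; i < b_type::size; ++i)
            data[i] = static_cast<float>(i);

        b_type b = b_type::load_unaligned(data.data());

        // Compile-time lane extraction: index 0 routes through first(), higher
        // indices use the per-architecture kernels (pextr*, valign*, vgetq_lane_*, ...).
        std::cout << xsimd::get<0>(b) << ' ' << xsimd::get<b_type::size - 1>(b) << '\n';
        return 0;
    }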