Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/xsimd/arch/common/xsimd_common_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,8 @@ namespace xsimd
// Extract element I from a batch of std::complex<T> via an aligned stack
// buffer (portable fallback used by the common architecture).
// NOTE(review): this span is a diff hunk — the first `buffer` declaration is
// the removed pre-change line; the `value_type` pair below is its replacement.
template <class A, size_t I, class T>
XSIMD_INLINE typename batch<std::complex<T>, A>::value_type get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<common>) noexcept
{
    alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
    // Use the batch's own value_type so the buffer element type matches what
    // store_aligned writes (std::complex<T>, not plain T).
    using value_type = typename batch<std::complex<T>, A>::value_type;
    alignas(A::alignment()) value_type buffer[batch<std::complex<T>, A>::size];
    self.store_aligned(&buffer[0]);
    return buffer[I];
}
Expand Down
114 changes: 74 additions & 40 deletions include/xsimd/arch/xsimd_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,80 @@ namespace xsimd
return self - batch<T, A>(mask.data);
}

// first (must precede get for two-phase lookup)
template <class A>
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
{
    // Lane 0 of a 256-bit float vector maps directly onto the scalar move.
    const __m256 reg = self;
    return _mm256_cvtss_f32(reg);
}

template <class A>
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
{
    // Scalar move of lane 0; no shuffle required.
    const __m256d reg = self;
    return _mm256_cvtsd_f64(reg);
}

// Return lane 0 of an integral AVX batch. The 256-bit register is narrowed to
// its low 128-bit half (a zero-cost cast), the low 32 bits are moved to a GPR,
// and the result is masked down to the element width.
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
{
    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
    {
        // Keep only the low byte before casting to the (possibly signed) T.
        return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFF);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
    {
        // Keep only the low 16 bits.
        return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFFFF);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
    {
        // Full 32-bit move; no mask needed.
        return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)));
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
    {
        // No 64-bit scalar move in AVX1; delegate to the SSE4.2 kernel on the low half.
        batch<T, sse4_2> low = _mm256_castsi256_si128(self);
        return first(low, sse4_2 {});
    }
    else
    {
        assert(false && "unsupported arch/op combination");
        return {};
    }
}

// get
template <class A, size_t I>
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
{
    XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
    // Split the global index into (128-bit half, position within that half)
    // and forward to the SSE4.1 kernel on the selected half.
    constexpr size_t half_width = batch<float, sse4_1>::size;
    constexpr size_t pos_in_half = I % half_width;
    if (I / half_width == 0)
    {
        return kernel::get(batch<float, sse4_1>(detail::lower_half(self)), ::xsimd::index<pos_in_half> {}, sse4_1 {});
    }
    return kernel::get(batch<float, sse4_1>(detail::upper_half(self)), ::xsimd::index<pos_in_half> {}, sse4_1 {});
}

template <class A, size_t I>
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
{
    XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
    // Split the global index into (128-bit half, position within that half)
    // and forward to the SSE4.1 kernel on the selected half.
    constexpr size_t half_width = batch<double, sse4_1>::size;
    constexpr size_t pos_in_half = I % half_width;
    if (I / half_width == 0)
    {
        return kernel::get(batch<double, sse4_1>(detail::lower_half(self)), ::xsimd::index<pos_in_half> {}, sse4_1 {});
    }
    return kernel::get(batch<double, sse4_1>(detail::upper_half(self)), ::xsimd::index<pos_in_half> {}, sse4_1 {});
}

template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
{
    XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
    // Split the global index into (128-bit half, position within that half)
    // and forward to the SSE4.1 kernel on the selected half.
    constexpr size_t half_width = batch<T, sse4_1>::size;
    constexpr size_t pos_in_half = I % half_width;
    if (I / half_width == 0)
    {
        return kernel::get(batch<T, sse4_1>(detail::lower_half(self)), ::xsimd::index<pos_in_half> {}, sse4_1 {});
    }
    return kernel::get(batch<T, sse4_1>(detail::upper_half(self)), ::xsimd::index<pos_in_half> {}, sse4_1 {});
}

// insert
template <class A, class T, size_t I, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
Expand Down Expand Up @@ -2015,46 +2089,6 @@ namespace xsimd
return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
}

// first
// NOTE(review): pre-change copy, removed by this diff in favor of the
// identical definition hoisted above the `get` kernels (ordering needed so
// two-phase lookup finds `first` from `get`).
template <class A>
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
{
    return _mm256_cvtss_f32(self);
}

// NOTE(review): pre-change copy, removed by this diff; the hoisted copy above
// `get` is byte-identical.
template <class A>
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
{
    return _mm256_cvtsd_f64(self);
}

// NOTE(review): pre-change copy, removed by this diff. The replacement swaps
// `_mm256_cvtsi256_si32(self)` for
// `_mm_cvtsi128_si32(_mm256_castsi256_si128(self))` — presumably because
// `_mm256_cvtsi256_si32` is not available on all supported compilers; verify
// against the PR discussion.
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
{
    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
    {
        return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFF);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
    {
        return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFFFF);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
    {
        return static_cast<T>(_mm256_cvtsi256_si32(self));
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
    {
        batch<T, sse4_2> low = _mm256_castsi256_si128(self);
        return first(low, sse4_2 {});
    }
    else
    {
        assert(false && "unsupported arch/op combination");
        return {};
    }
}

// widen
template <class A, class T>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept
Expand Down
131 changes: 91 additions & 40 deletions include/xsimd/arch/xsimd_avx512f.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1346,6 +1346,97 @@ namespace xsimd
}
}

// first (must precede get for two-phase lookup)
template <class A>
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
    // Lane 0 of a 512-bit float vector maps directly onto the scalar move.
    const __m512 reg = self;
    return _mm512_cvtss_f32(reg);
}

template <class A>
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
    // Scalar move of lane 0; no shuffle required.
    const __m512d reg = self;
    return _mm512_cvtsd_f64(reg);
}

// Return lane 0 of an integral AVX-512 batch. The 512-bit register is narrowed
// to its low 128-bit part (a zero-cost cast), the low 32 bits are moved to a
// GPR, and the result is masked down to the element width.
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
    {
        // Keep only the low byte before casting to the (possibly signed) T.
        return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
    {
        // Keep only the low 16 bits.
        return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
    {
        // Full 32-bit move; no mask needed.
        return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
    {
        // Delegate the 64-bit extraction to the SSE4.2 kernel on the low part.
        batch<T, sse4_2> low = _mm512_castsi512_si128(self);
        return first(low, sse4_2 {});
    }
    else
    {
        assert(false && "unsupported arch/op combination");
        return {};
    }
}

// get: use valignd/valignq to rotate lane I into position 0 in a single op.
template <class A, size_t I>
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
{
    XSIMD_IF_CONSTEXPR(I == 0)
    {
        return first(self, avx512f {});
    }
    // Reinterpret as 32-bit integers, rotate lane I down to lane 0 (valignd),
    // then read lane 0 as a float.
    const __m512i bits = _mm512_castps_si512(self);
    const __m512i rotated = _mm512_alignr_epi32(bits, bits, I);
    return _mm_cvtss_f32(_mm512_castps512_ps128(_mm512_castsi512_ps(rotated)));
}

template <class A, size_t I>
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
{
    XSIMD_IF_CONSTEXPR(I == 0)
    {
        return first(self, avx512f {});
    }
    // Reinterpret as 64-bit integers, rotate lane I down to lane 0 (valignq),
    // then read lane 0 as a double.
    const __m512i bits = _mm512_castpd_si512(self);
    const __m512i rotated = _mm512_alignr_epi64(bits, bits, I);
    return _mm_cvtsd_f64(_mm512_castpd512_pd128(_mm512_castsi512_pd(rotated)));
}

// Extract lane I from an integral AVX-512 batch. 32/64-bit lanes use a single
// register rotate; narrower lanes fall back to the AVX half-width kernel.
template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
{
    XSIMD_IF_CONSTEXPR(I == 0)
    {
        // Lane 0 needs no rotation at all.
        return first(self, avx512f {});
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
    {
        // valignd rotates 32-bit lane I down to lane 0, then lane 0 is read.
        const auto rotated = _mm512_alignr_epi32(self, self, I);
        return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
    {
        // valignq, the 64-bit variant of the same rotate.
        const auto rotated = _mm512_alignr_epi64(self, self, I);
        return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
    }
    else
    {
        // 8/16-bit lanes have no sub-dword rotate in AVX-512F; delegate to AVX halves.
        constexpr size_t elements_per_lane = batch<T, avx>::size;
        constexpr size_t lane = I / elements_per_lane;
        constexpr size_t sub_index = I % elements_per_lane;
        const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
        return kernel::get(batch<T, avx>(half), ::xsimd::index<sub_index> {}, avx {});
    }
}

// insert
template <class A, size_t I>
XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<avx512f>) noexcept
Expand Down Expand Up @@ -2753,46 +2844,6 @@ namespace xsimd
2));
}

// first
// NOTE(review): pre-change copy, removed by this diff in favor of the
// identical definition hoisted above the `get` kernels.
template <class A>
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
    return _mm512_cvtss_f32(self);
}

// NOTE(review): pre-change copy, removed by this diff; the hoisted copy above
// `get` is byte-identical.
template <class A>
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
    return _mm512_cvtsd_f64(self);
}

// NOTE(review): pre-change copy, removed by this diff. Unlike the AVX case,
// this body is unchanged by the move — the hoisted copy above `get` is
// byte-identical.
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
    {
        return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
    {
        return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
    {
        return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
    {
        batch<T, sse4_2> low = _mm512_castsi512_si128(self);
        return first(low, sse4_2 {});
    }
    else
    {
        assert(false && "unsupported arch/op combination");
        return {};
    }
}

// widen
template <class A, class T>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx512f>) noexcept
Expand Down
55 changes: 55 additions & 0 deletions include/xsimd/arch/xsimd_neon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2742,6 +2742,61 @@ namespace xsimd
return vshrq_n_s64(x, shift);
}

// get
template <class A, size_t I>
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
{
    // The lane number must be a compile-time constant for the NEON intrinsic.
    constexpr int lane = static_cast<int>(I);
    return vgetq_lane_f32(self, lane);
}

template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
{
    // Unsigned 8-bit lane read; the lane index is an immediate.
    constexpr int lane = static_cast<int>(I);
    return static_cast<T>(vgetq_lane_u8(self, lane));
}

template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 1> = 0>
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
{
    // Signed 8-bit lane read; the lane index is an immediate.
    constexpr int lane = static_cast<int>(I);
    return static_cast<T>(vgetq_lane_s8(self, lane));
}

template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
{
    // Unsigned 16-bit lane read; the lane index is an immediate.
    constexpr int lane = static_cast<int>(I);
    return static_cast<T>(vgetq_lane_u16(self, lane));
}

template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 2> = 0>
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
{
    // Signed 16-bit lane read; the lane index is an immediate.
    constexpr int lane = static_cast<int>(I);
    return static_cast<T>(vgetq_lane_s16(self, lane));
}

template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
{
    // Unsigned 32-bit lane read; the lane index is an immediate.
    constexpr int lane = static_cast<int>(I);
    return static_cast<T>(vgetq_lane_u32(self, lane));
}

template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 4> = 0>
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
{
    // Signed 32-bit lane read; the lane index is an immediate.
    constexpr int lane = static_cast<int>(I);
    return static_cast<T>(vgetq_lane_s32(self, lane));
}

template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
{
    // Unsigned 64-bit lane read; the lane index is an immediate.
    constexpr int lane = static_cast<int>(I);
    return static_cast<T>(vgetq_lane_u64(self, lane));
}

template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 8> = 0>
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
{
    // Signed 64-bit lane read; the lane index is an immediate.
    constexpr int lane = static_cast<int>(I);
    return static_cast<T>(vgetq_lane_s64(self, lane));
}

// first
template <class A>
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<neon>) noexcept
Expand Down
7 changes: 7 additions & 0 deletions include/xsimd/arch/xsimd_neon64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ namespace xsimd
{
using namespace types;

// get
template <class A, size_t I>
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<neon64>) noexcept
{
    // float64x2_t lane read (A64 only); the lane index is an immediate.
    constexpr int lane = static_cast<int>(I);
    return vgetq_lane_f64(self, lane);
}

// first
template <class A>
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<neon64>) noexcept
Expand Down
21 changes: 21 additions & 0 deletions include/xsimd/arch/xsimd_rvv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1369,6 +1369,27 @@ namespace xsimd
return std::complex<T> { detail::rvvmv_lane0(tmpr), detail::rvvmv_lane0(tmpi) };
}

// get (compile-time index): skip the slidedown when I == 0; lane 0 maps straight to the scalar move.
template <class A, size_t I, class T, detail::rvv_enable_all_t<T> = 0>
XSIMD_INLINE T get(batch<T, A> const& arg, index<I>, requires_arch<rvv>) noexcept
{
    XSIMD_IF_CONSTEXPR(I == 0)
    {
        // Lane 0: read the scalar directly, no slide needed.
        return detail::rvvmv_lane0(arg);
    }
    else
    {
        // Any other lane: fall back to the runtime-index implementation.
        return get(arg, I, rvv {});
    }
}

template <class A, size_t I, class T, detail::rvv_enable_all_t<T> = 0>
XSIMD_INLINE std::complex<T> get(batch<std::complex<T>, A> const& arg, index<I>, requires_arch<rvv>) noexcept
{
    XSIMD_IF_CONSTEXPR(I == 0)
    {
        // Lane 0: assemble the complex value from the two component registers.
        return std::complex<T> { detail::rvvmv_lane0(arg.real()), detail::rvvmv_lane0(arg.imag()) };
    }
    else
    {
        // Any other lane: fall back to the runtime-index implementation.
        return get(arg, I, rvv {});
    }
}

// all
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
Expand Down
Loading
Loading