Hi,
Fast array reversal with SIMD (A small study in hardware accelerated AoS reversal) -
https://dev.to/wunk/fast-array-reversal-with-simd-j3phttps://github.com/Wunkolo/qreverse/archive/refs/heads/master.zip// Powers of two
Bench< ELEMENTSIZE, 8 >();
Bench< ELEMENTSIZE, 16 >();
Bench< ELEMENTSIZE, 32 >();
Bench< ELEMENTSIZE, 64 >();
Bench< ELEMENTSIZE, 128 >();
Bench< ELEMENTSIZE, 256 >();
Bench< ELEMENTSIZE, 512 >();
Bench< ELEMENTSIZE, 1024 >();
// Powers of ten
Bench< ELEMENTSIZE, 100 >();
Bench< ELEMENTSIZE, 1000 >();
Bench< ELEMENTSIZE, 10000 >();
Bench< ELEMENTSIZE, 100000 >();
Bench< ELEMENTSIZE, 1000000 >();
// Primes
Bench< ELEMENTSIZE, 59 >();
Bench< ELEMENTSIZE, 79 >();
Bench< ELEMENTSIZE, 173 >();
Bench< ELEMENTSIZE, 6133 >();
Bench< ELEMENTSIZE, 10177 >();
Bench< ELEMENTSIZE, 25253 >();
Bench< ELEMENTSIZE, 31391 >();
Bench< ELEMENTSIZE, 50432 >();
AVX2 byte
void qReverse<1>(void* Array, std::size_t Count)
{
std::uint8_t* Array8 = reinterpret_cast<std::uint8_t*>(Array);
std::size_t i = 0;
for( std::size_t j = i / 32; j < ((Count / 2) / 32); ++j )
{
const __m256i ShuffleRev = _mm256_set_epi8(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15
);
// Load 32 elements at once into one 32-byte register
__m256i Lower = _mm256_loadu_si256(
reinterpret_cast<__m256i*>(&Array8[i])
);
__m256i Upper = _mm256_loadu_si256(
reinterpret_cast<__m256i*>(&Array8[Count - i - 32])
);
// Reverse the byte order of our 32-byte vectors
Lower = _mm256_shuffle_epi8(Lower,ShuffleRev);
Upper = _mm256_shuffle_epi8(Upper,ShuffleRev);
Lower = _mm256_permute2x128_si256(Lower,Lower,1);
Upper = _mm256_permute2x128_si256(Upper,Upper,1);
// Place them at their swapped position
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(&Array8[i]),
Upper
);
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(&Array8[Count - i - 32]),
Lower
);
// 32 elements at a time
i += 32;
}
}// BSWAP 32
for( std::size_t j = i / 4; j < ((Count / 2) / 4); ++j )
{
// Get bswapped versions of our Upper and Lower 4-byte chunks
std::uint32_t Lower = Swap32(
*reinterpret_cast<std::uint32_t*>(&Array8[i])
);
std::uint32_t Upper = Swap32(
*reinterpret_cast<std::uint32_t*>(&Array8[Count - i - 4])
);
// Place them at their swapped position
*reinterpret_cast<std::uint32_t*>(&Array8[i]) = Upper;
*reinterpret_cast<std::uint32_t*>(&Array8[Count - i - 4]) = Lower;
// Four elements at a time
i += 4;
}
// BSWAP 16
for( std::size_t j = i / 2; j < ((Count / 2) / 2); ++j )
{
// Get bswapped versions of our Upper and Lower 4-byte chunks
std::uint16_t Lower = Swap16(
*reinterpret_cast<std::uint16_t*>(&Array8[i])
);
std::uint16_t Upper = Swap16(
*reinterpret_cast<std::uint16_t*>(&Array8[Count - i - 2])
);
// Place them at their swapped position
*reinterpret_cast<std::uint16_t*>(&Array8[i]) = Upper;
*reinterpret_cast<std::uint16_t*>(&Array8[Count - i - 2]) = Lower;
// Two elements at a time
i += 2;
}
// Everything else that we can not do a bswap on, we swap normally
// Naive swaps
for( ; i < Count / 2; ++i )
{
// Exchange the upper and lower element as we work our
// way down to the middle from either end
std::uint8_t Temp(Array8[i]);
Array8[i] = Array8[Count - i - 1];
Array8[Count - i - 1] = Temp;
}
}SSE3void qReverse<1>(void* Array, std::size_t Count)
{
std::uint8_t* Array8 = reinterpret_cast<std::uint8_t*>(Array);
std::size_t i = 0;
for( std::size_t j = i / 16; j < ((Count / 2) / 16); ++j )
{
const __m128i ShuffleRev = _mm_set_epi8(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
);
// Load 16 elements at once into one 16-byte register
__m128i Lower = _mm_loadu_si128(
reinterpret_cast<__m128i*>(&Array8[i])
);
__m128i Upper = _mm_loadu_si128(
reinterpret_cast<__m128i*>(&Array8[Count - i - 16])
);
// Reverse the byte order of our 16-byte vectors
Lower = _mm_shuffle_epi8(Lower, ShuffleRev);
Upper = _mm_shuffle_epi8(Upper, ShuffleRev);
// Place them at their swapped position
_mm_storeu_si128(
reinterpret_cast<__m128i*>(&Array8[i]),
Upper
);
_mm_storeu_si128(
reinterpret_cast<__m128i*>(&Array8[Count - i - 16]),
Lower
);
// 16 elements at a time
i += 16;
}
// BSWAP 32
for( std::size_t j = i / 4; j < ((Count / 2) / 4); ++j )
{
// Get bswapped versions of our Upper and Lower 4-byte chunks
std::uint32_t Lower = Swap32(
*reinterpret_cast<std::uint32_t*>(&Array8[i])
);
std::uint32_t Upper = Swap32(
*reinterpret_cast<std::uint32_t*>(&Array8[Count - i - 4])
);
// Place them at their swapped position
*reinterpret_cast<std::uint32_t*>(&Array8[i]) = Upper;
*reinterpret_cast<std::uint32_t*>(&Array8[Count - i - 4]) = Lower;
// Four elements at a time
i += 4;
}
// BSWAP 16
for( std::size_t j = i / 2; j < ((Count / 2) / 2); ++j )
{
// Get bswapped versions of our Upper and Lower 4-byte chunks
std::uint16_t Lower = Swap16(
*reinterpret_cast<std::uint16_t*>(&Array8[i])
);
std::uint16_t Upper = Swap16(
*reinterpret_cast<std::uint16_t*>(&Array8[Count - i - 2])
);
// Place them at their swapped position
*reinterpret_cast<std::uint16_t*>(&Array8[i]) = Upper;
*reinterpret_cast<std::uint16_t*>(&Array8[Count - i - 2]) = Lower;
// Two elements at a time
i += 2;
}
// Everything else that we can not do a bswap on, we swap normally
// Naive swaps
for( ; i < Count / 2; ++i )
{
// Exchange the upper and lower element as we work our
// way down to the middle from either end
std::uint8_t Temp(Array8[i]);
Array8[i] = Array8[Count - i - 1];
Array8[Count - i - 1] = Temp;
}
}