Is quicksort the fastest sorting algorithm in the universe? Well... not anymore :biggrin: :biggrin: :biggrin: :biggrin:
Here is an implementation of a Bitonic Sort algorithm that seems to be 5 times faster than quicksort (under the same conditions). The algorithm is designed to sort an array (or chunk) of 16 doubles in a faster way, using an old-school technique :bgrin: :bgrin:
In my tests, under the same circumstances (sorting 16 doubles), the average speed of the algorithm was about 98-102 clocks (28.5 nanoseconds on average), while quicksort took around 500 clocks. Can someone please test the algorithm for us to benchmark this?
The algo works for sorting 16 positive or negative Real8 values.
The current algorithm was based on the excellent work of mischasan and the article by Timothy Furtak et al., "Using SIMD Registers and Instructions to Enable Instruction-Level Parallelism in Sorting Algorithms", published 09/Jun/2007.
References:
https://mischasan.wordpress.com/2011/09/02/update-on-bitonic-sse2-sort-of-16-doubles/
https://github.com/mischasan/sse2
Example of usage:
[DatatoSort: REAL8 136, 151, 216, 40, 13, 215, 24, 104, 230, 154, 227, 219, 52, 217, 38, 131]; Sorry, I forgot the proper MASM syntax for REAL8 — but it is a sequence of 16 doubles (REAL8).
invoke BitonicSortD offset DatatoSort
;---------------------------------------------------------------------------
; BitonicSortD - sorts 16 REAL8 (double) values in place, ascending.
;
; Call:     invoke BitonicSortD, offset DatatoSort   (one dword arg; retn 4)
; In:       [ebp+pArray] = pointer to 16 contiguous doubles
; Out:      the 16 doubles at that address are rewritten in ascending order
; Locals:   132 bytes on the stack; [ebp+pCurTable] points at a 128-byte
;           scratch table of 8 xmm-sized slots (2 doubles per slot)
; Saves:    esi, edi, edx, ecx (pushed/popped below)
; Clobbers: xmm0, xmm1, xmm2, eflags
;
; Method:   six unrolled passes of packed SSE2 min/max (minpd/maxpd) build
;           two independently sorted "columns" inside the scratch table
;           (the low qword of every 16-byte slot forms column 0, the high
;           qword forms column 1); a final scalar pass merges the two
;           sorted columns back into the caller's array.
; NOTE(review): the comisd/jbe compare in the merge assumes the data
;           contains no NaNs -- confirm inputs are ordinary (possibly
;           negative) reals.
;---------------------------------------------------------------------------
BitonicSortD proc near
pCurTable = dword ptr -4                ; local: address of the scratch table
pArray = dword ptr 8                    ; argument: address of the 16 doubles
push ebp
mov ebp, esp
sub esp, 132                            ; 128-byte scratch table + local dword
mov [ebp+pCurTable], esp                ; scratch table lives at current esp
push esi
push edi
push edx
push ecx
mov edi, [ebp+pArray]                   ; edi = caller's array (input, later output)
mov ecx, [ebp+pCurTable]                ; ecx = scratch table base (column 0)
lea esi, [ecx+8]                        ; esi = column 1 (high qword of slot 0)
mov edx, ecx
add edx, 128                            ; edx = one past the last scratch slot
; --- Network pass 1: compare slot pairs (0,1) (2,3) (4,5) (6,7) of the
;     INPUT array; each unit writes the packed min to the lower slot and
;     the packed max to the higher slot of the scratch table.
movups xmm0, xmmword ptr [edi+0]
movups xmm2, xmm0
movups xmm1, xmmword ptr [edi+16]
minpd xmm0, xmm1
movups xmmword ptr [ecx+0], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+16], xmm2
movups xmm0, xmmword ptr [edi+32]
movups xmm2, xmm0
movups xmm1, xmmword ptr [edi+48]
minpd xmm0, xmm1
movups xmmword ptr [ecx+32], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+48], xmm2
movups xmm0, xmmword ptr [edi+64]
movups xmm2, xmm0
movups xmm1, xmmword ptr [edi+80]
minpd xmm0, xmm1
movups xmmword ptr [ecx+64], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+80], xmm2
movups xmm0, xmmword ptr [edi+96]
movups xmm2, xmm0
movups xmm1, xmmword ptr [edi+112]
minpd xmm0, xmm1
movups xmmword ptr [ecx+96], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+112], xmm2
; --- Network pass 2: in-place on the scratch table, slot pairs
;     (0,3) (1,2) and (4,7) (5,6).
movups xmm0, xmmword ptr [ecx+0]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+48]
minpd xmm0, xmm1
movups xmmword ptr [ecx+0], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+48], xmm2
movups xmm0, xmmword ptr [ecx+16]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+32]
minpd xmm0, xmm1
movups xmmword ptr [ecx+16], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+32], xmm2
movups xmm0, xmmword ptr [ecx+64]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+112]
minpd xmm0, xmm1
movups xmmword ptr [ecx+64], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+112], xmm2
movups xmm0, xmmword ptr [ecx+80]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+96]
minpd xmm0, xmm1
movups xmmword ptr [ecx+80], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+96], xmm2
; --- Network pass 3: slot pairs (0,1) (2,3) (4,5) (6,7).
movups xmm0, xmmword ptr [ecx+0]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+16]
minpd xmm0, xmm1
movups xmmword ptr [ecx+0], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+16], xmm2
movups xmm0, xmmword ptr [ecx+32]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+48]
minpd xmm0, xmm1
movups xmmword ptr [ecx+32], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+48], xmm2
movups xmm0, xmmword ptr [ecx+64]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+80]
minpd xmm0, xmm1
movups xmmword ptr [ecx+64], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+80], xmm2
movups xmm0, xmmword ptr [ecx+96]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+112]
minpd xmm0, xmm1
movups xmmword ptr [ecx+96], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+112], xmm2
; --- Network pass 4: the big reversal stage, slot pairs
;     (0,7) (1,6) (2,5) (3,4).
movups xmm0, xmmword ptr [ecx+0]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+112]
minpd xmm0, xmm1
movups xmmword ptr [ecx+0], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+112], xmm2
movups xmm0, xmmword ptr [ecx+16]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+96]
minpd xmm0, xmm1
movups xmmword ptr [ecx+16], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+96], xmm2
movups xmm0, xmmword ptr [ecx+32]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+80]
minpd xmm0, xmm1
movups xmmword ptr [ecx+32], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+80], xmm2
movups xmm0, xmmword ptr [ecx+48]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+64]
minpd xmm0, xmm1
movups xmmword ptr [ecx+48], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+64], xmm2
; --- Network pass 5: slot pairs two apart, (0,2) (1,3) (4,6) (5,7).
movups xmm0, xmmword ptr [ecx+0]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+32]
minpd xmm0, xmm1
movups xmmword ptr [ecx+0], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+32], xmm2
movups xmm0, xmmword ptr [ecx+16]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+48]
minpd xmm0, xmm1
movups xmmword ptr [ecx+16], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+48], xmm2
movups xmm0, xmmword ptr [ecx+64]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+96]
minpd xmm0, xmm1
movups xmmword ptr [ecx+64], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+96], xmm2
movups xmm0, xmmword ptr [ecx+80]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+112]
minpd xmm0, xmm1
movups xmmword ptr [ecx+80], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+112], xmm2
; --- Network pass 6 (final): adjacent slot pairs (0,1) (2,3) (4,5) (6,7).
;     After this pass each column of the scratch table is fully sorted.
movups xmm0, xmmword ptr [ecx+0]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+16]
minpd xmm0, xmm1
movups xmmword ptr [ecx+0], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+16], xmm2
movups xmm0, xmmword ptr [ecx+32]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+48]
minpd xmm0, xmm1
movups xmmword ptr [ecx+32], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+48], xmm2
movups xmm0, xmmword ptr [ecx+64]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+80]
minpd xmm0, xmm1
movups xmmword ptr [ecx+64], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+80], xmm2
movups xmm0, xmmword ptr [ecx+96]
movups xmm2, xmm0
movups xmm1, xmmword ptr [ecx+112]
minpd xmm0, xmm1
movups xmmword ptr [ecx+96], xmm0
maxpd xmm2, xmm1
movups xmmword ptr [ecx+112], xmm2
; --- Merge the two sorted columns back into the caller's array.
;     ecx walks the "current" column in 16-byte strides; esi marks the
;     other column, and xmm1 caches that column's current head element.
;     Invariant: esi always points at the element cached in xmm1, so the
;     drain loop below can simply re-read it from memory.
movsd xmm1, qword ptr [esi]             ; prime the cache with column 1's head
MergeColumns:
movsd xmm0, qword ptr [ecx]             ; head of the current column
comisd xmm0, xmm1                       ; compare heads (sets CF/ZF; NaN-free assumed)
jbe short loc_4CBBF3                    ; current head <= cached head -> emit it
xchg ecx, esi                           ; otherwise swap column roles...
movsd qword ptr [edi], xmm1             ; ...emit the cached (smaller) head
movsd xmm1, xmm0                        ; and cache the losing column's head
jmp short loc_4CBBF7
; ---------------------------------------------------------------------------
loc_4CBBF3:
movsd qword ptr [edi], xmm0             ; emit current head; cached head unchanged
loc_4CBBF7:
add edi, 8                              ; next output double
add ecx, 16                             ; advance the column whose head was emitted
cmp ecx, edx
jb MergeColumns                         ; loop until the current column is exhausted
; --- Drain: copy the remainder of the other column (starts with the
;     element still cached in xmm1, which esi points at).
FindLeftOver:
movsd xmm0, qword ptr [esi]
movsd qword ptr [edi], xmm0
add edi, 8
add esi, 16
cmp esi, edx
jb short FindLeftOver
pop ecx                                 ; restore saved registers in reverse order
pop edx
pop edi
pop esi
mov esp, ebp                            ; discard the scratch table
pop ebp
retn 4                                  ; callee pops the single dword argument
BitonicSortD endp
For more information about how the algorithm works, I'm posting attached here the RosAsm version I built, containing full information/documentation. Also, to make it easier to understand, I created macros for RosAsm that make the algorithm faster and easier to implement. Sorry, but I forgot how to build similar macros for MASM; however, following the RosAsm macro set and the MASM version, it shouldn't be hard to implement the same macros for MASM.
In RosAsm syntax, the macro set for those SSE retrievals of min and max, i made as:
SSE_MIN_MAX_D edi, ecx, 0, 1
; The SSE_MIN_MAX_D macro unrolled loads the data from edi from 4 to 4 chunks interleaved by N bytes (all multiple of 16) and store the
; result on the output (in ecx, on this case). It is unrolled like this:
; movups XMM0 X$#1+(16*#3) | movups xmm2 xmm0 | movups XMM1 X$#1+(16*#4) | minpd XMM0 xmm1 | movups X$#2+(16*#3) XMM0 | maxpd XMM2 xmm1 | movups X$#2+(16*#4) XMM2
; Parameter #1 = Input register or memory buffer to load the data
; Parameter #2 = The output register or Buffer where min max will be written onto
; Parameter #3 = The starting position to read/write the data (min value). It is an integer value from 0 to 7, i.e. at most half the size of the input of 16 doubles
; Parameter #4 = The ending position to read/write the data (max value). It is an integer value from 0 to 7, i.e. at most half the size of the input of 16 doubles
; It works like this: Ex: SSE_MIN_MAX_D edi, ecx, 0, 1
; Get the data (2 doubles) at Starting Pos * 2 == 0*2 (in this example), and compare them to Ending Pos * 2 (1*2). The comparison is done on the min/max values
; of the 2 doubles loaded from Pos1 and the 2 doubles from Pos2.
; In bytes, since we are dealing with doubles, the math always multiplies the position of those values by 16 (8 for the size of a double, times 2 for the interleaving)
; So, the data to be analysed is, in fact, 8 bytes * 2 * Wanted Position away from the input. In this example, if we feed 1, we are getting the 2 double values
; located 32 bytes (2*8*1) past the start of the input
The core of the bitonic routines is this (written as macros for easier understanding)
; Bitonic step #1 - Use the inputed data in edi to swap the min max positions and store them on output at ecx (Our temporary buffer)
SSE_MIN_MAX_D edi, ecx, 0, 1
SSE_MIN_MAX_D edi, ecx, 2, 3
SSE_MIN_MAX_D edi, ecx, 4, 5
SSE_MIN_MAX_D edi, ecx, 6, 7
; Bitonic step #2 - Now that we already reordered the Inputed pairs of Min/Max onto the Temporary Buffer, is time to we swap those values from the buffer and so the same thing, all over the 2 pairs of columns.
; So, at input and output we will always use our temporary buffer in ecx, because we are now swapping the temporary buffer.
; You can see easier from the macro that we are swapping them on a kind of `square` of 2*2 Pos long ordered on a specific way
; "Square 1" - From top to bottom (0 to 1) and then bottom to top (2 to 3)
SSE_MIN_MAX_D ecx, ecx, 0, 3
SSE_MIN_MAX_D ecx, ecx, 1, 2
; "Square 2" - From top to bottom (4 to 5) and then bottom to top (6 to 7)
SSE_MIN_MAX_D ecx, ecx, 4, 7
SSE_MIN_MAX_D ecx, ecx, 5, 6
; "Square 3" - From top left to right (0 to 1) and bottom right to left (2 to 3)
SSE_MIN_MAX_D ecx, ecx, 0, 1
SSE_MIN_MAX_D ecx, ecx, 2, 3
; "Square 4" - From top left to right (4 to 5) and bottom right to left (6 to 7)
SSE_MIN_MAX_D ecx, ecx, 4, 5
SSE_MIN_MAX_D ecx, ecx, 6, 7
; Bitonic step #3 - We do the same swapping as in the previous step, but using the same kind of square of 4*2 Pos long ordered on the same way as previously
; "Square 1" - From top to bottom (0 to 3) and then bottom to top (4 to 7)
SSE_MIN_MAX_D ecx, ecx, 0, 7
SSE_MIN_MAX_D ecx, ecx, 1, 6
SSE_MIN_MAX_D ecx, ecx, 2, 5
SSE_MIN_MAX_D ecx, ecx, 3, 4
; "Square 2" - From top left to right interleaved by 2 pos (0 to 3), (1 to 3), (4 to 6) and then (5 to 7)
SSE_MIN_MAX_D ecx, ecx, 0, 2
SSE_MIN_MAX_D ecx, ecx, 1, 3
SSE_MIN_MAX_D ecx, ecx, 4, 6
SSE_MIN_MAX_D ecx, ecx, 5, 7
; "Square 3" - From top left to right (ordered) by 1 pos (0 to 1), (2 to 3), (4 to 5) and then (6 to 7)
SSE_MIN_MAX_D ecx, ecx, 0, 1
SSE_MIN_MAX_D ecx, ecx, 2, 3
SSE_MIN_MAX_D ecx, ecx, 4, 5
SSE_MIN_MAX_D ecx, ecx, 6, 7
Alternatively, you can read how the algo works, also fully documented here:
http://masm32.com/board/index.php?topic=8697.msg94951#msg94951
This algorithm can be extended to also sort floats (in blocks of 32 chunks, perhaps), or maybe even bytes (in blocks of 64 bytes).
I'm not sure how fast that would be, since in my tests I limited it to 16 doubles, and compared to quicksort it was 5 times faster.
Maybe it still has room for further optimization, but I've reached my limit of optimization tricks. I leave further optimizations to Marinus and JJ, the masters of optimization tricks :greensml: :greensml: :greensml: I'm just a pupil at this compared to them. :greensml: :greensml: :greensml: :greensml:
Updated the attached Info. Fixed some typos :greensml: :greensml: