For a university project i made the algorithm of the bubble sort in SSE3 and AVX. So the version SSE3 is not too bad, 23 seconds for sort 100000 numbers. The AVX version is so slow, 32 seconds for 100000 numbers. For me is important to improve the avx algorithm. In pastebin there are my code: SSE3 http://pastebin.com/EimcJdQg (http://pastebin.com/EimcJdQg) and AVX http://pastebin.com/bmQtNKrq (http://pastebin.com/bmQtNKrq). I have an answer opened to StackOverflow where there are many details for any help http://stackoverflow.com/questions/17407414/avx-and-bubble-sort/ (http://stackoverflow.com/questions/17407414/avx-and-bubble-sort/).
If you want more details i'm here. Thanks
I'm afraid that bubble sort can't benefit from SIMD instructions. Also, AFAICS the AVX instructions does only increase the instructions size.
You might try this plain x86 variation for compare:
.data
array REAL4 -0.0,12.0E8,-0.1,1.0,54.0E-3,2.1,0.1,-123.0,-0.0001,123456789.0,78.0,6.0,1.0,-1234.0,123.0,1.0,-1.0,-123.0,-4.0,1.0,-4.0001,0.0,1.0
.code
lea esi,array
mov ebx,LENGTHOF array
test ebx,ebx
jz @end
@1: mov edi,1
xor ecx,ecx
mov edx,DWORD ptr [esi]
jmp @w1e
@w1: mov eax,DWORD ptr [esi+ecx*DWORD+DWORD]
test edx,eax
jns @3
cmp edx,eax
ja @2
jna @4
@3: cmp edx,eax
jle @2
@4: mov DWORD ptr [esi+ecx*DWORD],eax
mov DWORD ptr [esi+ecx*DWORD+DWORD],edx
mov eax,edx
lea edi,[ecx+1]
@2: mov edx,eax
add ecx,1
@w1e: lea eax,[ebx-1]
cmp ecx,eax
jb @w1
mov ebx,edi
cmp edi,1
ja @1
@end:
Thanks for the answer. So for my algorithm i can't do better? my code is based on comparisons, I put the 2 high elements of ymm0 in the low ymm1, then i put the max in ymm1 and min in ymm0 the problem is to put ymm0 elements in ymm1 because with SSE I have the movhlps that doesn't work in the same way in AVX so I have to use vshufps that is heavier.
Can u help me about this problem?
Thanks for the code and help :t
why do you want to use YMM registers? You might upload the code here with English comments and, if possible, with a test bed - this increase the probability that more people will reply.
BTW: did you test the above code?
Hi frankzappa,
first things first: Welcome to the forum. And yes, qWord is right: Make a test bed for your code and a lot of people here will have a look at it.
Gunther
I'm sure you can convert this to floating point
This also certainly has some AVX potential but only considering that there will be many sequential dwords on correct positions.
mov rsi, _buff3 ; 16byte aligned buffer
mov ecx, (_buff3_len*4) ; length in bytes
and ecx, not 15
jz error
sub ecx, 16
jb error
cmp ecx, 16 ; min 32byte buffer
jb error
xor edx, edx
mov ebx, 16
mov ebp, ecx
align 16
.one_pass:
movdqa xmm0, [rsi] ; 3 2 1 0
pshufd xmm1, xmm0, 111001b ; _ 3 2 1
pinsrd xmm1, [rsi+16], 3 ; 4 3 2 1
movdqa xmm2, xmm0
pcmpgtd xmm0, xmm1 ; -1 if any of theese true: 0>1 1>2 2>3 3>4
pmovmskb eax, xmm0
test ax, ax
jz @f ; we like 0, not -1
; sort from high byte in memory to lowest
pshufd xmm6, xmm1, 3
pshufd xmm4, xmm2, 3
pshufd xmm3, xmm2, 2
movdqa xmm5, xmm6
pmaxud xmm6, xmm4
pminud xmm4, xmm5
movdqa xmm5, xmm3
movd [rsi+16], xmm6
pminud xmm3, xmm4
pmaxud xmm5, xmm4
movdqa xmm4, xmm1
movd [rsi+12], xmm5
pmaxud xmm4, xmm3
pminud xmm1, xmm3
movdqa xmm3, xmm2
movd [rsi+8], xmm4
pminud xmm2, xmm1
pmaxud xmm3, xmm1
movd [rsi], xmm2
movd [rsi+4], xmm3
xor ebx, ebx
@@:
add rsi, 16
add edx, ebx ; number bytes that are on their places
sub ecx, 16
jnz .one_pass
; last 16byte chunk in the array:
movdqa xmm0, [rsi]
pshufd xmm1, xmm0, 111001b
movdqa xmm2, xmm0
pcmpgtd xmm0, xmm1
pmovmskb eax, xmm0
test eax, 4095
jz @f
pshufd xmm3, xmm2, 2
pshufd xmm4, xmm2, 3
movdqa xmm5, xmm3
pminud xmm3, xmm4
pmaxud xmm5, xmm4
movdqa xmm4, xmm1
movd [rsi+12], xmm5
pmaxud xmm4, xmm3
pminud xmm1, xmm3
movdqa xmm3, xmm2
movd [rsi+8], xmm4
pminud xmm2, xmm1
pmaxud xmm3, xmm1
movd [rsi], xmm2
movd [rsi+4], xmm3
xor ebx, ebx
@@:
add edx, ebx
xor eax, eax
sub edx, 16
cmovc edx, eax
mov ecx, ebp
cmp edx, ebp
jz .done
lea rsi, [_buff3+rdx]
sub ecx, edx
mov ebx, 16
jmp .one_pass
.done:
[/code[