News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

Bubble Sort in SSE3 and AVX

Started by frankzappa, July 03, 2013, 06:47:06 PM

Previous topic - Next topic

frankzappa

For a university project i made the algorithm of the bubble sort in SSE3 and AVX. So the version SSE3 is not too bad, 23 seconds for sort 100000 numbers. The AVX version is so slow, 32 seconds for 100000 numbers. For me is important to improve the avx algorithm. In pastebin there are my code:  SSE3  http://pastebin.com/EimcJdQg  and AVX http://pastebin.com/bmQtNKrq. I have an answer opened to StackOverflow where there are many details for any help http://stackoverflow.com/questions/17407414/avx-and-bubble-sort/.

If you want more details i'm here. Thanks

qWord

I'm afraid that bubble sort can't benefit from SIMD instructions. Also, AFAICS the AVX instructions does only increase the instructions size.
You might try this plain x86 variation for compare:
.data
        array REAL4 -0.0,12.0E8,-0.1,1.0,54.0E-3,2.1,0.1,-123.0,-0.0001,123456789.0,78.0,6.0,1.0,-1234.0,123.0,1.0,-1.0,-123.0,-4.0,1.0,-4.0001,0.0,1.0
.code

        lea esi,array
        mov ebx,LENGTHOF array
       
        test ebx,ebx
        jz @end
@1:     mov edi,1
        xor ecx,ecx
        mov edx,DWORD ptr [esi]
        jmp @w1e
@w1:    mov eax,DWORD ptr [esi+ecx*DWORD+DWORD]
                test edx,eax
                jns @3
                cmp edx,eax
                ja @2
                jna @4
        @3:     cmp edx,eax
                jle @2
        @4:             mov DWORD ptr [esi+ecx*DWORD],eax
                        mov DWORD ptr [esi+ecx*DWORD+DWORD],edx
                        mov eax,edx
                        lea edi,[ecx+1]
        @2:     mov edx,eax
                add ecx,1
@w1e:   lea eax,[ebx-1]
                cmp ecx,eax
                jb @w1
        mov ebx,edi
        cmp edi,1
        ja @1
@end:
MREAL macros - when you need floating point arithmetic while assembling!

frankzappa

Thanks for the answer. So for my algorithm i can't do better?  my code is based on comparisons, I put the 2 high elements of ymm0 in the low ymm1, then i put the max in ymm1 and min in ymm0 the problem is to put ymm0 elements in ymm1 because with SSE I have the movhlps that doesn't work in the same way in AVX so I have to use vshufps that is heavier.
Can u help me about this problem?

Thanks for the code and help  :t


qWord

why do you want to use YMM registers? You might upload the code here with English comments and, if possible, with a test bed - this increase the probability that more people will reply.

BTW: did you test the above code?
MREAL macros - when you need floating point arithmetic while assembling!

Gunther

Hi frankzappa,

first things first: Welcome to the forum. And yes, qWord is right: Make a test bed for your code and a lot of people here will have a look at it.

Gunther
You have to know the facts before you can distort them.

hool

I'm sure you can convert this to floating point
This also certainly has some AVX potential but only considering that there will be many sequential dwords on correct positions.
        mov     rsi, _buff3                     ; 16byte aligned buffer
        mov     ecx, (_buff3_len*4)             ; length in bytes
        and     ecx, not 15
        jz      error
        sub     ecx, 16
        jb      error
        cmp     ecx, 16                         ; min 32byte buffer
        jb      error

        xor     edx, edx
        mov     ebx, 16
        mov     ebp, ecx

        align 16
.one_pass:
        movdqa  xmm0, [rsi]                     ;  3 2 1 0
        pshufd  xmm1, xmm0, 111001b             ;  _ 3 2 1
        pinsrd  xmm1, [rsi+16],  3              ;  4 3 2 1
        movdqa  xmm2, xmm0
        pcmpgtd xmm0, xmm1                      ; -1 if any of theese true:   0>1   1>2   2>3   3>4
        pmovmskb eax, xmm0
        test    ax, ax
        jz      @f                              ; we like 0, not -1

        ; sort from high byte in memory to lowest

        pshufd  xmm6, xmm1, 3
        pshufd  xmm4, xmm2, 3
        pshufd  xmm3, xmm2, 2
        movdqa  xmm5, xmm6

        pmaxud  xmm6, xmm4
        pminud  xmm4, xmm5
        movdqa  xmm5, xmm3
        movd    [rsi+16], xmm6

        pminud  xmm3, xmm4
        pmaxud  xmm5, xmm4
        movdqa  xmm4, xmm1
        movd    [rsi+12], xmm5

        pmaxud  xmm4, xmm3
        pminud  xmm1, xmm3
        movdqa  xmm3, xmm2
        movd    [rsi+8], xmm4

        pminud  xmm2, xmm1
        pmaxud  xmm3, xmm1
        movd    [rsi], xmm2
        movd    [rsi+4], xmm3
        xor     ebx, ebx
@@:
        add     rsi, 16
        add     edx, ebx                        ; number bytes that are on their places
        sub     ecx, 16
        jnz     .one_pass


        ; last 16byte chunk in the array:
        movdqa  xmm0, [rsi]
        pshufd  xmm1, xmm0, 111001b
        movdqa  xmm2, xmm0
        pcmpgtd xmm0, xmm1
        pmovmskb eax, xmm0
        test    eax, 4095
        jz      @f
        pshufd  xmm3, xmm2, 2
        pshufd  xmm4, xmm2, 3
        movdqa  xmm5, xmm3
        pminud  xmm3, xmm4
        pmaxud  xmm5, xmm4
        movdqa  xmm4, xmm1
        movd    [rsi+12], xmm5
        pmaxud  xmm4, xmm3
        pminud  xmm1, xmm3
        movdqa  xmm3, xmm2
        movd    [rsi+8], xmm4
        pminud  xmm2, xmm1
        pmaxud  xmm3, xmm1
        movd    [rsi], xmm2
        movd    [rsi+4], xmm3
        xor     ebx, ebx
@@:
        add     edx, ebx

        xor     eax, eax
        sub     edx, 16
        cmovc   edx, eax

        mov     ecx, ebp
        cmp     edx, ebp
        jz      .done

        lea     rsi, [_buff3+rdx]
        sub     ecx, edx
        mov     ebx, 16
        jmp     .one_pass
.done:
[/code[