News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

Simplifying this Short MASM

Started by alfrqan, March 29, 2013, 07:49:58 PM

Previous topic - Next topic

jj2007

Quote from: alfrqan on March 30, 2013, 07:38:47 AM
Exactly true carry is most important

You might like the Optimizing Crypto++ library paper. Inside the page, search for paddq

And there is an interesting thread on "SSE2 replacement of ALU registers for 384 bit unsigned integers" here.

See also edits to my first reply above.

qWord

In my experience SIMD is unpractical for arbitrary precision integers, because the dependency chain can't be break and a lot of more operations are need to get the carry.
I've serious doubts that he will get any speed gain with SIMD.
MREAL macros - when you need floating point arithmetic while assembling!

dedndave

Quote from: alfrqan on March 30, 2013, 07:33:43 AM
Quote from: dedndave on March 30, 2013, 02:19:49 AM
i thought i saw some that did
even so, i would think you could load 64 bits (high 64 bits set to 0) and add it as a 128 bit value, move to the next 64
i will leave that for those who know SSE well   :P

this should be a reasonably fast 512-bit add...
_asm {
    push    ebx
    push    esi
    push    edi
    mov     edx,summand
    mov     ecx,this

    mov     eax,[edx]
    mov     edi,[edx+4]
    mov     esi,[edx+8]
    mov     ebx,[edx+12]
    add     [ecx],eax
    adc     [ecx+4],edi
    adc     [ecx+8],esi
    adc     [ecx+12],ebx

    mov     eax,[edx+16]
    mov     edi,[edx+20]
    mov     esi,[edx+24]
    mov     ebx,[edx+28]
    adc     [ecx+16],eax
    adc     [ecx+20],edi
    adc     [ecx+24],esi
    adc     [ecx+28],ebx

    mov     eax,[edx+32]
    mov     edi,[edx+36]
    mov     esi,[edx+40]
    mov     ebx,[edx+44]
    adc     [ecx+32],eax
    adc     [ecx+36],edi
    adc     [ecx+40],esi
    adc     [ecx+44],ebx

    mov     eax,[edx+48]
    mov     edi,[edx+52]
    mov     esi,[edx+56]
    mov     ebx,[edx+60]
    adc     [ecx+48],eax
    adc     [ecx+52],edi
    adc     [ecx+56],esi
    adc     [ecx+60],ebx

    pop     edi
    pop     esi
    pop     ebx
}

well this is very slow:P
it took nearly 2543 ms
Really appreciating your answers:D
Still waiting for best solution

that's odd, because on my old slow P4 prescott, i measure 155 clock cycles per 512-bit add
if i calculate correctly, that's about 51.667 nanoseconds on a 3 gHz machine

you do realize - that's a 512-bit add, where your original code was a 128-bit add, right ?
it will require that you modify your surrounding code

see the attachment and try it on your machine   :P

prescott w/htt
155 154 155 155 155
Press any key to continue ...


still, i think you can beat that with MMX/SSE/SSE2 code and a little ingenuity   :P

alfrqan

Well mate,i dont know if SSE or MMX works in inline assembler in visual studio
and i dont know what to do for that
how i can do my masm code in other way which may perform faster in inline assembler in visual studio?

thanks all for your trying to help
waiting your responses


dedndave

sure - you can use SSE inline
although, i am not sure about the syntax - i am not a "C" type of guy - lol

this particular page applies to VS2005 C++, but i would guess other versions support similar syntax
http://msdn.microsoft.com/en-us/library/kcwz153a%28v=vs.80%29.aspx

if i were in your shoes, i would try to speed up a larger piece of the function you are trying to perform
in other words, put the entire function in asm code, either as a linked OBJ or as inline asm
you are giving us only a small part of the loop, so we cannot see the big picture

as Jochen and others have mentioned, the inline asm is not the slow part
but, we can't give you direction when we don't know what the overall function is

even if it is written in C, give us a clue   :P

alfrqan

#20
well..still but i will post later

alfrqan

well i am doing these instruction multiplied with 100 millions so 18*100millions
so there are nearly 180 million instructions  so it takes 700 ms
I need to minimize them if possible

dedndave

let us see the outer loop
best results are likely if you do both the inner and outer loops in asm
in fact, it may well be that a linked OBJ is in order, rather than inline code

jj2007

Quote from: alfrqan on March 31, 2013, 05:02:16 AM
so there are nearly 180 million instructions  so it takes 700 ms

Quote from: jj2007 on March 30, 2013, 12:07:29 AM2. or you talking not about "every addition" but rather about "every ten Million additions".

So my guess was close :biggrin:

The usual procedure here is to set up a testbed for timing three, four alternatives.
- paddq?
- parallelising is possible?
- jmps instead of adc?
- 64-bit code?

alfrqan

    compiling this code
    _asm {
        mov     ecx,this
        mov     edx,summand
   
        mov     eax,[ecx]
        add     eax,[edx]
        mov     [ecx],eax
   
        mov     eax,[ecx+4]
        adc     eax,[edx+4]
        mov     [ecx+4],eax
   
        mov     eax,[ecx+8]
        adc     eax,[edx+8]
        mov     [ecx+8],eax
   
        mov     eax,[ecx+12]
        adc     eax,[edx+12]
        mov     [ecx+12],eax
    }

gives this result
// ASM VERSION after cmpiled///

    00EF1A00  push        ebp 
    00EF1A01  mov         ebp,esp 
    00EF1A03  push        ecx 
    00EF1A04  mov         dword ptr [ebp-4],ecx 

    00EF1A07  mov         ecx,dword ptr [ebp-4] 
    00EF1A0A  mov         edx,dword ptr [summand] 
   
    00EF1A0D  mov         eax,dword ptr [ecx] 
    00EF1A0F  add         eax,dword ptr [edx] 
   00EF1A11  mov         dword ptr [ecx],eax 
   
    00EF1A13  mov         eax,dword ptr [ecx+4] 
    00EF1A16  adc         eax,dword ptr [edx+4] 
    00EF1A19  mov         dword ptr [ecx+4],eax 
   
    00EF1A1C  mov         eax,dword ptr [ecx+8] 
    00EF1A1F  adc         eax,dword ptr [edx+8] 
    00EF1A22  mov         dword ptr [ecx+8],eax 
   
    00EF1A25  mov         eax,dword ptr [ecx+0Ch] 
    00EF1A28  adc         eax,dword ptr [edx+0Ch] 
    00EF1A2B  mov         dword ptr [ecx+0Ch],eax 

   
   

alfrqan

   

this what c++ code does after compiled for a += b
a += b;
    010319B4  mov         eax,dword ptr [esp+40h] 
    010319B8  mov         edx,dword ptr [esp+44h] 
    010319BC  add         eax,ecx 
    010319BE  cmp         eax,ecx 
    010319C0  sbb         ecx,ecx 
    010319C2  neg         ecx 
    010319C4  add         edx,edi 
    010319C6  add         edx,ecx 
    010319C8  mov         dword ptr [esp+10h],eax 
    010319CC  test        ecx,ecx 
    a += b;
    010319CE  je          main+87h (010319D7h) 
    010319D0  cmp         edi,edx 
    010319D2  sbb         eax,eax 
    010319D4  inc         eax 
    010319D5  jmp         main+8Dh (010319DDh) 
    010319D7  cmp         edx,edi 
    010319D9  sbb         eax,eax 
    010319DB  neg         eax 
    010319DD  mov         ecx,dword ptr [esp+48h] 
    010319E1  add         ecx,esi 
    010319E3  add         ecx,eax 
    010319E5  mov         edi,edx 
    010319E7  test        eax,eax 
    010319E9  je          main+0A2h (010319F2h) 
    010319EB  cmp         esi,ecx 
    010319ED  sbb         eax,eax 
    010319EF  inc         eax 
    010319F0  jmp         main+0A8h (010319F8h) 
    010319F2  cmp         ecx,esi 
    010319F4  sbb         eax,eax 
    010319F6  neg         eax 
    010319F8  add         eax,dword ptr [esp+4Ch] 
    010319FC  mov         esi,ecx 
    010319FE  add         dword ptr [esp+8],eax