reason to switch to 64 Bit Assembler

habran · February 10, 2013, 08:03:46 PM

here are three examples which will show you why
the first is the 64-bit beauty:


xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
  mov r9,rcx
  .if (rcx != rdx)
	  .for (¦r8¦al=[rdx],[rcx]=al,rcx++,rdx++,r8--) 
    .endfor
  .endif
  mov rax,r9
  ret
xmemcpy ENDP
compiled to this:
000000014004DA78  push        rbp  
000000014004DA79  mov         rbp,rsp 
000000014004DA7C  mov         r9,rcx 
000000014004DA7F  cmp         rcx,rdx 
000000014004DA82  je          xmemcpy+22h (14004DA9Ah) 
000000014004DA84  jmp         xmemcpy+1Bh (14004DA93h) 
000000014004DA86  mov         al,byte ptr [rdx] 
000000014004DA88  mov         byte ptr [rcx],al 
000000014004DA8A  inc         rcx  
000000014004DA8D  inc         rdx  
000000014004DA90  dec         r8   
000000014004DA93  and         r8,r8 
000000014004DA96  je          xmemcpy+22h (14004DA9Ah) 
000000014004DA98  jmp         xmemcpy+0Eh (14004DA86h) 
000000014004DA9A  mov         rax,r9 
000000014004DA9D  leave            
000000014004DA9E  ret

the second is the same routine but 32 bit:

Code Select


xmemcpy PROC USES ebx dest:DORD,src:DWORD,count:DWORD
  mov ecx,dest
  .if (ecx != src)
	  .for (edx=src,ebx=count¦ebx¦al=[edx],[ecx]=al,ecx++,edx++,ebx--) 
    .endfor
  .endif
  mov eax,dest
  ret
xmemcpy ENDP
compiled to this:
00401020 55              push    ebp
00401021 8bec            mov     ebp,esp
00401023 53              push    ebx
00401024 8b4d08          mov     ecx,dword ptr [ebp+8]
00401027 3b4d0c          cmp     ecx,dword ptr [ebp+0Ch]
0040102a 7415            je      xmemcpy+0x21 (00401041)
0040102c 8b550c          mov     edx,dword ptr [ebp+0Ch]
0040102f 8b5d10          mov     ebx,dword ptr [ebp+10h]
00401032 eb07            jmp     xmemcpy+0x1b (0040103b)
00401034 8a02            mov     al,byte ptr [edx]
00401036 8801            mov     byte ptr [ecx],al
00401038 41              inc     ecx
00401039 42              inc     edx
0040103a 4b              dec     ebx
0040103b 23db            and     ebx,ebx
0040103d 7402            je      xmemcpy+0x21 (00401041)
0040103f ebf3            jmp     xmemcpy+0x14 (00401034)
00401041 8b4508          mov     eax,dword ptr [ebp+8]
00401044 5b              pop     ebx
00401045 5d              pop     ebp
00401046 c20c00          ret     0Ch

and here is the C version:

Code Select


void* xmemcpy(void *dest, const void *src, int count)
{
  unsigned char *byte_dest=(unsigned char *)dest;
  unsigned char *byte_src=(unsigned char *)src;

  if (byte_dest != byte_src)
  {
    if (count)
    {
      for (;;)
      {
        *byte_dest=*byte_src;
        if (!--count) break;
        ++byte_dest;
        ++byte_src;
      }
    }
  }
  return dest;
}
compiled to this:
01271AF0  push        ebp  
01271AF1  mov         ebp,esp  
01271AF3  sub         esp,0D8h  
01271AF9  push        ebx  
01271AFA  push        esi  
01271AFB  push        edi  
01271AFC  lea         edi,[ebp-0D8h]  
01271B02  mov         ecx,36h  
01271B07  mov         eax,0CCCCCCCCh  
01271B0C  rep stos    dword ptr es:[edi]  
01271B0E  mov         eax,dword ptr [dest]  
01271B11  mov         dword ptr [byte_dest],eax  
01271B14  mov         eax,dword ptr [src]  
01271B17  mov         dword ptr [byte_src],eax  
01271B1A  mov         eax,dword ptr [byte_dest]  
01271B1D  cmp         eax,dword ptr [byte_src]  
01271B20  je          xmemcpy+63h (01271B53h)  
01271B22  cmp         dword ptr [count],0  
01271B26  je          xmemcpy+63h (01271B53h)  
01271B28  mov         eax,dword ptr [byte_dest]  
01271B2B  mov         ecx,dword ptr [byte_src]  
01271B2E  mov         dl,byte ptr [ecx]  
01271B30  mov         byte ptr [eax],dl  
01271B32  mov         eax,dword ptr [count]  
01271B35  sub         eax,1  
01271B38  mov         dword ptr [count],eax  
01271B3B  jne         xmemcpy+4Fh (01271B3Fh)  
01271B3D  jmp         xmemcpy+63h (01271B53h)  
01271B3F  mov         eax,dword ptr [byte_dest]  
01271B42  add         eax,1  
01271B45  mov         dword ptr [byte_dest],eax  
01271B48  mov         eax,dword ptr [byte_src]  
01271B4B  add         eax,1  
01271B4E  mov         dword ptr [byte_src],eax  
01271B51  jmp         xmemcpy+38h (01271B28h)  
01271B53  mov         eax,dword ptr [dest]  
01271B56  pop         edi  
01271B57  pop         esi  
01271B58  pop         ebx  
01271B59  mov         esp,ebp  
01271B5B  pop         ebp  
01271B5C  ret

Do you need more reasons? ;)

MichaelW · February 10, 2013, 08:13:54 PM

Am I missing something here, it looks like the 64-bit code is moving a byte at a time. Should not most of the move be done 64-bits at a time?

Vortex · February 10, 2013, 08:16:20 PM

Hi Habran,

Did you test the same C code with a 64-bit C compiler?

jj2007 · February 10, 2013, 09:00:23 PM

Timings would be nice

habran · February 10, 2013, 09:04:18 PM

good point Vortex :t
here is the 64-bit version in C:

Code Select


void* xmemcpy(void *dest, const void *src, UINT_PTR count)
{
  unsigned char *byte_dest=(unsigned char *)dest;
  unsigned char *byte_src=(unsigned char *)src;

  if (byte_dest != byte_src)
  {
    if (count)
    {
      for (;;)
      {
        *byte_dest=*byte_src;
        if (!--count) break;
        ++byte_dest;
        ++byte_src;
      }
    }
  }
  return dest;
}

0000000140063630  mov         qword ptr [rsp+18h],r8 
0000000140063635  mov         qword ptr [rsp+10h],rdx 
000000014006363A  mov         qword ptr [rsp+8],rcx 
000000014006363F  sub         rsp,18h 
0000000140063643  mov         rax,qword ptr [dest] 
0000000140063648  mov         qword ptr [byte_dest],rax 
000000014006364D  mov         rax,qword ptr [src] 
0000000140063652  mov         qword ptr [rsp],rax 
0000000140063656  mov         rax,qword ptr [rsp] 
000000014006365A  cmp         qword ptr [byte_dest],rax 
000000014006365F  je          xmemcpy+7Bh (1400636ABh) 
0000000140063661  cmp         qword ptr [count],0 
0000000140063667  je          xmemcpy+7Bh (1400636ABh) 
0000000140063669  mov         rax,qword ptr [byte_dest] 
000000014006366E  mov         rcx,qword ptr [rsp] 
0000000140063672  movzx       ecx,byte ptr [rcx] 
0000000140063675  mov         byte ptr [rax],cl 
0000000140063677  mov         rax,qword ptr [count] 
000000014006367C  sub         rax,1 
0000000140063680  mov         qword ptr [count],rax 
0000000140063685  cmp         qword ptr [count],0 
000000014006368B  jne         xmemcpy+5Fh (14006368Fh) 
000000014006368D  jmp         xmemcpy+7Bh (1400636ABh) 
000000014006368F  mov         rax,qword ptr [byte_dest] 
0000000140063694  add         rax,1 
0000000140063698  mov         qword ptr [byte_dest],rax 
000000014006369D  mov         rax,qword ptr [rsp] 
00000001400636A1  add         rax,1 
00000001400636A5  mov         qword ptr [rsp],rax 
00000001400636A9  jmp         xmemcpy+39h (140063669h) 
00000001400636AB  mov         rax,qword ptr [dest] 
00000001400636B0  add         rsp,18h 
00000001400636B4  ret

habran · February 10, 2013, 09:06:37 PM

MichaelW,
NO :icon_exclaim:

habran · February 10, 2013, 09:26:42 PM

JJ2007,

QuoteTimings would be nice

I agree, but I have to CONFESS that I don't know that part :icon_mrgreen:
Can you please do it for me?

habran · February 10, 2013, 10:05:27 PM

MichaelW,
if the data is aligned to 8, 16 or 32 bytes
then it is possible to do that, as in this example:

Code Select


align 8 
AXCHARINDEX	struct 
  nLine	      SDWORD	?
  lpLine	    INT_PTR	?
  nCharInLine	SDWORD	?
AXCHARINDEX	ends

 .code
 
    lea  rdi,ciPoint                ;points to first index
    lea  rsi,ciPoint1               ;points to second index
    mov  ecx,sizeof(AXCHARINDEX)/8
    rep  movsq

otherwise, if they are chars for example, it is not always convenient to do that
however, we can write a more complex routine, e.g. xxxmemcpy, which would calculate the size of the data
and then first transfer all possible QWORDs, then a remaining DWORD if any, then a remaining WORD if any, and finally a remaining BYTE if any
e.g. a data size of 256+7 bytes equals 32 QWORDs, 1 DWORD, 1 WORD and 1 BYTE

in my case I used simple xmemcpy because of simplicity

have a look at the code above
It takes about the same number of bytes as a call to some function would, but it sits right there at the present location
I assure you that it is faster and more appropriate than some sophisticated function, especially if that function is not in the cache at the time
It would be even more appropriate to create a MACRO to do the same job

Gunther · February 10, 2013, 10:37:52 PM

Hi habran,

there's no doubt that the 64 bit world is the future. But for the next years both - 32 bit and 64 bit - will coexist. I'm inside the 64 bit world since a few years; the first Linux kernel came out 2001; Windows was some years later. Under 32 bit the Application Binary Interface (ABI) are the same. So, one could write code for both platforms. That's over, because the 64 bit ABIs are very different. We can use code for both platforms only in rare cases.

All things considered: there are advantages and disadvantages.

Gunther

habran · February 10, 2013, 11:12:33 PM

hey Gunther,

QuoteBut for the next years both - 32 bit and 64 bit - will coexist.

no doubt they will, because of the accumulated 32-bit apps
however, IMO to continue to write 32 bit programs would be like holding with your nails on the cliff
in my case I would forever like to program in assembly C64, I felt like I had a chocolate in my mouth when I did that
but who needs any more those apps :(
now, I have the same sensation as with C64 when I write 64 bit JWASM :icon_exclaim: :icon_exclaim: :icon_exclaim:
if it was not for excellent JWASM (thanks Japheth :t) I would maybe return to C and C#
it looks like 64 is my favorite number
I am a lazy person by nature and I always think (I am not lazy to think ;))
about a fast and simple way to finish anything (except sex :t)
I don't walk to shop 10 time to bring home the grocery, I use a car for that
you may say that walking is healthy but it is not if you have to carry bloody grocery in your hands

Gunther · February 10, 2013, 11:57:14 PM

Hi habran,

things are a bit more complicated.

The transition from 32 bit to 64 bit will take more time than you might think. We've seen in the past the transition from 16 bit to 32 bit (by the way, I've never discussed C64 programming in my posts). That process had a timeline of approximately 15 years. But: we had a lot of memory trouble under 16 bit, which wasn't easy (XMS, EMS, several DOS extenders etc). The pressure was enormous, because a lot of applications at the beginning of the 90s were very memory hungry.

That's not the case with the transition from 32 to 64 bit. There are a few applications which really need more than 4 GB RAM (large databases, for example), but others do not. So, we can reckon with at least 15 years for that transition. That's a long time; therefore it makes sense to write for both worlds.

I won't argue that you should write 32 bit code. Write your 64 bit applications and that's fine. But have a look at the difficulties: different ABIs for the main platforms, some people in our forum can't run 64 bit operating systems (hardware limitations), other people like 32 bit programming etc. etc. A bit more tolerance for other points of view wouldn't be bad.

Gunther

dedndave · February 10, 2013, 11:58:55 PM

many of us use MichaelW's code timing macros
http://masm32.com/board/index.php?topic=49.0

attached is a 32-bit program for timing code (assemble as a console app)
you may or may not want to adapt the code to 64-bit

habran · February 11, 2013, 12:04:33 AM

thanks dedndave :t
I will look at it tomorrow and see if it pays off to translate it to x64

dedndave · February 11, 2013, 12:11:47 AM

i don't know if Michael has plans to make a 64-bit version of his macro set
shouldn't be too hard :P

qWord · February 11, 2013, 12:20:14 AM

For an x64 adaptation of MichaelW's counter macro see my post in this thread: http://masm32.com/board/index.php?topic=49.msg130#msg130.

The MASM Forum

News:

reason to switch to 64 Bit Assembler

habran

MichaelW

Vortex

jj2007

habran

habran

habran

habran

Gunther

habran

Gunther

dedndave

habran

dedndave

qWord