Print Page - reason to switch to 64 Bit Assembler

Title: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 08:03:46 PM

here are three examples which will tell you why
first is 64 beauty:


; xmemcpy (64-bit) - copies bytes one at a time from the source to the
; destination and returns the original destination pointer in rax.
; The body never reads the named parameters; it works directly on rcx, rdx
; and r8 (presumably the register arguments dest, src and count of the
; Win64 calling convention - confirm against the convention in use).
; NOTE(review): the copy always runs from low to high addresses, so
; overlapping buffers with the destination above the source are not handled.
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
  ; keep the original destination so it can be returned after rcx is advanced
  mov r9,rcx
  ; nothing to copy when source and destination are the same address
  .if (rcx != rdx)
	  ; .for (init ¦ condition ¦ step): no init; loop while r8 is non-zero.
	  ; The step list does all the work: move one byte through al, advance
	  ; both pointers, decrement the remaining count. The loop body is empty.
	  .for (¦r8¦al=[rdx],[rcx]=al,rcx++,rdx++,r8--) 
    .endfor
  .endif
  ; return value: the destination pointer as it was on entry
  mov rax,r9
  ret
xmemcpy ENDP
compiled to this:
000000014004DA78  push        rbp  
000000014004DA79  mov         rbp,rsp 
000000014004DA7C  mov         r9,rcx 
000000014004DA7F  cmp         rcx,rdx 
000000014004DA82  je          xmemcpy+22h (14004DA9Ah) 
000000014004DA84  jmp         xmemcpy+1Bh (14004DA93h) 
000000014004DA86  mov         al,byte ptr [rdx] 
000000014004DA88  mov         byte ptr [rcx],al 
000000014004DA8A  inc         rcx  
000000014004DA8D  inc         rdx  
000000014004DA90  dec         r8   
000000014004DA93  and         r8,r8 
000000014004DA96  je          xmemcpy+22h (14004DA9Ah) 
000000014004DA98  jmp         xmemcpy+0Eh (14004DA86h) 
000000014004DA9A  mov         rax,r9 
000000014004DA9D  leave            
000000014004DA9E  ret

the second is the same routine but 32 bit:

Code Select


; xmemcpy (32-bit) - copies count bytes one at a time from src to dest and
; returns dest in eax. ebx is used as the byte counter and is preserved
; through the USES clause.
; NOTE(review): the copy always runs from low to high addresses, so
; overlapping buffers with dest above src are not handled.
xmemcpy PROC USES ebx dest:DWORD,src:DWORD,count:DWORD
  ; ecx walks the destination
  mov ecx,dest
  ; nothing to copy when source and destination are the same address
  .if (ecx != src)
	  ; .for (init ¦ condition ¦ step): edx walks the source and ebx holds the
	  ; remaining count; loop while ebx is non-zero. The step list moves one
	  ; byte through al, advances both pointers and decrements the count.
	  ; The loop body is empty.
	  .for (edx=src,ebx=count¦ebx¦al=[edx],[ecx]=al,ecx++,edx++,ebx--) 
    .endfor
  .endif
  ; return value: the destination pointer, reloaded because ecx was advanced
  mov eax,dest
  ret
xmemcpy ENDP
compiled to this:
00401020 55              push    ebp
00401021 8bec            mov     ebp,esp
00401023 53              push    ebx
00401024 8b4d08          mov     ecx,dword ptr [ebp+8]
00401027 3b4d0c          cmp     ecx,dword ptr [ebp+0Ch]
0040102a 7415            je      xmemcpy+0x21 (00401041)
0040102c 8b550c          mov     edx,dword ptr [ebp+0Ch]
0040102f 8b5d10          mov     ebx,dword ptr [ebp+10h]
00401032 eb07            jmp     xmemcpy+0x1b (0040103b)
00401034 8a02            mov     al,byte ptr [edx]
00401036 8801            mov     byte ptr [ecx],al
00401038 41              inc     ecx
00401039 42              inc     edx
0040103a 4b              dec     ebx
0040103b 23db            and     ebx,ebx
0040103d 7402            je      xmemcpy+0x21 (00401041)
0040103f ebf3            jmp     xmemcpy+0x14 (00401034)
00401041 8b4508          mov     eax,dword ptr [ebp+8]
00401044 5b              pop     ebx
00401045 5d              pop     ebp
00401046 c20c00          ret     0Ch

and here is the C version:

Code Select


/*
 * xmemcpy - copy `count` bytes from `src` to `dest`, one byte at a time.
 *
 * dest   destination buffer; must have room for `count` bytes
 * src    source buffer; must hold at least `count` readable bytes
 * count  number of bytes to copy; zero or a negative value copies nothing
 *
 * Returns `dest`.
 *
 * The copy runs from low to high addresses, so overlapping regions where
 * `dest` lies above `src` are not handled (use memmove for those).
 */
void* xmemcpy(void *dest, const void *src, int count)
{
  unsigned char *byte_dest=(unsigned char *)dest;
  const unsigned char *byte_src=(const unsigned char *)src;

  /* Copying a buffer onto itself is a no-op. */
  if (byte_dest == byte_src)
  {
    return dest;
  }

  /*
   * Testing `count > 0` instead of `count != 0` keeps a negative count from
   * counting down through INT_MIN (signed overflow) while overrunning both
   * buffers.
   */
  while (count > 0)
  {
    *byte_dest++=*byte_src++;
    --count;
  }
  return dest;
}
compiled to this:
01271AF0  push        ebp  
01271AF1  mov         ebp,esp  
01271AF3  sub         esp,0D8h  
01271AF9  push        ebx  
01271AFA  push        esi  
01271AFB  push        edi  
01271AFC  lea         edi,[ebp-0D8h]  
01271B02  mov         ecx,36h  
01271B07  mov         eax,0CCCCCCCCh  
01271B0C  rep stos    dword ptr es:[edi]  
01271B0E  mov         eax,dword ptr [dest]  
01271B11  mov         dword ptr [byte_dest],eax  
01271B14  mov         eax,dword ptr [src]  
01271B17  mov         dword ptr [byte_src],eax  
01271B1A  mov         eax,dword ptr [byte_dest]  
01271B1D  cmp         eax,dword ptr [byte_src]  
01271B20  je          xmemcpy+63h (01271B53h)  
01271B22  cmp         dword ptr [count],0  
01271B26  je          xmemcpy+63h (01271B53h)  
01271B28  mov         eax,dword ptr [byte_dest]  
01271B2B  mov         ecx,dword ptr [byte_src]  
01271B2E  mov         dl,byte ptr [ecx]  
01271B30  mov         byte ptr [eax],dl  
01271B32  mov         eax,dword ptr [count]  
01271B35  sub         eax,1  
01271B38  mov         dword ptr [count],eax  
01271B3B  jne         xmemcpy+4Fh (01271B3Fh)  
01271B3D  jmp         xmemcpy+63h (01271B53h)  
01271B3F  mov         eax,dword ptr [byte_dest]  
01271B42  add         eax,1  
01271B45  mov         dword ptr [byte_dest],eax  
01271B48  mov         eax,dword ptr [byte_src]  
01271B4B  add         eax,1  
01271B4E  mov         dword ptr [byte_src],eax  
01271B51  jmp         xmemcpy+38h (01271B28h)  
01271B53  mov         eax,dword ptr [dest]  
01271B56  pop         edi  
01271B57  pop         esi  
01271B58  pop         ebx  
01271B59  mov         esp,ebp  
01271B5B  pop         ebp  
01271B5C  ret

Do you need more reasons? ;)

Title: Re: reason to switch to 64 Bit Assembler
Post by: MichaelW on February 10, 2013, 08:13:54 PM

Am I missing something here, it looks like the 64-bit code is moving a byte at a time. Should not most of the move be done 64-bits at a time?

Title: Re: reason to switch to 64 Bit Assembler
Post by: Vortex on February 10, 2013, 08:16:20 PM

Hi Habran,

Did you test the same C code with a 64-bit C compiler?

Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 10, 2013, 09:00:23 PM

Timings would be nice :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 09:04:18 PM

good point Vortex :t
here it is the 64 bit in C:

Code Select


/*
 * xmemcpy - byte-wise copy of `count` bytes from `src` to `dest`.
 *
 * Returns `dest`. Nothing is copied when the two pointers are equal or when
 * `count` is zero. Bytes are copied in ascending address order.
 */
void* xmemcpy(void *dest, const void *src, UINT_PTR count)
{
  unsigned char *out=(unsigned char *)dest;
  unsigned char *in=(unsigned char *)src;

  if (out == in || !count)
    return dest;

  do
  {
    *out++=*in++;
  } while (--count);

  return dest;
}

0000000140063630  mov         qword ptr [rsp+18h],r8 
0000000140063635  mov         qword ptr [rsp+10h],rdx 
000000014006363A  mov         qword ptr [rsp+8],rcx 
000000014006363F  sub         rsp,18h 
0000000140063643  mov         rax,qword ptr [dest] 
0000000140063648  mov         qword ptr [byte_dest],rax 
000000014006364D  mov         rax,qword ptr [src] 
0000000140063652  mov         qword ptr [rsp],rax 
0000000140063656  mov         rax,qword ptr [rsp] 
000000014006365A  cmp         qword ptr [byte_dest],rax 
000000014006365F  je          xmemcpy+7Bh (1400636ABh) 
0000000140063661  cmp         qword ptr [count],0 
0000000140063667  je          xmemcpy+7Bh (1400636ABh) 
0000000140063669  mov         rax,qword ptr [byte_dest] 
000000014006366E  mov         rcx,qword ptr [rsp] 
0000000140063672  movzx       ecx,byte ptr [rcx] 
0000000140063675  mov         byte ptr [rax],cl 
0000000140063677  mov         rax,qword ptr [count] 
000000014006367C  sub         rax,1 
0000000140063680  mov         qword ptr [count],rax 
0000000140063685  cmp         qword ptr [count],0 
000000014006368B  jne         xmemcpy+5Fh (14006368Fh) 
000000014006368D  jmp         xmemcpy+7Bh (1400636ABh) 
000000014006368F  mov         rax,qword ptr [byte_dest] 
0000000140063694  add         rax,1 
0000000140063698  mov         qword ptr [byte_dest],rax 
000000014006369D  mov         rax,qword ptr [rsp] 
00000001400636A1  add         rax,1 
00000001400636A5  mov         qword ptr [rsp],rax 
00000001400636A9  jmp         xmemcpy+39h (140063669h) 
00000001400636AB  mov         rax,qword ptr [dest] 
00000001400636B0  add         rsp,18h 
00000001400636B4  ret

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 09:06:37 PM

MichaelW,
NO :icon_exclaim: :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 09:26:42 PM

JJ2007,

QuoteTimings would be nice :biggrin:

I agree, but I have to CONFESS that I don't know that part :icon_mrgreen:
Can you please do it for mee :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 10:05:27 PM

MichaelW,
if the data is aligned to 8, 16 or 32 bytes
then it is possible to do that, as in this example:

Code Select


; AXCHARINDEX - one character index record: two signed 32-bit fields and one
; pointer-sized field.
; NOTE(review): the snippet below copies sizeof(AXCHARINDEX)/8 qwords, so the
; structure size must be a multiple of 8. That holds for both a packed layout
; (16 bytes) and a naturally aligned one (24 bytes) - confirm which applies.
align 8 
AXCHARINDEX	struct 
  nLine	      SDWORD	?
  lpLine	    INT_PTR	?
  nCharInLine	SDWORD	?
AXCHARINDEX	ends

 .code
 
    ; Copy one AXCHARINDEX from ciPoint1 to ciPoint, eight bytes per step.
    ; rep movsq moves rcx qwords from [rsi] to [rdi]; writing ecx zero-extends
    ; into rcx. Assumes the direction flag is clear - TODO confirm at the
    ; place where this snippet is used.
    lea  rdi,ciPoint                ;destination: address of the first index
    lea  rsi,ciPoint1               ;source: address of the second index
    ; number of qwords to move (integer division, any remainder is dropped)
    mov  ecx,sizeof(AXCHARINDEX)/8
    rep  movsq

otherwise, if they are chars for example, it is not convenient to do that always
however, we can write a more complex routine, e.g. xxxmemcpy, which would calculate the size of the data
and then first transfer all possible QWORDs, then a remaining DWORD if there is one, then a remaining WORD if there is one, and finally a remaining BYTE if there is one
e.g. a data size of 256+7 bytes equals 32 QWORDs, 1 DWORD, 1 WORD and 1 BYTE

in my case I used simple xmemcpy because of simplicity

have a look at the code above
It takes about the same number of bytes as a call to some function, but it is right there at the present location
I assure you that it is faster and more appropriate than some sophisticated function, especially if that function is not in the cache at the time
It would be even more appropriate to create a MACRO to do the same job :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 10, 2013, 10:37:52 PM

Hi habran,

there's no doubt that the 64 bit world is the future. But for the next years both - 32 bit and 64 bit - will coexist. I'm inside the 64 bit world since a few years; the first Linux kernel came out 2001; Windows was some years later. Under 32 bit the Application Binary Interface (ABI) are the same. So, one could write code for both platforms. That's over, because the 64 bit ABIs are very different. We can use code for both platforms only in rare cases.

All things considered: there are advantages and disadvantages.

Gunther

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 11:12:33 PM

hey Gunther,

QuoteBut for the next years both - 32 bit and 64 bit - will coexist.

no doubt they will, because of the accumulated 32 bit apps
however, IMO to continue to write 32 bit programs would be like holding with your nails on the cliff
in my case I would forever like to program in assembly C64, I felt like I had a chocolate in my mouth when I did that
but who needs any more those apps :(
now, I have the same sensation as with C64 when I write 64 bit JWASM :icon_exclaim: :icon_exclaim: :icon_exclaim:
if it was not for excellent JWASM (thanks Japheth :t) I would maybe return to C and C#
it looks like 64 is my favorite number
I am a lazy person by nature and I always think (I am not lazy to think ;))
about a fast and simple way to finish anything (except sex :t)
I don't walk to shop 10 time to bring home the grocery, I use a car for that
you may say that walking is healthy but it is not if you have to carry bloody grocery in your hands :bgrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 10, 2013, 11:57:14 PM

Hi habran,

things are a bit more complicated.

The transition from 32 bit to 64 bit will take more time than you might think. We've seen in the past the transition from 16 bit to 32 bit (by the way, I've never discussed C64 programming in my posts). That process had a time line of approximately 15 years. But: we had a lot of memory trouble under 16 bit, which wasn't easy (XMS, EMS, several DOS extenders etc). The pressure was enormous, because a lot of applications at the beginning of the 90s were very memory hungry.

That's not the case with the transition from 32 to 64 bit. There are a few applications which really need more than 4 GB RAM (large data bases for example), but others do not. So, we can expect that transition to take at least 15 years. That's a long time; therefore it makes sense to write for both worlds.

I won't argue that you should write 32 bit code. Write your 64 bit applications and that's fine. But have a look at the difficulties: different ABIs for the main platforms, some people in our forum can't run 64 bit operating systems (hardware limitations), other people like 32 bit programming etc. etc. A bit more tolerance for other points of view wouldn't be bad.

Gunther

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 10, 2013, 11:58:55 PM

many of us use MichaelW's code timing macros
http://masm32.com/board/index.php?topic=49.0 (http://masm32.com/board/index.php?topic=49.0)

attached is a 32-bit program for timing code (assemble as a console app)
you may or may not want to adapt the code to 64-bit

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 12:04:33 AM

thanks dedndave :t
I will look at it tomorrow and see if it pays off to translate it to x64 :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 11, 2013, 12:11:47 AM

i don't know if Michael has plans to make a 64-bit version of his macro set
shouldn't be too hard :P

Title: Re: reason to switch to 64 Bit Assembler
Post by: qWord on February 11, 2013, 12:20:14 AM

For a x64 adaption of MichaelW's counter macro see my post in this thread: http://masm32.com/board/index.php?topic=49.msg130#msg130.

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 05:49:30 AM

thanks qWord :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: qWord on February 11, 2013, 06:23:36 AM

habran,
your comparison between C and ASM in your first post is unfair, because it is obviously a debug build.
Also, a smarter "algorithm" would probably produce much better results. e.g. something like this (not tested):

Code Select

/*
 * xmemcpy - copy `cb` bytes from `src` to `dest` in pointer-sized chunks.
 *
 * The bulk is moved one pointer-sized word at a time. On targets with
 * 8-byte pointers a leftover 4-byte piece is moved as one int, and whatever
 * remains is moved byte by byte. Returns `dest`.
 *
 * NOTE(review): the word and int copies go through casted pointers, so they
 * rely on the target tolerating unaligned, type-punned accesses - confirm
 * for the compilers and platforms in use.
 */
void* xmemcpy(void *dest, void *src, unsigned int cb)
{
	/* log2 of the pointer size: 3 for 8-byte pointers, otherwise 2 */
	const unsigned int word_shift = (sizeof(char*) == 8) ? 3 : 2;
	unsigned int words = cb >> word_shift;
	unsigned int tail = cb & (sizeof(char*) - 1);
	char** word_dst = (char**)dest;
	char** word_src = (char**)src;
	char* byte_dst;
	char* byte_src;

	/* whole pointer-sized words first */
	while (words != 0)
	{
		*word_dst++ = *word_src++;
		words--;
	}

	byte_dst = (char*)word_dst;
	byte_src = (char*)word_src;

	/* only reachable with 8-byte pointers: a 4..7 byte tail starts with one int */
	if (sizeof(char*) == 8 && (tail & 4) != 0)
	{
		*((int*)byte_dst) = *((int*)byte_src);
		byte_dst += 4;
		byte_src += 4;
		tail -= 4;
	}

	/* remaining bytes */
	while (tail != 0)
	{
		*byte_dst++ = *byte_src++;
		tail--;
	}

	return dest;
}

Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 11, 2013, 09:07:53 AM

Quote from: qWord on February 11, 2013, 06:23:36 AM
habran,
your compare between C and ASM in your first post is unfair, because is obviously a debug build.
Also, a smarter "algorithm" would probably produce much better results.

I wonder how efficient this code from the "64 beauty" example is (can't test it, unfortunately):

000000014004DA90 dec r8
000000014004DA93 and r8,r8 <<< no need for that, the flag is already set
000000014004DA96 je xmemcpy+22h (14004DA9Ah) <<< why not jne xmemcpy+0Eh?? static branch prediction rules would suggest that it is even faster...
000000014004DA98 jmp xmemcpy+0Eh (14004DA86h) <<< can be dropped entirely.
000000014004DA9A mov rax,r9

Again, timings would be nice ;-)

Title: Re: reason to switch to 64 Bit Assembler
Post by: frktons on February 11, 2013, 10:35:01 AM

For what I recall, a memcopy done with native 64 bit registers, in 64
bit systems, is the fastest solution found when we tested, a couple
of years ago, XMM/SSE2 code for this kind of operation.

The test was done on a 32 MB buffer that was simply blanked, not really a memcopy
but it was set just to measure the performance of REP STOSQ vs MOVNTDQ
and measured via rdtsc.
The results were like:

Quote
Clearing done
117,940,861 clocks for a 33,554,432 bytes buffer with using REP STOSQ

Clearing done
1,208,750,068 clocks for a 33,554,432 bytes buffer with using MOVNTDQ

Code from Alex.

I agree with habran, as I said at the time, for many reasons, but I also
understand why years of work are not easily dropped or rewritten. :t

Frank

Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 11, 2013, 10:51:18 AM

Frank,

Quote from: frktons on February 11, 2013, 10:35:01 AM
For what I recall, a memcopy done with native 64 bit registers, in 64
bit systems, is the fastest solution found when we tested, a couple
of years ago, XMM/SSE2 code for this kind of operation.

the situation has changed dramatically since the advent of Intel's AVX. We should do the test again.

Gunther

Title: Re: reason to switch to 64 Bit Assembler
Post by: frktons on February 11, 2013, 11:05:20 AM

Quote from: Gunther on February 11, 2013, 10:51:18 AM

the situation has changed dramatically since the advent of Intel's AVX. We should do the test again.

Gunther

I think a new test can only confirm that 64 bit mov operations are
faster than 32 bit ones. If anyone has a new processor, say habran, and
the skill to use AVX code, he could do it.
Not that difficult if he really likes to do the test, I can post the 64 bit MASM
code that I used 2 years ago. No AVX because neither Alex's, nor my PC are
AVX able.

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 11:35:59 AM

I have tested a speed of 64 bit and result is 1:4 against C
for one pass JWASM is 80 or 50h
and C is 207 intersting :P (JJ2007) 0CFh
JJ207 you are correct that more optimization could be done to it
however my intent in this case was not so much focused on that but on beauty and simplicity of 64 bit JWASM
I have used ".for" loop which is portable, readable and easy to use but it can not beat human eyes and brains

thank you Frank for supporting me that's what friends are for :t

qWord,

Quoteyour compare between C and ASM in your first post is unfair, because is obviously a debug build

all of them are debug built because I needed to read a code in memory :icon_eek:
your function looks good and I will test it later

Quote
however, we can write more complex EG: xxxmemcpy which would be able to calculate the size of data
and than first transfer all possible QWORDS and than if left last DWORD and than if left last WORD and than if left last BYTE
EG: data size is 256+7 EQU 32 QWORDS, 1 DWORD, 1 WORD, and 1 BYTE

I think I have seen already on internet written similar function but I can't remember was it in C or assembler
UNFAIR :icon_eek:
what is fair in this world??? life is a bitch!
these days even death is not fair any more, if you are rich you buy yourself brand new organs and live as long as you want :P

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 11:43:07 AM

hi Frank,
I can try to do that, though I have not learned AVX yet
I am ready for another challenge, I am not a chicken :lol:
I have to go now to earn my living, "I'll be back 8)"

Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 11, 2013, 11:48:15 AM

Quote from: habran on February 11, 2013, 11:35:59 AMintersting :P (JJ2007) 0CFh

What do you mean with that? ::)

Title: Re: reason to switch to 64 Bit Assembler
Post by: qWord on February 11, 2013, 12:53:56 PM

Quote from: habran on February 11, 2013, 11:35:59 AM
I have tested a speed of 64 bit and result is 1:4 against C

I can't confirm that: my own quick test shows that there is nearly no difference between your .for-loop and xmemcpy.

Code Select

Function:   xmemcpy    xmemcpy2   xmemcpy_Q  xmemcpy_Q2 memcpy     @ForLoop
 --- buffer size = 13 ---
align +0    29         17         4          2          8          33
align +1    29         17         5          3          10         32
align +2    29         17         7          3          13         32
align +3    29         17         5          2          11         33
align +4    29         17         4          2          11         32
align +5    29         17         5          3          11         32
align +6    29         17         4          3          11         32
align +7    29         17         5          2          11         32
align +8    29         17         4          2          11         32
align +9    29         17         5          3          11         32
align +10   29         18         4          3          11         32
align +11   29         17         5          2          11         32
align +12   29         17         4          2          11         32
align +13   29         17         5          3          11         32
align +14   29         17         4          4          11         32
align +15   29         17         5          3          11         32
 --- buffer size = 33 ---
align +0    93         46         10         10         10         77
align +1    73         46         10         7          10         76
align +2    77         46         10         7          10         76
align +3    92         46         10         7          10         77
align +4    73         46         10         7          10         76
align +5    82         46         10         7          10         76
align +6    73         46         10         7          10         77
align +7    91         47         10         7          10         76
align +8    73         47         11         7          13         76
align +9    73         47         10         7          10         91
align +10   73         46         10         7          10         76
align +11   73         47         10         7          11         77
align +12   76         47         10         7          10         76
align +13   86         47         10         7          10         76
align +14   74         47         10         7          10         76
align +15   84         47         18         8          10         76
 --- buffer size = 59 ---
align +0    124        97         21         14         17         134
align +1    152        98         22         15         17         135
align +2    129        98         23         15         17         134
align +3    129        102        22         15         17         135
align +4    129        98         22         14         17         137
align +5    129        98         21         16         18         139
align +6    128        98         21         16         17         133
align +7    129        98         21         15         17         135
align +8    129        98         21         14         17         134
align +9    128        98         21         14         17         135
align +10   128        98         21         15         17         135
align +11   129        98         20         15         17         134
align +12   129        98         21         14         17         134
align +13   133        98         21         15         17         135
align +14   135        99         21         15         16         135
align +15   127        98         21         15         17         134
 --- buffer size = 590 ---
align +0    920        908        150        123        65         1041
align +1    915        886        149        124        62         1040
align +2    922        906        149        124        82         1048
align +3    925        887        150        124        64         1037
align +4    920        891        150        127        63         1103
align +5    918        892        149        124        64         1042
align +6    974        897        157        128        82         1087
align +7    938        888        149        124        64         1032
align +8    921        889        151        123        65         1070
align +9    941        887        154        124        85         1051
align +10   937        888        150        124        63         1056
align +11   953        887        150        123        64         1013
align +12   920        897        151        124        63         1039
align +13   925        900        150        123        63         1017
align +14   938        892        151        124        63         1090
align +15   927        889        156        125        64         1053

  ---   Functions ----
  xmemcpy    : habran , PellesC
  xmemcpy2   : habran , VC 2012
  xmemcpy_Q  : qWord  , PellesC
  xmemcpy_Q2 : qWord  , VC 2012
  memcpy     : MSVCRT
  @ForLoop   : habran

 only alignment of Src varies, Dest is allocated by HeapAlloc()

Press any key to continue ...

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 03:17:47 PM

qWord,
I have used counter_begin and counter_end as MACROS like this

Code Select


; Timing harness for a single xmemcpy call.
; 256-byte scratch buffer on the stack that receives the copy
local buff[256]:BYTE

    ; counter_begin / counter_end bracket the code being timed.
    ; NOTE(review): the meaning of the two arguments (1,1) depends on the
    ; macro set in use - presumably loop count and priority; confirm there.
    counter_begin 1,1
    ; 27 is the length of the string literal, not counting any terminator
    invoke xmemcpy,ADDR buff,CTEXT("habran is very smart cooker"), 27
    counter_end

and I've got above mentioned results
do you want to say that I lied :icon_eek:

however, I don't believe in your testing because, looking in a C source everyone can see that there is much more
job for processor and also accessing memory in C than ASM

are you sure that your testing is correct
if so I will go back to C64 :bgrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 03:25:23 PM

JJ2007,

QuoteWhat do you mean with that?

207 reminded me on 2007 and it is funny because C knows that you don't like it :biggrin:

BTW 2007 reminded me on two James Bonds or double agent 007
what actually you are doing in Italy? 8)

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 11, 2013, 04:05:21 PM

:biggrin:

http://csdb.dk/forums/?roomid=11 (http://csdb.dk/forums/?roomid=11)

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 04:30:28 PM

thanks dedndave :P :
Bye everyone :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 11, 2013, 06:25:49 PM

Frank,

Quote from: frktons on February 11, 2013, 11:05:20 AM
I think a new test can only confirm that 64 bit mov operations are
faster than 32 bit ones. If anyone has a new processor, say habran, and
the skill to use AVX code, he could do it.
Not that difficult if he really likes to do the test, I can post the 64 bit MASM
code that I used 2 years ago. No AVX because neither Alex's, nor my PC are
AVX able.

I can do that next weekend; please post your code.

Gunther

Title: Re: reason to switch to 64 Bit Assembler
Post by: frktons on February 11, 2013, 08:17:09 PM

Quote from: Gunther on February 11, 2013, 06:25:49 PM
Frank,

I can do that next weekend; please post your code.

Gunther

Here you are. The code tests only REP STOSQ vs MOVNTDQ.
You can add the tests for MOVAPS, MOVDQA, etc... if you like.

Frank

Title: Re: reason to switch to 64 Bit Assembler
Post by: qWord on February 11, 2013, 10:02:14 PM

Quote from: habran on February 11, 2013, 03:17:47 PMdo you want to say that I lied :icon_eek:

yes, the purpose of my post was to defame you :dazzled:

Quote from: habran on February 11, 2013, 03:17:47 PM
however, I don't believe in your testing because, looking in a C source everyone can see that there is much more
job for processor and also accessing memory in C than ASM

good point :t

BTW, this is what PellesC creates from your C code:

Code Select

; sub_140001000 - byte copy loop: copies bytes from [rdx] to [rcx] while the
; 32-bit count in r8d is non-zero, and returns the original rcx in rax.
sub_140001000   proc near
                ; return value = original destination pointer
                mov     rax, rcx
                ; redundant: rcx already holds this value
                mov     rcx, rax
                ; same source and destination: nothing to do
                cmp     rcx, rdx
                jz      short locret_140001026
                ; zero count: nothing to do
                test    r8d, r8d
                jz      short locret_140001026

loc_140001010:
                ; move one byte through r9b
                mov     r9b, [rdx]
                mov     [rcx], r9b
                ; count down and leave once the last byte has been copied
                sub     r8d, 1
                jz      short locret_140001026
                ; otherwise advance both pointers and go round again
                add     rcx, 1
                add     rdx, 1
                jmp     short loc_140001010

locret_140001026:                       
                retn
sub_140001000   endp

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 11:21:44 PM

qWord,

Quote
BTW, this is what PellesC creates from your C code:

Holly Cow!!! :exclaim: :icon_exclaim: :icon_eek:
are you pulling my leg :shock: actually, are you pulling both my legs!???
If that is true, why can I not build 64 bit JWASM with it?
give me a proper explanation or I am gone to that C64 forum

Quoteyes, the purpose of my post was to defame you :dazzled:

I was not aware that I am famous, am I really :greenclp:
if I am really a celebrity, maybe I need a body guard, someone like Frank I meant Farmer not frktons :biggrin:( Kevin Michael Costner) or Arnold Alois Schwarzenegger 8)(Terminator)

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 11, 2013, 11:54:49 PM

how about Bullseye from DareDevil

(http://www.wrak.pl/inne/daredevil_bullseye.jpg)

funniest bad guy ever :lol:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 12:07:11 AM

dedndave, you are a genius :t :eusa_clap:
what are you doing in this forum!!!???
you could struck rich somewhere else :bgrin:
You have DEFAMED me

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 12:13:28 AM

qWord PellesC produced almost perfect code which should look like this:

Code Select


; sub_140001000 - hand-tightened byte copy loop: copies bytes from [rdx] to
; [rcx] while the 32-bit count in r8d is non-zero, and returns the original
; rcx in rax. The loop uses a single conditional branch per byte.
sub_140001000   proc near
                ; return value = original destination pointer
                mov     rax, rcx
                ; same source and destination: nothing to do
                cmp     rcx, rdx
                jz      short locret_140001026
                ; zero count: nothing to do
                test    r8d, r8d
                jz      short locret_140001026
loc_140001010:
                ; move one byte through r9b
                mov     r9b, [rdx]
                mov     [rcx], r9b
                ; advance both pointers
                add     rcx, 1
                add     rdx, 1
                ; sub sets the zero flag, so jnz loops until the count hits zero
                sub     r8d, 1
                jnz      short loc_140001010
locret_140001026:                       
                retn
sub_140001000   endp

Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 12, 2013, 03:00:26 AM

Frank,

Quote from: frktons on February 11, 2013, 08:17:09 PM
Here you are. The code tests only REP STOSQ vs MOVNTDQ.
You can add the tests for MOVAPS, MOVDQA, etc... if you like.

Frank

I'll first study your code and see what's to do. Thank you for uploading the source. :t

Gunther

Title: Re: reason to switch to 64 Bit Assembler
Post by: Magnum on February 12, 2013, 03:17:10 AM

I think a better word might be "celebrated".

Main Entry:
celebrated [sel-uh-brey-tid] Show IPA
Part of Speech:    adjective
Definition:    distinguished, famous
Synonyms:    acclaimed, big*, eminent, famed, glorious, great, high-powered, illustrious, immortal, important, large, laureate, lionized, notable, number one, numero uno, outstanding, popular, preeminent, prominent, renowned, revered, storied, up there, w. k., well-known

de·fame audio (d-fm) KEY

TRANSITIVE VERB:
de·famed, de·fam·ing, de·fames

To damage the reputation, character, or good name of by slander or libel. See Synonyms at malign.
Archaic To disgrace.

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 06:11:18 AM

thank you Magnum, :t
now I know who I am:
distinguished, high-powered, immortal, numero uno, macho-man 8)

I also want to say(no joking this time):
This forum has gathered the most prominent assembler programmers, and if we decide HERE that:
we should not hold with our teeth
something that is already obsolete
but embrace 64 bit
other assembler programmers
will have this to swallow
and our example follow

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 12, 2013, 06:14:11 AM

i could write some 64-bit code, but i'd have to get you guys to test it for me :(

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 06:32:00 AM

dedndave, I promise you I will be proud to do that for you :t

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 06:37:57 AM

qWord this is what I get when compile your function in C with MSVC205:

Code Select


xmemcpy:
0000000140063090  mov         qword ptr [rsp+18h],r8 
0000000140063095  mov         qword ptr [rsp+10h],rdx 
000000014006309A  mov         qword ptr [rsp+8],rcx 
000000014006309F  sub         rsp,38h 
00000001400630A3  mov         rax,qword ptr [cb] 
00000001400630A8  shr         rax,3 
00000001400630AC  mov         dword ptr [cnt1],eax 
00000001400630B0  mov         rax,qword ptr [cb] 
00000001400630B5  and         rax,7 
00000001400630B9  mov         dword ptr [cnt2],eax 
00000001400630BD  mov         rax,qword ptr [dest] 
00000001400630C2  mov         qword ptr [p1],rax 
00000001400630C7  mov         rax,qword ptr [src] 
00000001400630CC  mov         qword ptr [p2],rax 
00000001400630D1  jmp         xmemcpy+5Fh (1400630EFh) 
00000001400630D3  mov         rax,qword ptr [p1] 
00000001400630D8  add         rax,8 
00000001400630DC  mov         qword ptr [p1],rax 
00000001400630E1  mov         rax,qword ptr [p2] 
00000001400630E6  add         rax,8 
00000001400630EA  mov         qword ptr [p2],rax 
00000001400630EF  mov         eax,dword ptr [cnt1] 
00000001400630F3  mov         ecx,dword ptr [cnt1] 
00000001400630F7  sub         ecx,1 
00000001400630FA  mov         dword ptr [cnt1],ecx 
00000001400630FE  test        eax,eax 
0000000140063100  je          xmemcpy+84h (140063114h) 
0000000140063102  mov         rax,qword ptr [p1] 
0000000140063107  mov         rcx,qword ptr [p2] 
000000014006310C  mov         rcx,qword ptr [rcx] 
000000014006310F  mov         qword ptr [rax],rcx 
0000000140063112  jmp         xmemcpy+43h (1400630D3h) 
0000000140063114  mov         rax,qword ptr [p1] 
0000000140063119  mov         qword ptr [p3],rax 
000000014006311E  mov         rax,qword ptr [p2] 
0000000140063123  mov         qword ptr [rsp],rax 
0000000140063127  xor         eax,eax 
0000000140063129  cmp         eax,1 
000000014006312C  je          xmemcpy+0DBh (14006316Bh) 
000000014006312E  mov         eax,dword ptr [cnt2] 
0000000140063132  and         eax,4 
0000000140063135  test        eax,eax 
0000000140063137  je          xmemcpy+0DBh (14006316Bh) 
0000000140063139  mov         rax,qword ptr [p3] 
000000014006313E  mov         rcx,qword ptr [rsp] 
0000000140063142  mov         ecx,dword ptr [rcx] 
0000000140063144  mov         dword ptr [rax],ecx 
0000000140063146  mov         rax,qword ptr [p3] 
000000014006314B  add         rax,4 
000000014006314F  mov         qword ptr [p3],rax 
0000000140063154  mov         rax,qword ptr [rsp] 
0000000140063158  add         rax,4 
000000014006315C  mov         qword ptr [rsp],rax 
0000000140063160  mov         eax,dword ptr [cnt2] 
0000000140063164  sub         eax,4 
0000000140063167  mov         dword ptr [cnt2],eax 
000000014006316B  jmp         xmemcpy+0F7h (140063187h) 
000000014006316D  mov         rax,qword ptr [p3] 
0000000140063172  add         rax,1 
0000000140063176  mov         qword ptr [p3],rax 
000000014006317B  mov         rax,qword ptr [rsp] 
000000014006317F  add         rax,1 
0000000140063183  mov         qword ptr [rsp],rax 
0000000140063187  mov         eax,dword ptr [cnt2] 
000000014006318B  mov         ecx,dword ptr [cnt2] 
000000014006318F  sub         ecx,1 
0000000140063192  mov         dword ptr [cnt2],ecx 
0000000140063196  test        eax,eax 
0000000140063198  je          xmemcpy+11Ah (1400631AAh) 
000000014006319A  mov         rax,qword ptr [p3] 
000000014006319F  mov         rcx,qword ptr [rsp] 
00000001400631A3  movzx       ecx,byte ptr [rcx] 
00000001400631A6  mov         byte ptr [rax],cl 
00000001400631A8  jmp         xmemcpy+0DDh (14006316Dh) 
00000001400631AA  mov         rax,qword ptr [dest] 
00000001400631AF  add         rsp,38h 
00000001400631B3  ret

I can not believe that it takes only 2 ticks

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 12, 2013, 06:43:59 AM

~~i think you must be measuring that wrong~~
i think there is something wrong with the 2 cycle measurement :P
maybe the timer code isn't doing what you think it is or something

i have a friend, not too far away...

he has a win 7-64 ultimate new-fangled machine at home, now
he uses it mostly for running his business
http://www.mesabattingcages.com/ (http://www.mesabattingcages.com/)

he will let me test whatever i like, but i would hate to mess up his machine
or even be near it if it messes up :P

Title: Re: reason to switch to 64 Bit Assembler
Post by: qWord on February 12, 2013, 07:17:35 AM

Quote from: habran on February 12, 2013, 06:37:57 AMI can not believe that it takes only 2 ticks

you are obviously not able to configure your compiler! Also, looking in the code of my testbench (and yes ... you can't compile it because there are some dependencies I've not include) you will see that I've used a high loop count, which blends out memory access.

Code Select

; MSVC 2010
; Copies r8d bytes from [rdx] to [rcx] and returns the original destination
; pointer in rax. The byte count is handled as a 32-bit value (r8d / r10d).
; Order of work: whole 8-byte chunks, then one optional 4-byte chunk, then the
; remaining single bytes.
sub_140008A60   proc near

                mov     r10d, r8d               ; r10d = byte count
                and     r8d, 7                  ; r8d  = count mod 8 (bytes left after the qword loop)
                mov     r9, rcx                 ; r9 = running destination pointer; rcx stays intact for the return value
                shr     r10d, 3                 ; r10d = number of whole 8-byte chunks
                test    r10d, r10d
                jz      short loc_140008A94     ; no whole qword to copy
                db      66h, 66h, 66h, 66h      ; prefix bytes + long NOP: padding in front of the loop label
                nop     word ptr [rax+rax+00000000h]

loc_140008A80:                                  ; qword loop: one 8-byte load/store per iteration
                mov     rax, [rdx]
                add     r9, 8
                add     rdx, 8
                dec     r10d                    ; sets ZF for the jnz below (mov does not change flags)
                mov     [r9-8], rax             ; r9 was already advanced, hence the -8
                jnz     short loc_140008A80

loc_140008A94:
                test    r8b, 4                  ; is bit 2 of the remainder set (at least 4 bytes left)?
                jz      short loc_140008AAC
                mov     eax, [rdx]              ; copy one dword
                add     r9, 4
                add     rdx, 4
                mov     [r9-4], eax
                add     r8d, 0FFFFFFFCh         ; r8d -= 4

loc_140008AAC:
                test    r8d, r8d
                jz      short loc_140008AD1     ; nothing left
                sub     rdx, r9                 ; rdx = src - dest, so [rdx+r9] addresses the source
                db      66h, 66h, 66h, 66h      ; padding in front of the loop label
                nop     dword ptr [rax+rax+00000000h]

loc_140008AC0:                                  ; byte loop: only r9 is advanced, rdx holds the constant offset
                movzx   eax, byte ptr [rdx+r9]
                inc     r9
                dec     r8d
                mov     [r9-1], al
                jnz     short loc_140008AC0

loc_140008AD1:
                mov     rax, rcx                ; return the original destination pointer
                retn
sub_140008A60   endp

in the attachment a testbench with loop count = 1

BTW: if you are not interested in a serious discussion, you may simply say that instead of this bullsh** parody.

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 10:27:26 AM

yes qWord, now you are talking... :t
that looks more real than before and doesn't contradict what I said before
my xmemcpy is OK for transferring one or two lines of characters, but for greater data transfer your function is absolute
I always admired your laser sharp mind and programmers skills :eusa_clap:

Title: Re: reason to switch to 64 Bit Assembler
Post by: frktons on February 12, 2013, 11:11:03 AM

Quote from: qWord on February 12, 2013, 07:17:35 AM
Quote from: habran on February 12, 2013, 06:37:57 AMI can not believe that it takes only 2 ticks
you are obviously not able to configure your compiler! Also, looking in the code of my testbench (and yes ... you can't compile it because there are some dependencies I've not include) you will see that I've used a high loop count, which blends out memory access.
Code Select Expand
; MSVC 2010 sub_140008A60 proc near mov r10d, r8d and r8d, 7 mov r9, rcx shr r10d, 3 test r10d, r10d jz short loc_140008A94 db 66h, 66h, 66h, 66h nop word ptr [rax+rax+00000000h] loc_140008A80: mov rax, [rdx] add r9, 8 add rdx, 8 dec r10d mov [r9-8], rax jnz short loc_140008A80 loc_140008A94: test r8b, 4 jz short loc_140008AAC mov eax, [rdx] add r9, 4 add rdx, 4 mov [r9-4], eax add r8d, 0FFFFFFFCh loc_140008AAC: test r8d, r8d jz short loc_140008AD1 sub rdx, r9 db 66h, 66h, 66h, 66h nop dword ptr [rax+rax+00000000h] loc_140008AC0: movzx eax, byte ptr [rdx+r9] inc r9 dec r8d mov [r9-1], al jnz short loc_140008AC0 loc_140008AD1: mov rax, rcx retn sub_140008A60 endp

in the attachment a testbench with loop count = 1

BTW: if you are not interested in a serious discussion, you may simply say that instead of this bullsh** parody.

The executable is quite big after unzipping = 190K. What's inside?
I can't believe a simple test on memory copy takes all that code.

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 19, 2013, 03:54:12 PM

here is the version I was talking about, IMO the fastest ever :t
please prove me wrong :biggrin:
I use xmm4 and ymm4 here because the first 4 registers are used in float calculation, and this one is volatile as well,
so we don't have to preserve it

Code Select


; xmemcpy (AVX version): copies `count` bytes from src to dest.
;   The body reads its arguments straight from registers: rcx = dest,
;   rdx = src, r8 = count. Prologue/epilogue generation is switched off
;   around the procedure and restored afterwards.
;   Returns the original dest in rax. Uses rcx, rdx, r8, r9, r10 and ymm4 as scratch.
; Whole 32-byte blocks are copied first with unaligned AVX moves; the low five
; bits of count then select at most one 1-, 2-, 4-, 8- and 16-byte copy each.
; Apart from the src == dest shortcut there is no handling of overlapping buffers.
; NOTE(review): the 16-byte tail uses the non-VEX movdqu after the ymm loop and
; there is no vzeroupper before ret - presumably this can cost an AVX/SSE
; transition penalty on some CPUs; confirm by measurement.
option win64:0
OPTION PROLOGUE:NONE 
OPTION EPILOGUE:NONE
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
   mov rax,rcx
   .if (rcx!=rdx)
         ; r10 = count / 32 = number of whole 32-byte blocks; loop until it reaches 0
         .for (r10=r8,r10>>=5¦r10¦rcx+=32,rdx+=32,r10--)   
            vmovdqu ymm4,[rdx]
            vmovdqu [rcx],ymm4
        .endfor
	; each shr moves the next bit of count into CF: bit 0 -> 1 byte
	shr r8,1
         .if (CARRY?)
		mov r9b,[rdx]
		mov [rcx],r9b
		inc rcx
		 inc rdx
	.endif
         ; bit 1 -> 2 bytes
         shr r8,1
	.if (CARRY?)
		mov r9w,[rdx]
		mov [rcx],r9w
		add rcx,2
		add rdx,2
	 .endif
         ; bit 2 -> 4 bytes
         shr r8,1
         .if (CARRY?)
		mov r9d,[rdx]
		mov [rcx],r9d 
		add rcx,4
		add rdx,4
	.endif
         ; bit 3 -> 8 bytes
         shr r8,1
         .if (CARRY?)
            mov r9,[rdx]
            mov [rcx],r9
	    add rcx,8
	    add rdx,8
	 .endif
         ; bit 4 -> 16 bytes; last copy, so the pointers are not advanced any more
         shr r8,1
         .if (CARRY?)
           movdqu xmm4,[rdx]
           movdqu [rcx],xmm4
	 .endif
   .endif    
aexit: ret              
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

it creates this code:

Code Select


xmemcpy:
00000001`40034220 488bc1          mov     rax,rcx
00000001`40034223 483bca          cmp     rcx,rdx
00000001`40034226 747a            je      xmemcpy+0x82 (00000001`400342a2)
00000001`40034228 4d8bd0          mov     r10,r8
00000001`4003422b 49c1ea05        shr     r10,5
00000001`4003422f 4d23d2          and     r10,r10
00000001`40034232 7415            je      xmemcpy+0x29 (00000001`40034249)
00000001`40034234 c5fe6f22        vmovdqu ymm4,ymmword ptr [rdx]
00000001`40034238 c5fe7f21        vmovdqu ymmword ptr [rcx],ymm4
00000001`4003423c 4883c120        add     rcx,20h
00000001`40034240 4883c220        add     rdx,20h
00000001`40034244 49ffca          dec     r10
00000001`40034247 75eb            jne     xmemcpy+0x14 (00000001`40034234)
00000001`40034249 49d1e8          shr     r8,1
00000001`4003424c 730c            jae     xmemcpy+0x3a (00000001`4003425a)
00000001`4003424e 448a0a          mov     r9b,byte ptr [rdx]
00000001`40034251 448809          mov     byte ptr [rcx],r9b
00000001`40034254 48ffc1          inc     rcx
00000001`40034257 48ffc2          inc     rdx
00000001`4003425a 49d1e8          shr     r8,1
00000001`4003425d 7310            jae     xmemcpy+0x4f (00000001`4003426f)
00000001`4003425f 66448b0a        mov     r9w,word ptr [rdx]
00000001`40034263 66448909        mov     word ptr [rcx],r9w
00000001`40034267 4883c102        add     rcx,2
00000001`4003426b 4883c202        add     rdx,2
00000001`4003426f 49d1e8          shr     r8,1
00000001`40034272 730e            jae     xmemcpy+0x62 (00000001`40034282)
00000001`40034274 448b0a          mov     r9d,dword ptr [rdx]
00000001`40034277 448909          mov     dword ptr [rcx],r9d
00000001`4003427a 4883c104        add     rcx,4
00000001`4003427e 4883c204        add     rdx,4
00000001`40034282 49d1e8          shr     r8,1
00000001`40034285 730e            jae     xmemcpy+0x75 (00000001`40034295)
00000001`40034287 4c8b0a          mov     r9,qword ptr [rdx]
00000001`4003428a 4c8909          mov     qword ptr [rcx],r9
00000001`4003428d 4883c108        add     rcx,8
00000001`40034291 4883c208        add     rdx,8
00000001`40034295 49d1e8          shr     r8,1
00000001`40034298 7308            jae     xmemcpy+0x82 (00000001`400342a2)
00000001`4003429a f30f6f22        movdqu  xmm4,xmmword ptr [rdx]
00000001`4003429e f30f7f21        movdqu  xmmword ptr [rcx],xmm4
00000001`400342a2 c3              ret

and here is version for people without AVX

Code Select


; xmemcpy (SSE version for CPUs without AVX): copies `count` bytes from src to dest.
;   The body reads rcx (dest), rdx (src) and r8 (count) directly and
;   returns the original dest in rax. Uses r9, r10 and xmm4 as scratch.
; Whole 16-byte blocks are copied first with unaligned moves; the low four
; bits of count then select at most one 1-, 2-, 4- and 8-byte copy each.
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
   mov rax,rcx
   .if (rcx!=rdx)
     ; r10 = count / 16 = number of whole 16-byte blocks
     .for (r10=r8,r10>>=4¦r10¦rcx+=16,rdx+=16,r10--)   
        movdqu xmm4,[rdx]
        movdqu [rcx],xmm4
     .endfor
     ; each shr moves the next bit of count into CF: bit 0 -> 1 byte
     shr r8,1
     .if (CARRY?)
	  mov r9b,[rdx]
	  mov [rcx],r9b
	  inc rcx
	  inc rdx
     .endif
     ; bit 1 -> 2 bytes
     shr r8,1
     .if (CARRY?)
	  mov r9w,[rdx]
	  mov [rcx],r9w
	  add rcx,2
	  add rdx,2
     .endif
     ; bit 2 -> 4 bytes
     shr r8,1
     .if (CARRY?)
	  mov r9d,[rdx]
	  mov [rcx],r9d 
	  add rcx,4
	  add rdx,4
     .endif
     ; bit 3 -> 8 bytes; last copy, so the pointers are not advanced any more
     shr r8,1
     .if (CARRY?)
       mov r9,[rdx]
       mov [rcx],r9 
   .endif
  .endif    
  ret              
xmemcpy ENDP

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 20, 2013, 12:08:41 AM

I commented this for the visitors only, not for the members of this forum :bgrin:

Code Select


option win64:0                    ;no need for any option
OPTION PROLOGUE:NONE              ;just pure code
OPTION EPILOGUE:NONE
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
   mov rax,rcx                    ;save dest for the return value before rcx is advanced
   .if (rcx!=rdx)                 ;nothing to do when src and dest are the same location
         ;here the MULTO IMPORTANTE transfer of data happens
         .for (r10=r8,r10>>=5¦r10¦rcx+=32,rdx+=32,r10--)   
            vmovdqu ymm4,[rdx]    ;transfer 32 bytes at once
            vmovdqu [rcx],ymm4    ;with ymm4 AVX register (The Transporter)
        .endfor                   ;RRRRRRRROOOOOOOOAAAAAAAAARRRRRRRR
  ;count is probably not a multiple of 32, so the leftover bytes must be copied too
  ;up to 31 bytes can be left, i.e. 01Fh or 0000 0000 0001 1111 in the low bits of r8 (count)
   shr r8,1                       ;shift the lowest bit of count out of r8
   .if (CARRY?)                   ;if a 1 pops out it lands in the carry flag
    mov r9b,[rdx]                 ;transfer only one byte to dest
    mov [rcx],r9b                 ;it can be only one byte
    inc rcx                       ;if more than 1 it will be done 
    inc rdx                       ;in the next shift
  .endif
  shr r8,1                        ;LET'S see if there is a word present
  .if (CARRY?)                    ;HA! I found you
    mov r9w,[rdx]                 ;store that only word in the dest
    mov [rcx],r9w
    add rcx,2                     ;this time add two to dest pos
    add rdx,2                     ;and src
   .endif
    shr r8,1                      ;shift again for the dword
   .if (CARRY?)                   ;knock-knock, are you in CF?
    mov r9d,[rdx]                 ;get in 
    mov [rcx],r9d 
    add rcx,4                     ;now we add 4 to both src and dest
    add rdx,4
  .endif
  shr r8,1                       ;looking for qword
  .if (CARRY?)                   ;no job for you today go home and do some programming
     mov r9,[rdx]                
     mov [rcx],r9
     add rcx,8
     add rdx,8                   ;increase your pay by 8 bucks an hour
   .endif
   shr r8,1                      ;oword present today?
   .if (CARRY?)                   
      movdqu xmm4,[rdx]          
      movdqu [rcx],xmm4          ;Last Stand!!!
   .endif                        ;no need to advance the position, this was the last copy
   .endif                        ;I finished!!! Did you finish yet???
aexit: ret                       ;have a smocko
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 20, 2013, 09:37:06 AM

I thought maybe those who still have 32 bit old-timer machines can feel the blow of the lightning speed in their hair :biggrin:
but be careful, it can blow away that little bit of hair left on your head ;)
so I wrote a 32 bit for them 8)

Code Select


; xmemcpy (32-bit SSE2 version): copies `count` bytes from src to dest and
; returns dest in eax.
;   ecx = running dest pointer, edx = running src pointer, ebx = count
;   (ebx is saved/restored through USES). eax and xmm4 are scratch.
; Whole 16-byte blocks are copied first with unaligned moves; the low four
; bits of count then select at most one 1-, 2-, 4- and 8-byte copy each.
; Apart from the src == dest shortcut there is no handling of overlapping buffers.
; The PROC keyword is required in the header; without it the line is not a
; procedure definition and the closing ENDP has nothing to match.
xmemcpy PROC uses ebx dest:DWORD,src:DWORD,count:DWORD
   mov ecx,dest
   mov edx,src
   mov ebx,count
   .if (ecx!=edx)
     ; eax = count / 16 = number of whole 16-byte blocks
     .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)   
        movdqu xmm4,[edx]
        movdqu [ecx],xmm4
     .endfor
     ; each shr moves the next bit of count into CF: bit 0 -> 1 byte
     shr ebx,1
     .if (CARRY?)
       mov al,[edx]
       mov [ecx],al
       inc ecx
       inc edx
     .endif
     ; bit 1 -> 2 bytes
     shr ebx,1
     .if (CARRY?)
       mov ax,[edx]
       mov [ecx],ax
       add ecx,2
       add edx,2
     .endif
     ; bit 2 -> 4 bytes
     shr ebx,1
     .if (CARRY?)
       mov eax,[edx]
       mov [ecx],eax 
       add ecx,4
       add edx,4
     .endif
     ; bit 3 -> 8 bytes; last copy, so the pointers are not advanced any more
     shr ebx,1
     .if (CARRY?)
       movq xmm4,[edx]
       movq [ecx],xmm4 
     .endif
   .endif 
   mov eax,dest   
   ret              
xmemcpy ENDP

Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 21, 2013, 03:34:58 AM

Doesn't assemble with my version of JWasm. Where is your latest build?
And what does .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--) mean? "Much bigger or equal"??

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 05:48:21 AM

Hello JJ2007,
my latest build is as usual in the topic ".FOR built in JWasm" http://masm32.com/board/index.php?topic=402.0 (http://masm32.com/board/index.php?topic=402.0) :shock:

QuoteAnd what does .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--) mean? "Much bigger or equal"??

and >>=4 means shift right 4 time it produces shr eax,4 :biggrin:
it means "Much much much much less" :lol:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 06:59:51 AM

Hey qWord,
Cat got your tongue? :icon_eek:
(https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcT6k8_OVV3GpA_eIvy8fUnCQ-1nGLI_RS1DuNQdCt0G9AsuOCYQ)
you have the same "Qosmio laptop" as me
did you test the speed? :bgrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: frktons on February 21, 2013, 10:16:22 AM

Quote from: habran on February 20, 2013, 09:37:06 AM
I thought maybe those who still have 32 bit old-timers machines can feel the blow of the lightning speed in they hair :biggrin:
but be careful, it can blow away that little bit of hair left on your head ;)
so I wrote a 32 bit for them 8)
Code Select Expand
xmemcpy uses ebx dest:DWORD,src:DWORD,count:DWORD mov ecx,dest mov edx,src mov ebx,count .if (ecx!=edx) .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--) movdqu xmm4,[edx] movdqu [ecx],xmm4 .endfor shr ebx,1 .if (CARRY?) mov al,[edx] mov [ecx],al inc ecx inc edx .endif shr ebx,1 .if (CARRY?) mov ax,[edx] mov [ecx],ax add ecx,2 add edx,2 .endif shr ebx,1 .if (CARRY?) mov eax,[edx] mov [ecx],eax add ecx,4 add edx,4 .endif shr ebx,1 .if (CARRY?) movq xmm4,[edx] movq [ecx],xmm4 .endif .endif mov eax,dest ret xmemcpy ENDP

Habran, why do you use MOVDQU and not align the memory
pointers to 16 bytes addresses? MOVAPS/MOVDQA are faster.
Unrolling the MOV can be another good option to test.
And if the area to copy is big, > 4 MB , MOVNTDQ is the best
option. Have a look at the old forum and search for CLEARBUFFER.

REP STOSQ is probably faster than your non AVX solution, give it
a shot on 64 bit version.
A last thing. You should post the results of your tests, if you like
to get the attention of somebody on these routines.

Frank

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 11:05:56 AM

Hi Frank, :biggrin:

Quotewhy do you use MOVDQU and not align the memory

because this routine is created particularly for unaligned data like text or something
I totally agree with you that MOVDQA is much faster than MOVDQU :t
however, if data is aligned to 32 byte I wouldn't need that routine I would just write in my source:

Code Select


    ;r8 can contain sizeof(buffer)
    ;copies count/32 whole 32-byte blocks; a remainder (count mod 32) is not copied
    ;vmovdqa is the aligned move, so both dest and src must be 32-byte aligned
    .for (rcx=dest,rdx=src,r8=count,r8>>=5¦r8¦rcx+=32,rdx+=32,r8--)
             vmovdqa ymm4,[rdx]
             vmovdqa [rcx],ymm4
    .endfor

or for for 16 byte xmm:

Code Select


    ;r8 can contain sizeof(buffer)
    ;copies count/16 whole 16-byte blocks; a remainder (count mod 16) is not copied
    ;movdqa is the aligned move, so both dest and src must be 16-byte aligned
    .for (rcx=dest,rdx=src,r8=count,r8>>=4¦r8¦rcx+=16,rdx+=16,r8--)
             movdqa xmm4,[rdx]
             movdqa [rcx],xmm4
    .endfor

QuoteA last thing. You should post the results of your tests

I left it to qWord to do that for me because he likes testing and arguing :P
and I like and appreciate him :biggrin:

Quoteif you like to get the attention of somebody on these routines.

I don't give a damn about attention, take it or leave it 8)

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 11:16:11 AM

Frank,
IMO it is not always advisable to align data to 16 or 32 bytes :(
if you have STRUCT in 32 bit program you align it to 4
in 64 bit logically is to align it to 8
however, when you work with big data transfers, then it is logical to align as widely as your machine can afford :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 11:44:16 AM

this version is even more optimized than the former and has a more logical order;
it can also be faster for less than 32 bytes of data:

Code Select


; xmemcpy (AVX version, tail first): copies `count` bytes from src to dest.
;   The body reads rcx (dest), rdx (src) and r8 (count) directly and returns
;   the original dest in rax. Uses r9 and ymm4 as scratch.
; The low five bits of count are shifted out one by one and each set bit
; triggers one 1-, 2-, 4-, 8- or 16-byte copy; after the five shifts r8 holds
; count / 32, which the final loop uses as the number of 32-byte blocks.
; NOTE(review): the procedure returns right after the ymm loop without
; vzeroupper - presumably this can cost an AVX/SSE transition penalty in
; callers that use legacy SSE code; confirm by measurement.
option win64:0
OPTION PROLOGUE:NONE 
OPTION EPILOGUE:NONE
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
    mov rax,rcx
    .if (rcx!=rdx)
	; bit 0 of count -> 1 byte
	shr r8,1
       .if (CARRY?)
	     mov r9b,[rdx]
   	     mov [rcx],r9b
	     inc rcx
	     inc rdx
       .endif
       ; bit 1 -> 2 bytes
       shr r8,1
       .if (CARRY?)
	      mov r9w,[rdx]
	      mov [rcx],r9w
	      add rcx,2
	      add rdx,2
       .endif
       ; bit 2 -> 4 bytes
       shr r8,1
       .if (CARRY?)
	     mov r9d,[rdx]
	     mov [rcx],r9d 
	     add rcx,4
	     add rdx,4
       .endif
       ; bit 3 -> 8 bytes
       shr r8,1
       .if (CARRY?)
            mov r9,[rdx]
            mov [rcx],r9
            add rcx,8
            add rdx,8
       .endif
       ; bit 4 -> 16 bytes
       shr r8,1
       .if (CARRY?)
            movdqu xmm4,[rdx]
            movdqu [rcx],xmm4
            add rcx,16
            add rdx,16
	 .endif
	 ; r8 is now count / 32: copy that many 32-byte blocks
	 .for (¦r8¦rcx+=32,rdx+=32,r8--)    
   		vmovdqu ymm4,[rdx]
   		vmovdqu [rcx],ymm4
	.endfor
    .endif      
aexit: ret                     
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 21, 2013, 11:46:57 AM

Quote from: habran on February 21, 2013, 05:48:21 AM
Hello JJ2007,
my latest build is as usual in the topic ".FOR built in JWasm" http://masm32.com/board/index.php?topic=402.0 (http://masm32.com/board/index.php?topic=402.0)

Doesn't work on XP: "Not a valid Win32 app", access denied.

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 12:08:50 PM

sorry JJ2007, :(
it doesn't work on XP

Quotebinaries need at least Windows version 6
(Japheth)

however, there is a workaround for that
the source code is in the folder and you can compile it yourself if it is not too much hassle
just replace these two files in JW209s folder
if you don't have M$VC you can compile it with PelesC
but I don't believe you have enough energy to go through all that trouble :dazzled:
prove me wrong, I dare you :P

Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 21, 2013, 12:27:12 PM

The standard JWasm works just fine on XP, I use it every day. And, no, I won't try to compile it myself. It is not a question of energy, though. I am too wise to invest my time in trying to compile a major C app :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 01:14:23 PM

Wise man JJ2007 :biggrin:
believe it or not that excellent JWasm is written in C
and Japheth had to create binaries from it
how do you think he created it, by laying on it for four weeks or something ::)
NO!!! he compiled it!!!! and it looks like he did not die of it
C is not a plug it is a programming language for Christ sake
don't be a chicken, roll your sleeves and get dirty
No pain no gain!!! :bgrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 21, 2013, 01:31:28 PM

Quote from: habran on February 21, 2013, 01:14:23 PM
C is not a plug it is a programming language for Christ sake

It's spelled "plague", Habran.

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 01:49:47 PM

thanks wise man JJ2007 :t
what kind of spelling checker is that when it did not worn me!!! ;)
I will reward you for that and only you can use it, you deserved it! ;)
here is for you changed source:

Code Select


; xmemcpy (32-bit SSE2 version, tail first): copies `count` bytes from src to
; dest and returns dest in eax.
;   ecx = running dest pointer, edx = running src pointer, ebx = count
;   (ebx is saved/restored through USES). eax and xmm4 are scratch.
; The low four bits of count are shifted out one by one and each set bit
; triggers one 1-, 2-, 4- or 8-byte copy; after the four shifts ebx holds
; count / 16, which the final loop uses as the number of 16-byte blocks.
; Apart from the src == dest shortcut there is no handling of overlapping buffers.
; The PROC keyword is required in the header; without it the line is not a
; procedure definition and the closing ENDP has nothing to match.
xmemcpy PROC uses ebx dest:DWORD,src:DWORD,count:DWORD
    mov ecx,dest
    mov edx,src
    mov ebx,count
    .if (ecx!=edx)
       ; bit 0 of count -> 1 byte
       shr ebx,1
       .if (CARRY?)
          mov al,[edx]
          mov [ecx],al
          inc ecx
          inc edx
       .endif
       ; bit 1 -> 2 bytes
       shr ebx,1
       .if (CARRY?)
          mov ax,[edx]
          mov [ecx],ax
          add ecx,2
          add edx,2
       .endif
       ; bit 2 -> 4 bytes
       shr ebx,1
       .if (CARRY?)
          mov eax,[edx]
          mov [ecx],eax 
          add ecx,4
          add edx,4
       .endif
       ; bit 3 -> 8 bytes
       shr ebx,1
       .if (CARRY?)
          movq xmm4,[edx]
          movq [ecx],xmm4
          add ecx,8
          add edx,8 
       .endif
       ; ebx is now count / 16: copy that many 16-byte blocks
       .while (ebx)
          movdqu xmm4,[edx]
          movdqu [ecx],xmm4
          add ecx,16
          add edx,16
          dec ebx
       .endw
   .endif 
   mov eax,dest    
   ret                     
xmemcpy ENDP

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 04:05:16 PM

hey 2007,
are you going to abandon me because of a little spelling mistake :icon_eek:
plug, plague, plug in,plug out, ear plug, plagiarism... who cares :dazzled:
you are just trying to mask the main issue: compiling JWasm :bgrin:
those ENGLEZE have made mess with unnecessary complex spelling just to tease pure strangers :exclaim:
they messed it up so much that even they can not write "for sale" but use "4 sale" :icon_confused:

Title: Re: reason to switch to 64 Bit Assembler
Post by: japheth on February 21, 2013, 06:43:56 PM

Hello,

Quote from: habran on February 21, 2013, 12:08:50 PM
if you don't have M$VC you can compile it with PelesC

it's mentioned in jwasm's readme, but since nobody reads readmes, I'll repeat it here: better do NOT use PellesC to compile JWasm - the jwasm binary created by PellesC is unable to pass the regression tests supplied with the assembler. I haven't analyzed the problem too deeply, but judging from the part that fails I assume that floating-point constants don't have the values as they should.

Good compilers are: Open Watcom, MSVC, GCC (MinGW)

Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 21, 2013, 09:12:31 PM

Quote from: habran on February 21, 2013, 01:49:47 PM
thanks wise man JJ2007 :t
...
here is for you changed source:
xmemcpy uses ebx dest:DWORD,src:DWORD,count:DWORD
mov ecx,dest
mov edx,src
...
ret
xmemcpy ENDP

Thanks, it looks competitive :t

AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
loop overhead is approx. 238/100 cycles

9458 cycles for 100 * xmemcpy
8056 cycles for 100 * MbCopy

9292 cycles for 100 * xmemcpy
7893 cycles for 100 * MbCopy

9289 cycles for 100 * xmemcpy
8072 cycles for 100 * MbCopy

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 10:43:26 PM

Hi Japheth,

Quotedo NOT use PellesC to compile JWasm

sorry for misunderstanding :bgrin:
I've read it but I thought that it applies only to 64 bit

jj2007,
thanks for testing it
this version is created for unaligned data as I mentioned before
can you please try to compare when not aligned at all? :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 10:46:27 PM

jj2007,
this is what my machine produce from your test:

Code Select


Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz (SSE4)
loop overhead is approx. 155/100 cycles

2242    cycles for 100 * xmemcpy
5356    cycles for 100 * MbCopy

2239    cycles for 100 * xmemcpy
5455    cycles for 100 * MbCopy

2243    cycles for 100 * xmemcpy
5166    cycles for 100 * MbCopy


--- ok ---

twice as fast as yours, wouldn't you say so :shock:

QuoteThanks, it looks competitive

I would say It looks downright stunning!!!! :t

Title: Re: reason to switch to 64 Bit Assembler
Post by: japheth on February 22, 2013, 12:14:38 AM

Quote from: habran on February 21, 2013, 10:46:27 PM
I would say It looks downright stunning!!!! :t

I fully agree! However - almost 100% faster than MB - which allegedly is already rocket-science? How is this possible? You must do something wrong...

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 12:15:53 AM

here is 64 bit without .for:

Code Select


; xmemcpy (64-bit AVX version without .for, tail first): copies `count` bytes
; from src to dest.
;   The body reads rcx (dest), rdx (src) and r8 (count) directly and returns
;   the original dest in rax. Uses r9 and ymm4 as scratch.
; The low five bits of count are shifted out one by one and each set bit
; triggers one 1-, 2-, 4-, 8- or 16-byte copy; after the five shifts r8 holds
; count / 32, which the final loop uses as the number of 32-byte blocks.
; Apart from the src == dest shortcut there is no handling of overlapping buffers.
; (A stray "xmemcpy ENDP" that preceded the procedure header was removed: an
; ENDP before the matching PROC is opened cannot assemble.)
option win64:0
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
    mov rax,rcx
    .if (rcx!=rdx)
        ; bit 0 of count -> 1 byte
        shr r8,1
        .if (CARRY?)
            mov r9b,[rdx]
            mov [rcx],r9b
            inc rcx
            inc rdx
        .endif
        ; bit 1 -> 2 bytes
        shr r8,1
        .if (CARRY?)
            mov r9w,[rdx]
            mov [rcx],r9w
            add rcx,2
            add rdx,2
        .endif
        ; bit 2 -> 4 bytes
        shr r8,1
        .if (CARRY?)
            mov r9d,[rdx]
            mov [rcx],r9d
            add rcx,4
            add rdx,4
        .endif
        ; bit 3 -> 8 bytes
        shr r8,1
        .if (CARRY?)
            mov r9,[rdx]
            mov [rcx],r9
            add rcx,8
            add rdx,8
        .endif
        ; bit 4 -> 16 bytes
        shr r8,1
        .if (CARRY?)
            movdqu xmm4,[rdx]
            movdqu [rcx],xmm4
            add rcx,16
            add rdx,16
        .endif
        ; r8 is now count / 32: copy that many 32-byte blocks
        .while (r8)
            vmovdqu ymm4,[rdx]
            vmovdqu [rcx],ymm4
            add rcx,32
            add rdx,32
            dec r8
        .endw
    .endif
    ret
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 12:22:58 AM

Japheth,

Quote
I fully agree! However - almost 100% faster than MB - which allegedly is already rocket-science? How is this possible? You must do something wrong...

I did not touch code I just executed JJ's exe on my machine
and I can do it again now, let see:

Code Select


Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz (SSE4)
loop overhead is approx. 154/100 cycles

2237    cycles for 100 * xmemcpy
5243    cycles for 100 * MbCopy

2240    cycles for 100 * xmemcpy
5176    cycles for 100 * MbCopy

2233    cycles for 100 * xmemcpy
5163    cycles for 100 * MbCopy


--- ok ---

Japheth, why don't you try it in your machine?

Title: Re: reason to switch to 64 Bit Assembler
Post by: japheth on February 22, 2013, 12:35:02 AM

Quote from: habran on February 22, 2013, 12:22:58 AM
Japheth, why don't you try it in your machine?

The fastest machine that I have available is an 5 year old AMD 64 X2 5000+.

Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 22, 2013, 01:00:54 AM

Cool down. If it's faster than the MasmBasic algo, it just means it is faster on your CPU. Well optimised for your CPU.

In case you like it less superficially (d7=destination is align 16+7, s3=src is 16+3 etc):

AMD Athlon(tm) Dual Core Processor 4450B (SSE3)

Algo memcpy MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT rep movs movdqa lps+hps movdqa movdqa Masm32 Habran's
dest-al psllq CeleronM dest-al src-al library Ferrari
Code size ? 70 291 222 200 269 33 104
------------------------------------------------------------------------------------
2048, d0s0-0 561 549 360 439 424 361 547 541
2048, d1s1-0 720 597 410 473 473 421 1061 798
2048, d7s7-0 721 598 412 474 474 412 1060 798
2048, d7s8-1 809 851 1016 578 566 582 802 558
2048, d7s9-2 809 853 1016 567 566 567 1058 798
2048, d8s7+1 810 851 868 563 564 565 819 607
2048, d8s8-0 738 587 404 465 480 416 547 541
2048, d8s9-1 801 848 994 563 564 567 804 606
2048, d9s7+2 824 864 862 565 564 579 1060 798
2048, d9s8+1 808 853 862 564 567 565 803 543
2048, d9s9-0 721 595 411 472 472 409 1061 798
2048, d15s15 722 591 425 480 486 422 1072 798

Your algo is pretty good, but for the (frequent) aligned case, there are four algos that perform better on my AMD.

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 22, 2013, 02:32:43 AM

prescott w/htt
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)

Algo memcpy MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT rep movs movdqa lps+hps movdqa movdqa Masm32 Habran's
dest-al psllq CeleronM dest-al src-al library Ferrari
Code size ? 70 291 222 200 269 33 104
------------------------------------------------------------------------------------
2048, d0s0-0 717 719 608 609 904 610 718 1590
2048, d1s1-0 1100 846 651 651 650 650 4435 3945
2048, d7s7-0 1003 849 656 657 656 655 4437 3952
2048, d7s8-1 1368 1445 1223 868 611 613 4303 3799
2048, d7s9-2 1367 1446 1224 867 611 611 4454 3929
2048, d8s7+1 1338 1446 1188 1342 611 1023 1343 1748
2048, d8s8-0 976 849 656 657 657 656 977 1588
2048, d8s9-1 1332 1470 1212 873 611 612 1333 1733
2048, d9s7+2 1663 1440 1179 1342 611 1023 4150 4085
2048, d9s8+1 1660 1439 1182 1343 610 1023 4026 4014
2048, d9s9-0 1098 850 664 667 664 664 4135 4127
2048, d15s15 770 853 664 665 662 664 4136 4108

Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 22, 2013, 04:17:44 AM

Here the test results:

Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz (SSE4)

Algo memcpy MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT rep movs movdqa lps+hps movdqa movdqa Masm32 Habran's
dest-al psllq CeleronM dest-al src-al library Ferrari
Code size ? 70 291 222 200 269 33 104
------------------------------------------------------------------------------------
2048, d0s0-0 427 223 251 248 247 250 224 292
2048, d1s1-0 275 251 275 270 277 273 274 303
2048, d7s7-0 275 253 282 273 278 276 274 303
2048, d7s8-1 279 271 617 453 247 269 273 303
2048, d7s9-2 279 272 617 450 254 269 274 303
2048, d8s7+1 275 270 621 483 256 272 274 304
2048, d8s8-0 275 255 295 284 288 291 274 303
2048, d8s9-1 275 271 610 452 254 269 273 294
2048, d9s7+2 283 272 611 486 262 276 276 309
2048, d9s8+1 287 277 612 486 261 276 274 309
2048, d9s9-0 280 260 287 280 281 285 280 309
2048, d15s15 280 260 287 281 282 286 280 309

Gunther

Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 22, 2013, 05:30:19 AM

One more - not by accident, #4 was named "CeleronM" ;-)

Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)

Algo memcpy MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT rep movs movdqa lps+hps movdqa movdqa Masm32 Habran's
dest-al psllq CeleronM dest-al src-al library Ferrari
Code size ? 70 291 222 200 269 33 104
------------------------------------------------------------------------------------
2048, d0s0-0 556 566 363 363 373 363 563 1051
2048, d1s1-0 1047 619 421 423 444 423 1683 1782
2048, d7s7-0 567 619 418 420 446 420 1699 1782
2048, d7s8-1 1677 1714 1090 441 1118 1123 1302 1337
2048, d7s9-2 1677 1713 1090 441 1118 1123 1716 1782
2048, d8s7+1 1655 1502 1090 857 979 975 1647 1245
2048, d8s8-0 556 619 420 422 448 422 563 1051
2048, d8s9-1 1664 1714 1083 441 1118 1123 1661 1241
2048, d9s7+2 1668 1502 1081 857 979 975 1762 1495
2048, d9s8+1 1668 1502 1081 857 979 975 1283 1052
2048, d9s9-0 1047 619 420 422 448 422 1686 1497
2048, d15s15 567 619 422 424 446 424 1678 1497

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 08:56:19 AM

as I said before this routine is PARTICULARLY made for UNALIGNED data
that is why I use MOVDQU command
there is no reason to create a sophisticated algorithm for aligned data
you can just use fastest command to do that depending on the ability of your machine

Code Select


    ;r8 can contain sizeof(buffer)
    ; 32-byte (ymm) variant: r8 = count >> 5 is the number of 32-byte chunks.
    ; The loop runs while r8 is nonzero, so a tail of (count mod 32) bytes is not copied.
    ; vmovdqa is the aligned-move form, so src and dest are expected to be aligned.
    .for (rcx=dest,rdx=src,r8=count,r8>>=5¦r8¦rcx+=32,rdx+=32,r8--)
             vmovdqa ymm4,[rdx]
             vmovdqa [rcx],ymm4
    .endfor
    ;or for 16 byte xmm:

    ;r8 can contain sizeof(buffer)
    ; 16-byte (xmm) variant: r8 = count >> 4 chunks, pointers advance by 16 per pass.
    .for (rcx=dest,rdx=src,r8=count,r8>>=4¦r8¦rcx+=16,rdx+=16,r8--)
             movdqa xmm4,[rdx]
             movdqa [rcx],xmm4
    .endfor

    ;for 32 bit machine
    ;eax can contain sizeof(buffer)
    ; Same 16-byte loop with 32-bit registers; eax counts the remaining chunks.
    .for (ecx=dest,edx=src,eax=count,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)
             movdqa xmm4,[edx]
             movdqa [ecx],xmm4
    .endfor
;or for JJ2007
     ; Same 32-bit copy written with .while instead of the .for construct.
     mov ecx,dest
     mov edx,src
     mov eax,sizeof(buffer)
     shr eax,4                      ; byte count -> number of 16-byte chunks
     .while (eax)
             movdqa xmm4,[edx]
             movdqa [ecx],xmm4
             add edx,16
             add ecx,16
             dec eax
      .endw

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 09:17:31 AM

we can also use this:
; Copy the buffer from its end towards its start, 16 bytes per pass.
; eax is both the byte offset of the current chunk and the loop counter,
; so no separate pointer increments are needed.
; sizeof(buffer) is expected to be a multiple of 16 (movdqa is the aligned form).
mov ecx,dest
mov edx,src
mov eax,sizeof(buffer)
sub eax,16                      ; offset of the last 16-byte chunk
.while (SDWORD eax >= 0)        ; >= 0 so the chunk at offset 0 is copied as well
movdqa xmm4,[edx+eax]
movdqa [ecx+eax],xmm4
sub eax,16
.endw

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 09:35:21 AM

we can use macros rather than subs
like this:

Code Select


; xmcopy16 dest, src, size
; Inline copy of `size` bytes from src to dest, 16 bytes per pass with movdqa,
; walking from the last chunk down to offset 0.
;   dest - destination address (loaded into ecx)
;   src  - source address (loaded into edx)
;   size - byte count; expected to be a multiple of 16
; Uses ecx, edx, eax and xmm4. movdqa is the aligned form, so both
; addresses are expected to be 16-byte aligned.
xmcopy16 MACRO dest,src,size
     mov ecx,dest
     mov edx,src                ; parameter was misnamed `crc`, so `src` was never substituted
     mov eax,size
     sub eax,16                 ; offset of the last 16-byte chunk
     .while (SDWORD eax >= 0)
         movdqa xmm4,[edx+eax]
         movdqa [ecx+eax],xmm4
         sub eax,16
     .endw
ENDM
; xmcopy32 dest, src, size
; 64-bit inline copy of `size` bytes from src to dest, 32 bytes per pass,
; walking from the last chunk down to offset 0.
;   dest - destination address (loaded into rcx)
;   src  - source address (loaded into rdx)
;   size - byte count; expected to be a multiple of 32
; Uses rcx, rdx, rax and ymm4. Requires AVX; vmovdqa is the aligned form,
; so both addresses are expected to be 32-byte aligned.
xmcopy32 MACRO dest,src,size
     mov rcx,dest               ; was `mov rcx,size`, which lost the destination pointer
     mov rdx,src                ; parameter was misnamed `crc`, so `src` was never substituted
     mov rax,size
     sub rax,32                 ; offset of the last 32-byte chunk
     .while (SQWORD rax >= 0)
         vmovdqa ymm4,[rdx+rax] ; full 32-byte move to match the 32-byte stride
         vmovdqa [rcx+rax],ymm4
         sub rax,32
     .endw
ENDM

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 10:19:03 AM

here is test on my computer for JJ's exe

Code Select


Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz (SSE4)

Algo           memcpy   MemCo1   MemCo2  MemCoC3  MemCoP4  MemCoC2   MemCoL  xmemcpy
Description       CRT rep movs   movdqa  lps+hps   movdqa   movdqa   Masm32 Habran's
                       dest-al    psllq CeleronM  dest-al   src-al  library  Ferrari
Code size           ?       70      291      222      200      269       33    104
------------------------------------------------------------------------------------
2048, d0s0-0      133      184      205      203      202      204      184    238
2048, d1s1-0      225      206      226      223      227      223      225    249
2048, d7s7-0      225      208      229      225      228      216      225    249
2048, d7s8-1      228      223      501      367      209      219      221    246
2048, d7s9-2      225      218      498      365      206      217      221    245
2048, d8s7+1      221      217      502      390      206      219      221    244
2048, d8s8-0      221      205      238      229      232      235      221    244
2048, d8s9-1      222      218      492      365      204      218      222    245
2048, d9s7+2      220      217      488      390      206      219      221    244
2048, d9s8+1      226      218      491      390      206      219      221    245
2048, d9s9-0      221      206      224      222      224      226      221    245
2048, d15s15      221      206      226      222      225      226      221    245


--- ok ---

It is interesting how my code has steady speed in different sizes
and it is interesting how older processors perform in different way than newer

thank you JJ for taking time to write testing programs :t
however, I suspect that you are pulling my leg because I don't have time nor desire to learn your BSIC$ ;)
(for the reason I mentioned before)
when I talk about a beauty of the source code I talk about visual effect ,readability and functionality
sometime your programs can be maybe even faster than someone else's but no one will try to read it
because most of your MULTO IMPORTANTE routines are hidden either in $$$$$ macros or %$#% external functions
however, it is a pleasure to exchange opinions and diversity in programming techniques :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 10:28:04 AM

Japheth,

QuoteThe fastest machine that I have available is an 5 year old AMD 64 X2 5000+.

I saw on Google that they are advertising new laptops for $249 dollars (probably with AVX) :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 22, 2013, 11:03:36 AM

actually, Jochen's MasmBasic is a very productive library
you can bet many of the routines are quite fast
and - many of the functions aren't found in the masm32 library
i would use it more often, myself, except for one thing....

i am trying to learn assembler for windows
high-level constructs mask the assembler code i am trying to learn
the same may be said for many of your macros

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 01:24:53 PM

hi dedndave,

Quoteactually, Jochen's MasmBasic is a very productive library
you can bet many of the routines are quite fast
and - many of the functions aren't found in the masm32 library

there is no doubt about :t
we are talking here about readability of sources :bgrin:
as soon as I look at his source code I feel like piercing my eyes with a cactus thorn
programs that look like this:"LET$!@#$%^&*@#$%^&*!"
who can have now-days enough patience and concentration to follow this code
"Mission Impossible 32" with JJ2007 as main actor (Tom Cruise refused the role because of the age)
and he is hiding his most important sources from public eyes like double agent 007
another drawback is that Jochen's MasmBasic is 32 bit and I am programming only 64 bit

I love assembler that's why I joined to this forum otherwise I would be a member of some BASIC community

please don't tell to JJ about our conversation, I don't want him to feel bad because I like him and appreciate his brains

Macros are helpful to make programs more readable but they should be visible to programmers and named properly :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 01:51:55 PM

thanks Gunther for your contribution to this topic :t

QuoteIntel(R) Core(TM) i7-3770 CPU @ 3.40GHz (SSE4)

Speedy Gonzales like my "Ferrari Testarossa xmemcpy"
(AVX tires would make it even faster)
(https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRIvLk3BkI91urzhSOVGY3L48NHdusYFcSO4Q9XukpSpHx9A398uQ)
thanks to our God Father JJ Corleone for naming it so

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 24, 2013, 09:04:41 AM

Japheth,

QuoteI fully agree! However - almost 100% faster than MB - which allegedly is already rocket-science? How is this possible? You must do something wrong...

I found this explanation in "INTEL® 64 AND IA-32 PROCESSOR ARCHITECTURES" manual

Quote
2.3.5.1 Efficient Handling of Alignment Hazards
The cache and memory subsystems handles a significant percentage of instructions
in every workload. Different address alignment scenarios will produce varying performance
impact for memory and cache operations. For example, 1-cycle throughput of
L1 (see Table 2-21) generally applies to naturally-aligned loads from L1 cache. But
using unaligned load instructions (e.g. MOVUPS, MOVUPD, MOVDQU, etc.) to access
data from L1 will experience varying amount of delays depending on specific microarchitectures
and alignment scenarios.
Table 2-21. Performance Impact of Address Alignments of MOVDQU from L1
Throughput (cycle) Intel Core i7 45 nm Intel Core 65 nm Intel
Processor Microarchitecture CoreMicroarchitecture
________________________________________________________________________
Alignment Scenario 06_1AH 06_17H 06_0FH
16B aligned 1 2 2
________________________________________________________________________
Not-16B aligned, not
cache split
1 ~2 ~2
________________________________________________________________________
Split cache line
boundary ~4.5 ~20 ~20
________________________________________________________________________

Because my processor is a 2.3 gig Core i7 with a lot of cache,
it takes only 1 cycle for either MOVDQU or MOVDQA

Title: Re: reason to switch to 64 Bit Assembler
Post by: drifter on February 26, 2013, 05:49:48 PM

Intel(R) Core(TM) i7 CPU 860 @ 2.80GHz (SSE4)

Algo memcpy MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT rep movs movdqa lps+hps movdqa movdqa Masm32 Habran's
dest-al psllq CeleronM dest-al src-al library Ferrari
Code size ? 70 291 222 200 269 33 104
------------------------------------------------------------------------------------
2048, d0s0-0 196 257 252 252 583 235 600 444
2048, d1s1-0 460 274 687 690 284 277 704 444
2048, d7s7-0 468 277 286 286 289 281 293 444
2048, d7s8-1 302 299 732 521 240 253 705 444
2048, d7s9-2 302 300 867 607 256 253 294 445
2048, d8s7+1 294 726 640 551 239 247 265 444
2048, d8s8-0 471 280 700 288 287 282 293 443
2048, d8s9-1 295 303 637 522 272 253 704 444
2048, d9s7+2 300 301 634 553 289 593 703 444
2048, d9s8+1 301 724 633 552 277 247 292 444
2048, d9s9-0 469 670 694 269 696 282 294 446
2048, d15s15 415 280 289 251 289 284 293 447

--- ok ---

on: February 10, 2013, 11:57:14 PM Gunther wrote:

QuoteThere are a few applications which really need more than 4 GB RAM (large data bases for example), but others do not.

The transporters of the future will need to access 7,000,000,000,000,000,000,000,000,000 points of data - that's a 795,807,864,054,000.1 terabyte address space :icon_eek:

Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 26, 2013, 09:24:23 PM

Hi drifter,

Quote from: drifter on February 26, 2013, 05:49:48 PM
The transporters of the future will need to access 7,000,000,000,000,000,000,000,000,000 points of data - that's a 795,807,864,054,000.1 terabyte address space :icon_eek:

that might be, but that could be reached with a 64 bit architecture. But what's with the whole bunch of other applications? By the way, you'll find a few 64 bit applications in the forum, which I've written.

Gunther

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 26, 2013, 10:08:02 PM

hello drifter,
welcome to the forum :biggrin:
interesting to see the difference in speed with different processors
yours is an i7 2.8 gig and mine is an i7 2.3, but the speed is double
I am curious why is that?
Gunther, yours is an i7 3.4 gig and still slower than qWord's and mine

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 26, 2013, 10:29:12 PM

here are specifications:

Intel® Core™ i7-3610QM Processor Intel® Core™ i7-3770 Processor
(6M Cache, up to 3.30 GHz) (8M Cache, up to 3.90 GHz)
Specifications Specifications
Essentials Essentials
Status   Launched Status     Launched
Launch Date   Q2'12 Launch Date   Q2'12
Processor Number   i7-3610QM Processor Number   i7-3770
# of Cores   4 # of Cores   4
# of Threads   8 # of Threads   8
Clock Speed   2.3 GHz Clock Speed   3.4 GHz
Max Turbo Frequency   3.3 GHz Max Turbo Frequency   3.9 GHz
Intel® Smart Cache   6 MB Intel® Smart Cache   8 MB
Bus/Core Ratio   23 Bus/Core Ratio   34
DMI   5 GT/s DMI   5 GT/s
Instruction Set   64-bit Instruction Set   64-bit
Instruction Set Extensions   AVX Instruction Set Extensions   SSE4.1/4.2, AVX
Embedded Options Available   No Embedded Options Available   Yes
Lithography   22 nm Lithography   22 nm
Max TDP   45 W Max TDP   77 W
Recommended Customer Price   TRAY: $378.00 Recommended Customer Price   TRAY: $294.00
BOX : $305.00

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 26, 2013, 11:55:54 PM

the number of clock cycles it takes for a processor to do something doesn't make a very good benchmark
you are comparing one algo to another
not comparing one cpu to another

Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 27, 2013, 01:07:02 AM

Hi habran,

Quote from: dedndave on February 26, 2013, 11:55:54 PM
the number of clock cycles it takes for a processor to do something doesn't make a very good benchmark

that's the answer.

Gunther

Title: Re: reason to switch to 64 Bit Assembler
Post by: drifter on February 27, 2013, 05:34:08 AM

on: February 26, 2013, 09:24:23 PM Gunther wrote:

Quotebut that could be reached with a 64 bit architecture

I thought 64 bit architecture could only reach 18,446,744,073,709,551,616 bits?

QuoteBy the way, you'll find a few 64 bit applications in the forum, which I've written.

Thanks! I definitely look forward to studying those (once I get up to speed on 32 bits).

Title: Re: reason to switch to 64 Bit Assembler
Post by: drifter on February 27, 2013, 05:45:28 AM

on: February 26, 2013, 10:08:02 PM habran wrote:

Quoteyours is an i7 2.8 gig and mine is an i7 2.3, but the speed is double
I am curious why is that?

I probably didn't have my computer set up for an optimum running of the race - it was just my standard operating configuration. I did close down all other applications, including the ones I didn't need in the toolbar - there's probably a lot more I could have shut down in task manager.

Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 27, 2013, 05:53:51 AM

Hi drifter,

Quote from: drifter on February 27, 2013, 05:34:08 AM
I thought 64 bit architecture could only reach 18,446,744,073,709,551,616 bits?

please, have a look here: https://en.wikipedia.org/wiki/64-bit_architecture (https://en.wikipedia.org/wiki/64-bit_architecture)

Gunther

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 27, 2013, 07:02:09 AM

Frederick,
these tests are normally written to run with elevated priority
you shouldn't have to close down background apps to get decent results
provided the guy followed a few simple guidelines :P

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 08:51:51 AM

Quote from: Gunther on February 27, 2013, 01:07:02 AM
Hi habran,

Quote from: dedndave on February 26, 2013, 11:55:54 PM
the number of clock cycles it takes for a processor to do something doesn't make a very good benchmark

that's the answer.
Gunther

.if (Gunther && dedndave)
would please explain
to habran
jmp understand
nop
nop
understand:
return knowlage
.endif
:bgrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: drifter on February 27, 2013, 08:55:13 AM

on: February 27, 2013 at 07:02:09 AM dedndave wrote:

Quotethese tests are normally written to run with elevated prioriity
you shouldn't have to close down background apps to get decent results

I put this computer together a couple of years ago and I've been very happy with it:

Computer:
MSI P55-GD80 Motherboard w/Intel Core i7CPU @ 2.8 Ghz
2 x 27" Samsung LCD monitors
2 x Seagate ST32000641AS - Barracuda XT 2 TB Hard drives
16 GB RAM
Windows 8 64-bit

Desk:
Apogee Rosetta 800 AD/DA 8-channel converter
Lexicon PCM80 Digital Effects Processor
Lexicon PCM90 Digital Reverberator
Focusrite ISA-110 Limited Edition pre-amp/equalizer
Avalon Vt-737sp Vacuum Tube pre-amp/compressor/equalizer

Miscellaneous:
Neumann U87 Ai Condenser microphone
M-Audio Keystation Pro88 keyboard

Software:
Cakewalk Sonar/Dimension Pro/Rapture (64-bit)
Sony Sound Forge/Acid/Vegas/CD Architect /DVD Architect
Spectrasonics Stylus RMX/Trilian/Omnisphere

and of course:
MASM32
IDA Pro 5.3
Visual Studio 2005

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 09:10:54 AM

hey drifter,
that looks impressive and neat :shock:
how did you pay your wife to arrange all this for you ;)
however, your screens are too high and you probably get tired quickly of looking at them :dazzled:

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 27, 2013, 09:16:14 AM

when we run these tests....

do:
compare algo A to algo B on computer 1
compare algo A to algo B on computer 2

do not:
compare algo A on computer 1 to algo A on computer 2
compare algo B on computer 1 to algo B on computer 2

Title: Re: reason to switch to 64 Bit Assembler
Post by: drifter on February 27, 2013, 09:21:49 AM

on: February 27, 2013 at 09:10:54 AM habran wrote:

Quotehow did you pay your wife to arrange all this for you

I have a Russian wife - she still thinks it's her duty...

Quotehowever, your screens are too high and you probably get tired quickly of looking at them :dazzled:

If I have to read something, I take it to kinkos have it printed..

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 09:44:23 AM

drifter

QuoteI have a Russian wife - she still thinks it's her duty...

I am *much *much younger than you but I have learned in my life that
NOTHING is for free in this world ;)

*much = 1 year :lol:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 09:48:54 AM

Quote from: dedndave on February 27, 2013, 09:16:14 AM
when we run these tests....

do:
compare algo A to algo B on computer 1
compare algo A to algo B on computer 2

do not:
compare algo A on computer 1 to algo A on computer 2
compare algo B on computer 1 to algo B on computer 2

is that how you test which processor is faster? :dazzled:

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 27, 2013, 10:06:51 AM

no - we don't want to know which processor is faster :P

we want to know which algorithm is best, let's say, overall

Quotedo:
compare algo A to algo B on computer 1
compare algo A to algo B on computer 2

do not:
compare algo A on computer 1 to algo A on computer 2
compare algo B on computer 1 to algo B on computer 2

the real information is in (algo B)/(algo A) on a given processor
you might compare that ratio with the same ratio on another processor

we aren't here to measure cpu's :t

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 11:03:43 AM

Quote from: dedndave on February 27, 2013, 10:06:51 AM
we aren't here to measure cpu's :t

so, you think I was rude to be curious :shock:

Quote
interesting to see the difference in speed with different processors
yours is an i7 2.8 gig and mine is an i7 2.3, but the speed is double
I am curious why is that?
Gunther, yours is an i7 3.4 gig and still slower than qWord's and mine

that was not a provocation, it was a curious technical question:
to put it that way:
you are a salesman and I ask you as a customer:
"why would I buy i7 6.8 gig if i7 2.3 gig runs faster?" :icon_confused:
and you tell me: "i7 6.8 gig sounds better, you moron" :icon_mrgreen:

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 27, 2013, 01:46:01 PM

a clock cycle on one cpu is not the same as a clock cycle on another (moron) :lol:

someone posted the comparison of specs - then removed it
but, i suspect the ratio of internal to external bus clocks might have something to do with it

if you want to know which is fastest, run a real-time benchmark test
set up a test to measure minutes and seconds to accomplish some specific task
then run the same test on both machines
you want it to be fairly long, say a few minutes or so

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 03:05:13 PM

now you're talking... :lol:
I thought bicycle, motorcycle, reversecycle, evercycle wtfcycle... are the same :dazzled:
thanks dedndave :t

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 09:55:25 PM

dedndave, how do you explain this: :greenclp:

QuoteClock Speed
Clock speed is the rate at which a processor can complete a processing cycle. It is typically measured in megahertz or gigahertz. One megahertz is equal to one million cycles per second, while one gigahertz equals one billion cycles per second. This means a 1.8 GHz processor has twice the clock speed of a 900 MHz processor.

However, it is important to note that a 1.8 GHz CPU is not necessarily twice as fast as a 900 MHz CPU. This is because different processors often use different architectures. For example, one processor may require more clock cycles to complete a multiplication instruction than another processor. If the 1.8 GHz CPU can complete a multiplication instruction in 4 cycles, while the 900 MHz CPU takes 7 cycles, the 1.8 GHz processor will be more than twice as fast as the 900 MHz processor. Conversely, if the 1.8 GHz processor takes more cycles to perform the instruction, it will be less than 2x as fast as the 900 MHz processor.

Other factors, such as a computer's bus speed, cache size, speed of the RAM, and hard drive speed also contribute to the overall performance of the machine. Therefore, while the processor's clock speed is a significant indicator of how fast a computer is, it is not the only factor that matters.

you can find it here (http://pc.net/glossary/definition/clockspeed)

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 28, 2013, 12:02:55 AM

that's what i've been trying to tell you - lol

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 28, 2013, 06:02:48 AM

are you sure?

QuoteClock speed is the rate at which a processor can complete a processing cycle. It is typically measured in megahertz or gigahertz.

so, if some computer needs 444 cycles and another computer needs 222 cycles for the same job, does that mean that the one with 444 cycles is faster because it has bigger number? :exclaim:

Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 28, 2013, 11:36:20 AM

write a test app to know the answer :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 28, 2013, 01:07:54 PM

dedndave,
your motto is: "Never give up" :biggrin:
my motto is : "Never give in" 8)
you and me together unbreakable :t

The MASM Forum

64 bit assembler => 64 bit assembler. Conceptual Issues => Topic started by: habran on February 10, 2013, 08:03:46 PM