The MASM Forum

64 bit assembler => 64 bit assembler. Conceptual Issues => Topic started by: habran on February 10, 2013, 08:03:46 PM

Title: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 08:03:46 PM
here are three examples which will tell you why
first is 64 beauty:

; xmemcpy (64 bit): copies bytes one at a time from [rdx] to [rcx], r8 times.
; The body works directly on rcx, rdx and r8 instead of the named parameters
; (presumably the Win64 argument registers for dest, src and count - confirm).
; Returns the original value of rcx in rax.
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
  ; keep the original rcx for the return value; the loop advances rcx
  mov r9,rcx
  ; nothing is copied when both pointers are equal
  .if (rcx != rdx)
  ; .for (init ¦ condition ¦ step): no init; runs while r8 is non-zero;
  ; each pass moves one byte through al, advances both pointers, decrements r8
  .for (¦r8¦al=[rdx],[rcx]=al,rcx++,rdx++,r8--)
    .endfor
  .endif
  ; return the saved starting value of rcx
  mov rax,r9
  ret
xmemcpy ENDP
compiled to this:
000000014004DA78  push        rbp 
000000014004DA79  mov         rbp,rsp
000000014004DA7C  mov         r9,rcx
000000014004DA7F  cmp         rcx,rdx
000000014004DA82  je          xmemcpy+22h (14004DA9Ah)
000000014004DA84  jmp         xmemcpy+1Bh (14004DA93h)
000000014004DA86  mov         al,byte ptr [rdx]
000000014004DA88  mov         byte ptr [rcx],al
000000014004DA8A  inc         rcx 
000000014004DA8D  inc         rdx 
000000014004DA90  dec         r8   
000000014004DA93  and         r8,r8
000000014004DA96  je          xmemcpy+22h (14004DA9Ah)
000000014004DA98  jmp         xmemcpy+0Eh (14004DA86h)
000000014004DA9A  mov         rax,r9
000000014004DA9D  leave           
000000014004DA9E  ret             


the second is the same routine but 32 bit:

; xmemcpy (32 bit): copies count bytes one at a time from src to dest.
; All three arguments are DWORDs taken from the stack.
; ebx is used as the byte counter and is preserved through USES.
; Returns dest in eax.
xmemcpy PROC USES ebx dest:DWORD,src:DWORD,count:DWORD
  mov ecx,dest
  ; nothing is copied when both pointers are equal
  .if (ecx != src)
  ; .for (init ¦ condition ¦ step): load edx=src and ebx=count, run while ebx
  ; is non-zero; each pass moves one byte through al, advances both pointers
  ; and decrements ebx
  .for (edx=src,ebx=count¦ebx¦al=[edx],[ecx]=al,ecx++,edx++,ebx--)
    .endfor
  .endif
  ; ecx was advanced by the loop, so reload dest for the return value
  mov eax,dest
  ret
xmemcpy ENDP
compiled to this:
00401020 55              push    ebp
00401021 8bec            mov     ebp,esp
00401023 53              push    ebx
00401024 8b4d08          mov     ecx,dword ptr [ebp+8]
00401027 3b4d0c          cmp     ecx,dword ptr [ebp+0Ch]
0040102a 7415            je      xmemcpy+0x21 (00401041)
0040102c 8b550c          mov     edx,dword ptr [ebp+0Ch]
0040102f 8b5d10          mov     ebx,dword ptr [ebp+10h]
00401032 eb07            jmp     xmemcpy+0x1b (0040103b)
00401034 8a02            mov     al,byte ptr [edx]
00401036 8801            mov     byte ptr [ecx],al
00401038 41              inc     ecx
00401039 42              inc     edx
0040103a 4b              dec     ebx
0040103b 23db            and     ebx,ebx
0040103d 7402            je      xmemcpy+0x21 (00401041)
0040103f ebf3            jmp     xmemcpy+0x14 (00401034)
00401041 8b4508          mov     eax,dword ptr [ebp+8]
00401044 5b              pop     ebx
00401045 5d              pop     ebp
00401046 c20c00          ret     0Ch

and here is the C version:

/*
 * xmemcpy: copies count bytes from src to dest, one byte at a time, in
 * ascending address order.
 *
 * dest  - destination buffer
 * src   - source buffer (only read)
 * count - number of bytes to copy
 *
 * Returns dest. Nothing is copied when dest and src are the same pointer or
 * when count is 0.
 *
 * NOTE(review): count is a signed int and a negative value passes the
 * `if (count)` test; callers are presumably expected to pass count >= 0 -
 * confirm.
 */
void* xmemcpy(void *dest, const void *src, int count)
{
  unsigned char *byte_dest=(unsigned char *)dest;
  /* the cast drops const, but byte_src is only read below */
  unsigned char *byte_src=(unsigned char *)src;

  if (byte_dest != byte_src)
  {
    if (count)
    {
      for (;;)
      {
        *byte_dest=*byte_src;
        /* leave before the increments once the last byte has been copied */
        if (!--count) break;
        ++byte_dest;
        ++byte_src;
      }
    }
  }
  return dest;
}
compiled to this:
01271AF0  push        ebp 
01271AF1  mov         ebp,esp 
01271AF3  sub         esp,0D8h 
01271AF9  push        ebx 
01271AFA  push        esi 
01271AFB  push        edi 
01271AFC  lea         edi,[ebp-0D8h] 
01271B02  mov         ecx,36h 
01271B07  mov         eax,0CCCCCCCCh 
01271B0C  rep stos    dword ptr es:[edi] 
01271B0E  mov         eax,dword ptr [dest] 
01271B11  mov         dword ptr [byte_dest],eax 
01271B14  mov         eax,dword ptr [src] 
01271B17  mov         dword ptr [byte_src],eax 
01271B1A  mov         eax,dword ptr [byte_dest] 
01271B1D  cmp         eax,dword ptr [byte_src] 
01271B20  je          xmemcpy+63h (01271B53h) 
01271B22  cmp         dword ptr [count],0 
01271B26  je          xmemcpy+63h (01271B53h) 
01271B28  mov         eax,dword ptr [byte_dest] 
01271B2B  mov         ecx,dword ptr [byte_src] 
01271B2E  mov         dl,byte ptr [ecx] 
01271B30  mov         byte ptr [eax],dl 
01271B32  mov         eax,dword ptr [count] 
01271B35  sub         eax,1 
01271B38  mov         dword ptr [count],eax 
01271B3B  jne         xmemcpy+4Fh (01271B3Fh) 
01271B3D  jmp         xmemcpy+63h (01271B53h) 
01271B3F  mov         eax,dword ptr [byte_dest] 
01271B42  add         eax,1 
01271B45  mov         dword ptr [byte_dest],eax 
01271B48  mov         eax,dword ptr [byte_src] 
01271B4B  add         eax,1 
01271B4E  mov         dword ptr [byte_src],eax 
01271B51  jmp         xmemcpy+38h (01271B28h) 
01271B53  mov         eax,dword ptr [dest] 
01271B56  pop         edi 
01271B57  pop         esi 
01271B58  pop         ebx 
01271B59  mov         esp,ebp 
01271B5B  pop         ebp 
01271B5C  ret


Do you need more reasons? ;)
Title: Re: reason to switch to 64 Bit Assembler
Post by: MichaelW on February 10, 2013, 08:13:54 PM
Am I missing something here, it looks like the 64-bit code is moving a byte at a time. Should not most of the move be done 64-bits at a time?
Title: Re: reason to switch to 64 Bit Assembler
Post by: Vortex on February 10, 2013, 08:16:20 PM
Hi Habran,

Did you test the same C code with a 64-bit C compiler?
Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 10, 2013, 09:00:23 PM
Timings would be nice :biggrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 09:04:18 PM
good point Vortex :t
here it is the 64 bit in C:

/*
 * xmemcpy: copies count bytes from src to dest, one byte at a time, in
 * ascending address order.
 *
 * dest  - destination buffer
 * src   - source buffer (only read)
 * count - number of bytes to copy; UINT_PTR is not declared in this snippet
 *         (presumably the Windows pointer-sized unsigned integer - confirm)
 *
 * Returns dest. Nothing is copied when dest and src are the same pointer or
 * when count is 0.
 */
void* xmemcpy(void *dest, const void *src, UINT_PTR count)
{
  unsigned char *byte_dest=(unsigned char *)dest;
  /* the cast drops const, but byte_src is only read below */
  unsigned char *byte_src=(unsigned char *)src;

  if (byte_dest != byte_src)
  {
    if (count)
    {
      for (;;)
      {
        *byte_dest=*byte_src;
        /* leave before the increments once the last byte has been copied */
        if (!--count) break;
        ++byte_dest;
        ++byte_src;
      }
    }
  }
  return dest;
}

0000000140063630  mov         qword ptr [rsp+18h],r8
0000000140063635  mov         qword ptr [rsp+10h],rdx
000000014006363A  mov         qword ptr [rsp+8],rcx
000000014006363F  sub         rsp,18h
0000000140063643  mov         rax,qword ptr [dest]
0000000140063648  mov         qword ptr [byte_dest],rax
000000014006364D  mov         rax,qword ptr [src]
0000000140063652  mov         qword ptr [rsp],rax
0000000140063656  mov         rax,qword ptr [rsp]
000000014006365A  cmp         qword ptr [byte_dest],rax
000000014006365F  je          xmemcpy+7Bh (1400636ABh)
0000000140063661  cmp         qword ptr [count],0
0000000140063667  je          xmemcpy+7Bh (1400636ABh)
0000000140063669  mov         rax,qword ptr [byte_dest]
000000014006366E  mov         rcx,qword ptr [rsp]
0000000140063672  movzx       ecx,byte ptr [rcx]
0000000140063675  mov         byte ptr [rax],cl
0000000140063677  mov         rax,qword ptr [count]
000000014006367C  sub         rax,1
0000000140063680  mov         qword ptr [count],rax
0000000140063685  cmp         qword ptr [count],0
000000014006368B  jne         xmemcpy+5Fh (14006368Fh)
000000014006368D  jmp         xmemcpy+7Bh (1400636ABh)
000000014006368F  mov         rax,qword ptr [byte_dest]
0000000140063694  add         rax,1
0000000140063698  mov         qword ptr [byte_dest],rax
000000014006369D  mov         rax,qword ptr [rsp]
00000001400636A1  add         rax,1
00000001400636A5  mov         qword ptr [rsp],rax
00000001400636A9  jmp         xmemcpy+39h (140063669h)
00000001400636AB  mov         rax,qword ptr [dest]
00000001400636B0  add         rsp,18h
00000001400636B4  ret             
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 09:06:37 PM
MichaelW,
NO :icon_exclaim: :biggrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 09:26:42 PM
JJ2007,
QuoteTimings would be nice :biggrin:
I agree, but I have to CONFESS that I don't know that part :icon_mrgreen:
Can you please do it for mee :biggrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 10:05:27 PM
MichaelW,
if the data is aligned to 8, 16 or 32 bytes
then it is possible to do that, as in this example:

align 8
; AXCHARINDEX: three fields - nLine, lpLine and nCharInLine
AXCHARINDEX struct
  nLine       SDWORD ?
  lpLine     INT_PTR ?
  nCharInLine SDWORD ?
AXCHARINDEX ends

.code

    ; Copy one AXCHARINDEX from ciPoint1 to ciPoint, 8 bytes per step.
    ; rep movsq copies from [rsi] to [rdi], so ciPoint is the destination.
    ; ciPoint and ciPoint1 are declared outside this snippet.
    lea  rdi,ciPoint                ;points to first index
    lea  rsi,ciPoint1               ;points to second index
    ; number of qwords to move; assumes sizeof(AXCHARINDEX) is a multiple of 8 - TODO confirm
    mov  ecx,sizeof(AXCHARINDEX)/8
    rep  movsq


otherwise, if they are chars for example, it is not convenient to do that always
however, we can write more complex EG: xxxmemcpy which would be able to calculate the size of data
and than first transfer all possible QWORDS and than if left last DWORD and than if left last WORD and than if left last BYTE
EG: data size is 256+7 EQU  32 QWORDS, 1 DWORD, 1 WORD, and 1 BYTE

in my case I used simple xmemcpy because of simplicity

have a look at the code above
It takes about the same number of bytes as a call to some function, but it is right there at the present location 
I assure you that it is faster and more appropriate than some sophisticated function, especially if that function is not in the cache at the time
It would be even more appropriate to create a MACRO to do the same job :biggrin:
 
Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 10, 2013, 10:37:52 PM
Hi habran,

there's no doubt that the 64 bit world is the future. But for the next years both - 32 bit and 64 bit - will coexist. I'm inside the 64 bit world since a few years; the first Linux kernel came out 2001; Windows was some years later. Under 32 bit the Application Binary Interface (ABI) are the same. So, one could write code for both platforms. That's over, because the 64 bit ABIs are very different. We can use code for both platforms only in rare cases.

All things considered: there are advantages and disadvantages.

Gunther
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 10, 2013, 11:12:33 PM
hey Gunther,
QuoteBut for the next years both - 32 bit and 64 bit - will coexist.
no doubt they will, because of the accumulated 32 bit apps
however, IMO to continue to write 32 bit programs would be like holding with your nails on the cliff
in my case I would forever like to program in assembly C64, I felt like I had a chocolate in my mouth when I did that
but who needs any more those apps :(
now, I have the same sensation as with C64 when I write 64 bit JWASM :icon_exclaim: :icon_exclaim: :icon_exclaim:
if it was not for excellent JWASM (thanks Japheth :t) I would maybe return to C and C#
it looks like 64 is my favorite number
I am a lazy person by nature and I always think (I am not lazy to think ;))
about a fast and simple way to finish anything (except sex :t)
I don't walk to shop 10 time to bring home the grocery, I use a car for that
you may say that walking is healthy but it is not if you have to carry bloody grocery in your hands :bgrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 10, 2013, 11:57:14 PM
Hi habran,

things are a bit more complicated.

The transition from 32 bit to 64 bit will take more time than you might think. We've seen in the past the transition from 16 bit to 32 bit (by the way, I've never discussed C64 programming in my posts). That process had a timeline of approximately 15 years. But: we had a lot of memory trouble under 16 bit, which wasn't easy (XMS, EMS, several DOS extenders etc). The pressure was enormous, because a lot of applications at the beginning of the 90s were very memory hungry.

That's not the case with the transition from 32 to 64 bit. There are a few applications which really need more than 4 GB RAM (large databases for example), but others do not. So, we can reckon with at least 15 years for that transition. That's a long time; therefore it makes sense to write for both worlds.

I won't argue that you should write 32 bit code. Write your 64 bit applications and that's fine. But have a look at the difficulties: different ABIs for the main platforms, some people in our forum can't run 64 bit operating systems (hardware limitations), other people like 32 bit programming etc. etc. A bit more tolerance for other points of view wouldn't be bad.

Gunther
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 10, 2013, 11:58:55 PM
many of us use MichaelW's code timing macros
http://masm32.com/board/index.php?topic=49.0 (http://masm32.com/board/index.php?topic=49.0)

attached is a 32-bit program for timing code (assemble as a console app)
you may or may not want to adapt the code to 64-bit
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 12:04:33 AM
thanks dedndave :t
I will look at it tomorrow and see if it pays off to translate it to x64 :biggrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 11, 2013, 12:11:47 AM
i don't know if Michael has plans to make a 64-bit version of his macro set
shouldn't be too hard   :P
Title: Re: reason to switch to 64 Bit Assembler
Post by: qWord on February 11, 2013, 12:20:14 AM
For a x64 adaption of MichaelW's counter macro see my post in this thread: http://masm32.com/board/index.php?topic=49.msg130#msg130.
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 05:49:30 AM
thanks qWord :biggrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: qWord on February 11, 2013, 06:23:36 AM
habran,
your compare between C and ASM in your first post is unfair, because is obviously a debug build.
Also, a smarter "algorithm" would probably produce much better results. e.g. something like this (not tested):
/*
 * xmemcpy: copies cb bytes from src to dest. Pointer-sized words are moved
 * first, then the leftover bytes. Returns dest.
 *
 * dest - destination buffer
 * src  - source buffer
 * cb   - number of bytes to copy
 *
 * NOTE(review): the buffers are accessed through char** and int* casts, which
 * presumably requires dest and src to be suitably aligned for those types -
 * confirm against the target platform.
 */
void* xmemcpy(void *dest, void *src, unsigned int cb)
{
/* number of whole pointer-sized words: cb/8 with 8-byte pointers, else cb/4 */
unsigned int cnt1 = cb>>((sizeof(char*)==8)?3:2);
/* leftover bytes after the word copy: cb modulo the pointer size */
unsigned int cnt2 = cb&(sizeof(char*)-1);
char** p1 = (char**)dest;
char** p2 = (char**)src;
char* p3;
char* p4;

/* word loop: one pointer-sized element per pass */
for(;cnt1--;p1++,p2++)
*p1 = *p2;

/* continue byte-wise from where the word loop stopped */
p3 = (char*)p1;
p4 = (char*)p2;

/* with 8-byte pointers up to 7 bytes remain: move 4 of them as one int */
if (sizeof(char*) == 8) // dead code for x32
if(cnt2&4)
{ *((int*)p3)= *((int*)p4);
p3+=4;p4+=4;cnt2-=4;
}

/* tail loop: the remaining bytes, one at a time */
for(;cnt2--;p3++,p4++)
*p3 = *p4;

return dest;
}
Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 11, 2013, 09:07:53 AM
Quote from: qWord on February 11, 2013, 06:23:36 AM
habran,
your compare between C and ASM in your first post is unfair, because is obviously a debug build.
Also, a smarter "algorithm" would probably produce much better results.

I wonder how efficient this code from the "64 beauty" example is (can't test it, unfortunately):

000000014004DA90  dec         r8   
000000014004DA93  and         r8,r8 <<< no need for that, the flag is already set
000000014004DA96  je          xmemcpy+22h (14004DA9Ah) <<< why not jne xmemcpy+0Eh?? static branch prediction rules would suggest that it is even faster...
000000014004DA98  jmp         xmemcpy+0Eh (14004DA86h) <<< can be dropped entirely.
000000014004DA9A  mov         rax,r9


Again, timings would be nice ;-)
Title: Re: reason to switch to 64 Bit Assembler
Post by: frktons on February 11, 2013, 10:35:01 AM
For what I recall, a memcopy done with native 64 bit registers, in 64
bit systems, is the fastest solution found when we tested, a couple
of years ago, XMM/SSE2 code for this kind of operation.

The test was done on a 32 MB buffer that was simply blanked, not really a memcopy
but it was set just to measure the performance of REP STOSQ vs MOVNTDQ
and measured via rdtsc.
The results were like:
Quote
Clearing done
117,940,861 clocks for a 33,554,432 bytes buffer with using REP STOSQ

Clearing done
1,208,750,068 clocks for a 33,554,432 bytes buffer with using MOVNTDQ

Code from Alex.

I agree with habran, as I said at the time, for many reasons, but I also
understand why years of work are not easily dropped or rewritten.  :t


Frank
Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 11, 2013, 10:51:18 AM
Frank,

Quote from: frktons on February 11, 2013, 10:35:01 AM
For what I recall, a memcopy done with native 64 bit registers, in 64
bit systems, is the fastest solution found when we tested, a couple
of years ago, XMM/SSE2 code for this kind of operation.

the situation has changed dramatically since the advent of Intel's AVX. We should do the test again.

Gunther
Title: Re: reason to switch to 64 Bit Assembler
Post by: frktons on February 11, 2013, 11:05:20 AM
Quote from: Gunther on February 11, 2013, 10:51:18 AM

the situation has changed dramatically since the advent of Intel's AVX. We should do the test again.

Gunther

I think a new test can only confirm that 64 bit mov operations are
faster than 32 bit ones. If anyone has a new processor, say habran, and
the skill to use AVX code, he could do it.
Not that difficult if he really likes to do the test, I can post the 64 bit MASM
code that I used 2 years ago. No AVX because neither Alex's, nor my PC are
AVX able.
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 11:35:59 AM
I have tested a speed of 64 bit and result is 1:4 against C
for one pass JWASM is 80   or 50h
and C is 207    intersting :P (JJ2007) 0CFh
JJ2007, you are correct that more optimization could be done to it
however my intent in this case was not so much focused on that but on beauty and simplicity of 64 bit JWASM
I have used ".for" loop which is portable, readable and easy to use but it can not beat human eyes and brains

thank you Frank for supporting me that's what friends are for :t

qWord,
Quoteyour compare between C and ASM in your first post is unfair, because is obviously a debug build
all of them are debug built because I needed to read a code in memory :icon_eek:
your function looks good and I will test it later
Quote
however, we can write more complex EG: xxxmemcpy which would be able to calculate the size of data
and than first transfer all possible QWORDS and than if left last DWORD and than if left last WORD and than if left last BYTE
EG: data size is 256+7 EQU  32 QWORDS, 1 DWORD, 1 WORD, and 1 BYTE
I think I have seen already on internet written similar function but I can't remember was it in C or assembler
UNFAIR :icon_eek:
what is fair in this world??? life is a bitch!
these days even death is not fair any more, if you are rich you buy yourself brand new organs and live as long as you want :P
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 11:43:07 AM
hi Frank,
I can try to do that, though I have not learned AVX yet
I am ready for another challenge, I am not a chicken :lol:
I have to go now to earn my living, "I'll be back 8)"
Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 11, 2013, 11:48:15 AM
Quote from: habran on February 11, 2013, 11:35:59 AMintersting :P (JJ2007) 0CFh

What do you mean with that? ::)
Title: Re: reason to switch to 64 Bit Assembler
Post by: qWord on February 11, 2013, 12:53:56 PM
Quote from: habran on February 11, 2013, 11:35:59 AM
I have tested a speed of 64 bit and result is 1:4 against C
I can't confirm that: my own quick test shows that there is nearly no difference between your .for-loop and xmemcpy.
Function:   xmemcpy    xmemcpy2   xmemcpy_Q  xmemcpy_Q2 memcpy     @ForLoop
--- buffer size = 13 ---
align +0    29         17         4          2          8          33
align +1    29         17         5          3          10         32
align +2    29         17         7          3          13         32
align +3    29         17         5          2          11         33
align +4    29         17         4          2          11         32
align +5    29         17         5          3          11         32
align +6    29         17         4          3          11         32
align +7    29         17         5          2          11         32
align +8    29         17         4          2          11         32
align +9    29         17         5          3          11         32
align +10   29         18         4          3          11         32
align +11   29         17         5          2          11         32
align +12   29         17         4          2          11         32
align +13   29         17         5          3          11         32
align +14   29         17         4          4          11         32
align +15   29         17         5          3          11         32
--- buffer size = 33 ---
align +0    93         46         10         10         10         77
align +1    73         46         10         7          10         76
align +2    77         46         10         7          10         76
align +3    92         46         10         7          10         77
align +4    73         46         10         7          10         76
align +5    82         46         10         7          10         76
align +6    73         46         10         7          10         77
align +7    91         47         10         7          10         76
align +8    73         47         11         7          13         76
align +9    73         47         10         7          10         91
align +10   73         46         10         7          10         76
align +11   73         47         10         7          11         77
align +12   76         47         10         7          10         76
align +13   86         47         10         7          10         76
align +14   74         47         10         7          10         76
align +15   84         47         18         8          10         76
--- buffer size = 59 ---
align +0    124        97         21         14         17         134
align +1    152        98         22         15         17         135
align +2    129        98         23         15         17         134
align +3    129        102        22         15         17         135
align +4    129        98         22         14         17         137
align +5    129        98         21         16         18         139
align +6    128        98         21         16         17         133
align +7    129        98         21         15         17         135
align +8    129        98         21         14         17         134
align +9    128        98         21         14         17         135
align +10   128        98         21         15         17         135
align +11   129        98         20         15         17         134
align +12   129        98         21         14         17         134
align +13   133        98         21         15         17         135
align +14   135        99         21         15         16         135
align +15   127        98         21         15         17         134
--- buffer size = 590 ---
align +0    920        908        150        123        65         1041
align +1    915        886        149        124        62         1040
align +2    922        906        149        124        82         1048
align +3    925        887        150        124        64         1037
align +4    920        891        150        127        63         1103
align +5    918        892        149        124        64         1042
align +6    974        897        157        128        82         1087
align +7    938        888        149        124        64         1032
align +8    921        889        151        123        65         1070
align +9    941        887        154        124        85         1051
align +10   937        888        150        124        63         1056
align +11   953        887        150        123        64         1013
align +12   920        897        151        124        63         1039
align +13   925        900        150        123        63         1017
align +14   938        892        151        124        63         1090
align +15   927        889        156        125        64         1053

  ---   Functions ----
  xmemcpy    : habran , PellesC
  xmemcpy2   : habran , VC 2012
  xmemcpy_Q  : qWord  , PellesC
  xmemcpy_Q2 : qWord  , VC 2012
  memcpy     : MSVCRT
  @ForLoop   : habran

only alignment of Src varies, Dest is allocated by HeapAlloc()

Press any key to continue ...
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 03:17:47 PM
qWord,
I have used counter_begin and counter_end as MACROS like this

local buff[256]:BYTE

    counter_begin 1,1
    invoke xmemcpy,ADDR buff,CTEXT("habran is very smart cooker"), 27
    counter_end

and I've got above mentioned results
do you want to say that I lied :icon_eek:

however, I don't believe in your testing because, looking in a C source everyone can see that there is much more
job for processor and also accessing memory in C than ASM

are you sure that your testing is correct
if so I will go back to C64  :bgrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 03:25:23 PM
JJ2007,
QuoteWhat do you mean with that?

207 reminded me of 2007 and it is funny because C knows that you don't like it :biggrin:

BTW 2007 reminded me of two James Bonds or double agent 007
what actually you are doing in Italy? 8)
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 11, 2013, 04:05:21 PM
 :biggrin:

http://csdb.dk/forums/?roomid=11 (http://csdb.dk/forums/?roomid=11)
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 04:30:28 PM
thanks dedndave :P :
Bye everyone  :biggrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 11, 2013, 06:25:49 PM
Frank,

Quote from: frktons on February 11, 2013, 11:05:20 AM
I think a new test can only confirm that 64 bit mov operations are
faster than 32 bit ones. If anyone has a new processor, say habran, and
the skill to use AVX code, he could do it.
Not that difficult if he really likes to do the test, I can post the 64 bit MASM
code that I used 2 years ago. No AVX because neither Alex's, nor my PC are
AVX able.

I can do that next weekend; please post your code.

Gunther
Title: Re: reason to switch to 64 Bit Assembler
Post by: frktons on February 11, 2013, 08:17:09 PM
Quote from: Gunther on February 11, 2013, 06:25:49 PM
Frank,

I can do that next weekend; please post your code.

Gunther

Here you are. The code tests only  REP STOSQ vs MOVNTDQ.
You can add the tests for MOVAPS, MOVDQA, etc... if you like.

Frank
Title: Re: reason to switch to 64 Bit Assembler
Post by: qWord on February 11, 2013, 10:02:14 PM
Quote from: habran on February 11, 2013, 03:17:47 PMdo you want to say that I lied :icon_eek:
yes, the purpose of my post was to defame you  :dazzled:

Quote from: habran on February 11, 2013, 03:17:47 PM
however, I don't believe in your testing because, looking in a C source everyone can see that there is much more
job for processor and also accessing memory in C than ASM
good point  :t

BTW, this is what PellesC creates from your C code:
sub_140001000   proc near
                mov     rax, rcx
                mov     rcx, rax
                cmp     rcx, rdx
                jz      short locret_140001026
                test    r8d, r8d
                jz      short locret_140001026

loc_140001010:
                mov     r9b, [rdx]
                mov     [rcx], r9b
                sub     r8d, 1
                jz      short locret_140001026
                add     rcx, 1
                add     rdx, 1
                jmp     short loc_140001010

locret_140001026:                       
                retn
sub_140001000   endp
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 11, 2013, 11:21:44 PM
qWord,
Quote
BTW, this is what PellesC creates from your C code:
Holly Cow!!! :exclaim: :icon_exclaim: :icon_eek:
are you pulling my leg :shock: actually, are you pulling both my legs!???
If that is true why can I not build 64 bit JWASM with it?
give me a proper explanation or I am gone to that C64 forum

Quoteyes, the purpose of my post was to defame you   :dazzled:
I was not aware that I am famous, am I really  :greenclp:
if I am really a celebrity, maybe I need a body guard, someone like Frank I meant Farmer not frktons  :biggrin:( Kevin Michael Costner) or Arnold Alois Schwarzenegger 8)(Terminator)
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 11, 2013, 11:54:49 PM
how about Bullseye from DareDevil

(http://www.wrak.pl/inne/daredevil_bullseye.jpg)

funniest bad guy ever   :lol:
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 12:07:11 AM
dedndave, you are a genius  :t :eusa_clap:
what are you doing in this forum!!!???
you could strike it rich somewhere else :bgrin:
You have DEFAMED me
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 12:13:28 AM
qWord PellesC produced almost perfect code which should look like this:

; Byte-copy loop with a single conditional branch per iteration.
; Copies bytes from [rdx] to [rcx]; only the low 32 bits of r8 (r8d) are used
; as the byte count. Returns the original rcx in rax.
sub_140001000   proc near
                ; return value: the starting value of rcx
                mov     rax, rcx
                ; nothing to do when both pointers are equal
                cmp     rcx, rdx
                jz      short locret_140001026
                ; nothing to do when the count is zero
                test    r8d, r8d
                jz      short locret_140001026
loc_140001010:
                ; move one byte through r9b
                mov     r9b, [rdx]
                mov     [rcx], r9b
                add     rcx, 1
                add     rdx, 1
                ; sub sets ZF, so jnz loops until the count reaches zero
                sub     r8d, 1
                jnz      short loc_140001010
locret_140001026:                       
                retn
sub_140001000   endp
Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 12, 2013, 03:00:26 AM
Frank,

Quote from: frktons on February 11, 2013, 08:17:09 PM
Here you are. The code tests only  REP STOSQ vs MOVNTDQ.
You can add the tests for MOVAPS, MOVDQA, etc... if you like.

Frank

I'll first study your code and see what's to do. Thank you for uploading the source.  :t

Gunther
Title: Re: reason to switch to 64 Bit Assembler
Post by: Magnum on February 12, 2013, 03:17:10 AM
I think a better word might  be "celebrated".

Main Entry:    
celebrated  [sel-uh-brey-tid] Show IPA
Part of Speech:    adjective
Definition:    distinguished, famous
Synonyms:    acclaimed, big*, eminent, famed, glorious, great, high-powered, illustrious, immortal, important, large, laureate, lionized, notable, number one, numero uno, outstanding, popular, preeminent, prominent, renowned, revered, storied, up there, w. k., well-known

de·fame  audio  (d-fm) KEY

TRANSITIVE VERB:
de·famed, de·fam·ing, de·fames

    To damage the reputation, character, or good name of by slander or libel. See Synonyms at malign.
    Archaic To disgrace.
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 06:11:18 AM
thank you Magnum, :t
now I know who I am:
distinguished, high-powered, immortal, numero uno, macho-man 8)

I also want to say(no joking this time):
This forum has gathered the most prominent assembler programmers, and if we decide HERE that:
we should not hold with our teeth
something that is already obsolete
but embrace 64 bit
other assembler programmers
will have this to swallow
and our example follow


Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 12, 2013, 06:14:11 AM
i could write some 64-bit code, but i'd have to get you guys to test it for me   :(
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 06:32:00 AM
dedndave, I promise you I will be proud to do that for you :t
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 06:37:57 AM
qWord this is what I get when compile your function in C with MSVC205:

xmemcpy:
0000000140063090  mov         qword ptr [rsp+18h],r8
0000000140063095  mov         qword ptr [rsp+10h],rdx
000000014006309A  mov         qword ptr [rsp+8],rcx
000000014006309F  sub         rsp,38h
00000001400630A3  mov         rax,qword ptr [cb]
00000001400630A8  shr         rax,3
00000001400630AC  mov         dword ptr [cnt1],eax
00000001400630B0  mov         rax,qword ptr [cb]
00000001400630B5  and         rax,7
00000001400630B9  mov         dword ptr [cnt2],eax
00000001400630BD  mov         rax,qword ptr [dest]
00000001400630C2  mov         qword ptr [p1],rax
00000001400630C7  mov         rax,qword ptr [src]
00000001400630CC  mov         qword ptr [p2],rax
00000001400630D1  jmp         xmemcpy+5Fh (1400630EFh)
00000001400630D3  mov         rax,qword ptr [p1]
00000001400630D8  add         rax,8
00000001400630DC  mov         qword ptr [p1],rax
00000001400630E1  mov         rax,qword ptr [p2]
00000001400630E6  add         rax,8
00000001400630EA  mov         qword ptr [p2],rax
00000001400630EF  mov         eax,dword ptr [cnt1]
00000001400630F3  mov         ecx,dword ptr [cnt1]
00000001400630F7  sub         ecx,1
00000001400630FA  mov         dword ptr [cnt1],ecx
00000001400630FE  test        eax,eax
0000000140063100  je          xmemcpy+84h (140063114h)
0000000140063102  mov         rax,qword ptr [p1]
0000000140063107  mov         rcx,qword ptr [p2]
000000014006310C  mov         rcx,qword ptr [rcx]
000000014006310F  mov         qword ptr [rax],rcx
0000000140063112  jmp         xmemcpy+43h (1400630D3h)
0000000140063114  mov         rax,qword ptr [p1]
0000000140063119  mov         qword ptr [p3],rax
000000014006311E  mov         rax,qword ptr [p2]
0000000140063123  mov         qword ptr [rsp],rax
0000000140063127  xor         eax,eax
0000000140063129  cmp         eax,1
000000014006312C  je          xmemcpy+0DBh (14006316Bh)
000000014006312E  mov         eax,dword ptr [cnt2]
0000000140063132  and         eax,4
0000000140063135  test        eax,eax
0000000140063137  je          xmemcpy+0DBh (14006316Bh)
0000000140063139  mov         rax,qword ptr [p3]
000000014006313E  mov         rcx,qword ptr [rsp]
0000000140063142  mov         ecx,dword ptr [rcx]
0000000140063144  mov         dword ptr [rax],ecx
0000000140063146  mov         rax,qword ptr [p3]
000000014006314B  add         rax,4
000000014006314F  mov         qword ptr [p3],rax
0000000140063154  mov         rax,qword ptr [rsp]
0000000140063158  add         rax,4
000000014006315C  mov         qword ptr [rsp],rax
0000000140063160  mov         eax,dword ptr [cnt2]
0000000140063164  sub         eax,4
0000000140063167  mov         dword ptr [cnt2],eax
000000014006316B  jmp         xmemcpy+0F7h (140063187h)
000000014006316D  mov         rax,qword ptr [p3]
0000000140063172  add         rax,1
0000000140063176  mov         qword ptr [p3],rax
000000014006317B  mov         rax,qword ptr [rsp]
000000014006317F  add         rax,1
0000000140063183  mov         qword ptr [rsp],rax
0000000140063187  mov         eax,dword ptr [cnt2]
000000014006318B  mov         ecx,dword ptr [cnt2]
000000014006318F  sub         ecx,1
0000000140063192  mov         dword ptr [cnt2],ecx
0000000140063196  test        eax,eax
0000000140063198  je          xmemcpy+11Ah (1400631AAh)
000000014006319A  mov         rax,qword ptr [p3]
000000014006319F  mov         rcx,qword ptr [rsp]
00000001400631A3  movzx       ecx,byte ptr [rcx]
00000001400631A6  mov         byte ptr [rax],cl
00000001400631A8  jmp         xmemcpy+0DDh (14006316Dh)
00000001400631AA  mov         rax,qword ptr [dest]
00000001400631AF  add         rsp,38h
00000001400631B3  ret

I can not believe that it takes only 2 ticks
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 12, 2013, 06:43:59 AM
i think you must be measuring that wrong
i think there is something wrong with the 2 cycle measurement   :P
maybe the timer code isn't doing what you think it is or something


i have a friend, not too far away...

he has a win 7-64 ultimate new-fangled machine at home, now
he uses it mostly for running his business
http://www.mesabattingcages.com/ (http://www.mesabattingcages.com/)

he will let me test whatever i like, but i would hate to mess up his machine
or even be near it if it messes up   :P
Title: Re: reason to switch to 64 Bit Assembler
Post by: qWord on February 12, 2013, 07:17:35 AM
Quote from: habran on February 12, 2013, 06:37:57 AMI can not believe that it takes only 2 ticks
you are obviously not able to configure your compiler! Also, looking in the code of my testbench (and yes ... you can't compile it because there are some dependencies I've not include) you will see that I've used a high loop count, which blends out memory access.
; MSVC 2010
; Disassembly of the optimized compiler output for the C copy routine.
; As used by the code below:
;   rcx  = destination pointer (also the return value)
;   rdx  = source pointer
;   r8d  = byte count (only the low 32 bits are looked at)
; Returns rax = original destination pointer.
sub_140008A60   proc near

                ; r10d = count / 8  -> number of qword iterations
                ; r8d  = count & 7  -> bytes left over after the qword loop
                ; r9   = running destination pointer (rcx stays intact for the return)
                mov     r10d, r8d
                and     r8d, 7
                mov     r9, rcx
                shr     r10d, 3
                test    r10d, r10d
                jz      short loc_140008A94
                ; filler bytes in front of the loop head (presumably alignment padding)
                db      66h, 66h, 66h, 66h
                nop     word ptr [rax+rax+00000000h]

; qword loop: copy 8 bytes per iteration; the flags from "dec r10d" survive the
; following mov and drive the jnz
loc_140008A80:
                mov     rax, [rdx]
                add     r9, 8
                add     rdx, 8
                dec     r10d
                mov     [r9-8], rax
                jnz     short loc_140008A80

; remainder: if bit 2 of the leftover count is set, copy one dword
loc_140008A94:
                test    r8b, 4
                jz      short loc_140008AAC
                mov     eax, [rdx]
                add     r9, 4
                add     rdx, 4
                mov     [r9-4], eax
                ; 0FFFFFFFCh = -4: four bytes of the remainder are done
                add     r8d, 0FFFFFFFCh

; remainder: copy whatever is still left one byte at a time
loc_140008AAC:
                test    r8d, r8d
                jz      short loc_140008AD1
                ; rdx becomes (src - dest), so [rdx+r9] addresses the source while
                ; only r9 has to be incremented inside the loop
                sub     rdx, r9
                ; filler bytes in front of the loop head (presumably alignment padding)
                db      66h, 66h, 66h, 66h
                nop     dword ptr [rax+rax+00000000h]

loc_140008AC0:
                movzx   eax, byte ptr [rdx+r9]
                inc     r9
                dec     r8d
                mov     [r9-1], al
                jnz     short loc_140008AC0

; return the original destination pointer
loc_140008AD1:
                mov     rax, rcx
                retn
sub_140008A60   endp


in the attachment a testbench with loop count = 1

BTW: if you are not interested in a serious discussion, you may simply say that instead of this bullsh**  parody.
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 12, 2013, 10:27:26 AM
yes qWord, now you are talking... :t
that looks more realistic than before and doesn't contradict what I said before
my xmemcpy is OK for transferring one or two lines of characters, but for greater data transfer your function is absolute
I always admired your laser sharp mind and programmers skills :eusa_clap:   
Title: Re: reason to switch to 64 Bit Assembler
Post by: frktons on February 12, 2013, 11:11:03 AM
Quote from: qWord on February 12, 2013, 07:17:35 AM
Quote from: habran on February 12, 2013, 06:37:57 AMI can not believe that it takes only 2 ticks
you are obviously not able to configure your compiler! Also, looking in the code of my testbench (and yes ... you can't compile it because there are some dependencies I've not include) you will see that I've used a high loop count, which blends out memory access.
; MSVC 2010
sub_140008A60   proc near

                mov     r10d, r8d
                and     r8d, 7
                mov     r9, rcx
                shr     r10d, 3
                test    r10d, r10d
                jz      short loc_140008A94
                db      66h, 66h, 66h, 66h
                nop     word ptr [rax+rax+00000000h]

loc_140008A80:
                mov     rax, [rdx]
                add     r9, 8
                add     rdx, 8
                dec     r10d
                mov     [r9-8], rax
                jnz     short loc_140008A80

loc_140008A94:
                test    r8b, 4
                jz      short loc_140008AAC
                mov     eax, [rdx]
                add     r9, 4
                add     rdx, 4
                mov     [r9-4], eax
                add     r8d, 0FFFFFFFCh

loc_140008AAC:
                test    r8d, r8d
                jz      short loc_140008AD1
                sub     rdx, r9
                db      66h, 66h, 66h, 66h
                nop     dword ptr [rax+rax+00000000h]

loc_140008AC0:
                movzx   eax, byte ptr [rdx+r9]
                inc     r9
                dec     r8d
                mov     [r9-1], al
                jnz     short loc_140008AC0

loc_140008AD1:
                mov     rax, rcx
                retn
sub_140008A60   endp


in the attachment a testbench with loop count = 1

BTW: if you are not interested in a serious discussion, you may simply say that instead of this bullsh**  parody.

The executable is quite big after unzipping = 190K. What's inside?
I can't believe a simple test on memory copy takes all that code.
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 19, 2013, 03:54:12 PM
here is the version what I was talking about  IMO fastest ever  :t
please prove me wrong  :biggrin:
I use here xmm4 and ymm4 because first 4 registers are used in float calculation and this one is volatile as well
so we don't have to preserve it

; xmemcpy, AVX variant: copy `count` bytes from src to dest and return dest in rax.
; The body reads its arguments straight from rcx (dest), rdx (src) and r8 (count);
; the automatic prologue/epilogue is switched off around the procedure and
; restored after it.  The copy only runs forward and makes no special provision
; for overlapping buffers.
; NOTE(review): the 32-byte loop is VEX-encoded (vmovdqu ymm4) while the final
; 16-byte step uses the legacy SSE encoding (movdqu xmm4), and there is no
; vzeroupper - this may cost an AVX/SSE transition penalty on some CPUs; confirm
; by measurement before changing anything.
option win64:0
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
   ; rax = original dest (return value); rcx is advanced below
   mov rax,rcx
   ; nothing is copied when src and dest are the same address
   .if (rcx!=rdx)
         ; bulk part: r10 = count >> 5 iterations, 32 bytes per iteration
         .for (r10=r8,r10>>=5¦r10¦rcx+=32,rdx+=32,r10--)   
            vmovdqu ymm4,[rdx]
            vmovdqu [rcx],ymm4
        .endfor
; tail: each "shr r8,1" moves the next low bit of count into the carry flag;
; bits 0,1,2,3,4 select a 1-, 2-, 4-, 8- and 16-byte copy respectively
; bit 0 -> one byte
shr r8,1
         .if (CARRY?)
mov r9b,[rdx]
mov [rcx],r9b
inc rcx
inc rdx
.endif
         ; bit 1 -> one word
         shr r8,1
.if (CARRY?)
mov r9w,[rdx]
mov [rcx],r9w
add rcx,2
add rdx,2
.endif
         ; bit 2 -> one dword
         shr r8,1
         .if (CARRY?)
mov r9d,[rdx]
mov [rcx],r9d
add rcx,4
add rdx,4
.endif
         ; bit 3 -> one qword
         shr r8,1
         .if (CARRY?)
            mov r9,[rdx]
            mov [rcx],r9
    add rcx,8
    add rdx,8
.endif
         ; bit 4 -> one 16-byte block; last piece, so the pointers are not advanced
         shr r8,1
         .if (CARRY?)
           movdqu xmm4,[rdx]
           movdqu [rcx],xmm4
.endif
   .endif   
; the label aexit is not referenced anywhere inside this procedure
aexit: ret             
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef


it creates this code:


xmemcpy:
00000001`40034220 488bc1          mov     rax,rcx
00000001`40034223 483bca          cmp     rcx,rdx
00000001`40034226 747a            je      xmemcpy+0x82 (00000001`400342a2)
00000001`40034228 4d8bd0          mov     r10,r8
00000001`4003422b 49c1ea05        shr     r10,5
00000001`4003422f 4d23d2          and     r10,r10
00000001`40034232 7415            je      xmemcpy+0x29 (00000001`40034249)
00000001`40034234 c5fe6f22        vmovdqu ymm4,ymmword ptr [rdx]
00000001`40034238 c5fe7f21        vmovdqu ymmword ptr [rcx],ymm4
00000001`4003423c 4883c120        add     rcx,20h
00000001`40034240 4883c220        add     rdx,20h
00000001`40034244 49ffca          dec     r10
00000001`40034247 75eb            jne     xmemcpy+0x14 (00000001`40034234)
00000001`40034249 49d1e8          shr     r8,1
00000001`4003424c 730c            jae     xmemcpy+0x3a (00000001`4003425a)
00000001`4003424e 448a0a          mov     r9b,byte ptr [rdx]
00000001`40034251 448809          mov     byte ptr [rcx],r9b
00000001`40034254 48ffc1          inc     rcx
00000001`40034257 48ffc2          inc     rdx
00000001`4003425a 49d1e8          shr     r8,1
00000001`4003425d 7310            jae     xmemcpy+0x4f (00000001`4003426f)
00000001`4003425f 66448b0a        mov     r9w,word ptr [rdx]
00000001`40034263 66448909        mov     word ptr [rcx],r9w
00000001`40034267 4883c102        add     rcx,2
00000001`4003426b 4883c202        add     rdx,2
00000001`4003426f 49d1e8          shr     r8,1
00000001`40034272 730e            jae     xmemcpy+0x62 (00000001`40034282)
00000001`40034274 448b0a          mov     r9d,dword ptr [rdx]
00000001`40034277 448909          mov     dword ptr [rcx],r9d
00000001`4003427a 4883c104        add     rcx,4
00000001`4003427e 4883c204        add     rdx,4
00000001`40034282 49d1e8          shr     r8,1
00000001`40034285 730e            jae     xmemcpy+0x75 (00000001`40034295)
00000001`40034287 4c8b0a          mov     r9,qword ptr [rdx]
00000001`4003428a 4c8909          mov     qword ptr [rcx],r9
00000001`4003428d 4883c108        add     rcx,8
00000001`40034291 4883c208        add     rdx,8
00000001`40034295 49d1e8          shr     r8,1
00000001`40034298 7308            jae     xmemcpy+0x82 (00000001`400342a2)
00000001`4003429a f30f6f22        movdqu  xmm4,xmmword ptr [rdx]
00000001`4003429e f30f7f21        movdqu  xmmword ptr [rcx],xmm4
00000001`400342a2 c3              ret


and here is version for people without  AVX

; xmemcpy, SSE2 variant for CPUs without AVX: copy `count` bytes from src to
; dest and return dest in rax.  The body uses rcx (dest), rdx (src) and
; r8 (count) directly.
; NOTE(review): unlike the AVX listing, no OPTION PROLOGUE/EPILOGUE lines are
; shown around this procedure - presumably it is meant to be assembled under the
; same options; confirm.
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
   ; rax = original dest (return value); rcx is advanced below
   mov rax,rcx
   ; nothing is copied when src and dest are the same address
   .if (rcx!=rdx)
     ; bulk part: r10 = count >> 4 iterations, 16 bytes per iteration
     .for (r10=r8,r10>>=4¦r10¦rcx+=16,rdx+=16,r10--)   
        movdqu xmm4,[rdx]
        movdqu [rcx],xmm4
     .endfor
     ; tail: each "shr r8,1" moves the next low bit of count into the carry flag
     ; bit 0 -> one byte
     shr r8,1
     .if (CARRY?)
  mov r9b,[rdx]
  mov [rcx],r9b
  inc rcx
  inc rdx
     .endif
     ; bit 1 -> one word
     shr r8,1
     .if (CARRY?)
  mov r9w,[rdx]
  mov [rcx],r9w
  add rcx,2
  add rdx,2
     .endif
     ; bit 2 -> one dword
     shr r8,1
     .if (CARRY?)
  mov r9d,[rdx]
  mov [rcx],r9d
  add rcx,4
  add rdx,4
     .endif
     ; bit 3 -> one qword; last piece, so the pointers are not advanced
     shr r8,1
     .if (CARRY?)
       mov r9,[rdx]
       mov [rcx],r9
   .endif
  .endif   
  ret             
xmemcpy ENDP
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 20, 2013, 12:08:41 AM
I commented this for the visitors only, not for the members of this forum  :bgrin:

option win64:0                    ;author's note: no win64 helper option is needed here
OPTION PROLOGUE:NONE              ;no generated prologue: only the instructions below
OPTION EPILOGUE:NONE              ;no generated epilogue either
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
   mov rax,rcx                    ;keep dest for the return value before rcx is advanced
   .if (rcx!=rdx)                 ;skip the copy when src and dest are the same address
         ;bulk transfer: count >> 5 iterations of 32 bytes each
         .for (r10=r8,r10>>=5¦r10¦rcx+=32,rdx+=32,r10--)   
            vmovdqu ymm4,[rdx]    ;load 32 bytes at once
            vmovdqu [rcx],ymm4    ;store them through the ymm4 AVX register
        .endfor                   ;end of the 32-byte loop
  ;count is usually not a multiple of 32, so up to 31 bytes can remain,
  ;e.g. 31 = 01Fh = 0001 1111b in the low five bits of r8 (count)
   shr r8,1                       ;shift bit 0 of count into the carry flag
   .if (CARRY?)                   ;bit 0 set: there is one odd byte
    mov r9b,[rdx]                 ;copy that single byte
    mov [rcx],r9b                 ;(larger pieces are handled
    inc rcx                       ;by the following shifts)
    inc rdx                       ;advance both pointers by 1
  .endif
  shr r8,1                        ;bit 1 of count: is there a word?
  .if (CARRY?)                    ;yes
    mov r9w,[rdx]                 ;copy that word
    mov [rcx],r9w
    add rcx,2                     ;advance dest by 2
    add rdx,2                     ;and src
   .endif
    shr r8,1                      ;bit 2 of count: is there a dword?
   .if (CARRY?)                   ;yes
    mov r9d,[rdx]                 ;copy that dword
    mov [rcx],r9d
    add rcx,4                     ;advance both pointers by 4
    add rdx,4
  .endif
  shr r8,1                       ;bit 3 of count: is there a qword?
  .if (CARRY?)                   ;yes
     mov r9,[rdx]                ;copy that qword
     mov [rcx],r9
     add rcx,8
     add rdx,8                   ;advance both pointers by 8
   .endif
   shr r8,1                      ;bit 4 of count: is there a 16-byte block?
   .if (CARRY?)                   
      movdqu xmm4,[rdx]          ;copy the last 16 bytes
      movdqu [rcx],xmm4          ;through xmm4
   .endif                        ;last piece: no need to advance the pointers
   .endif                        ;end of the src != dest branch
aexit: ret                       ;rax still holds the original dest
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef


Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 20, 2013, 09:37:06 AM
I thought maybe those who still have 32 bit old-timers machines can feel the blow of the lightning speed in they hair :biggrin:
but be careful, it can blow away that little bit of hair left on your head ;)
so I wrote a 32 bit for them 8)

; xmemcpy, 32-bit SSE2 variant: copy `count` bytes from src to dest and return
; dest in eax.  ebx is preserved through the USES clause.  The copy only runs
; forward and makes no special provision for overlapping buffers.
; Fix: the procedure header was missing the PROC keyword, so the closing
; "xmemcpy ENDP" had no matching PROC.
xmemcpy PROC uses ebx dest:DWORD,src:DWORD,count:DWORD
   mov ecx,dest
   mov edx,src
   mov ebx,count
   ; nothing is copied when src and dest are the same address
   .if (ecx!=edx)
     ; bulk part: eax = count >> 4 iterations, 16 bytes per iteration
     .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)
        movdqu xmm4,[edx]
        movdqu [ecx],xmm4
     .endfor
     ; tail: each "shr ebx,1" moves the next low bit of count into the carry flag
     ; bit 0 -> one byte
     shr ebx,1
     .if (CARRY?)
       mov al,[edx]
       mov [ecx],al
       inc ecx
       inc edx
     .endif
     ; bit 1 -> one word
     shr ebx,1
     .if (CARRY?)
       mov ax,[edx]
       mov [ecx],ax
       add ecx,2
       add edx,2
     .endif
     ; bit 2 -> one dword
     shr ebx,1
     .if (CARRY?)
       mov eax,[edx]
       mov [ecx],eax
       add ecx,4
       add edx,4
     .endif
     ; bit 3 -> eight bytes through xmm4; last piece, pointers are not advanced
     shr ebx,1
     .if (CARRY?)
       movq xmm4,[edx]
       movq [ecx],xmm4
     .endif
   .endif
   ; eax was used as scratch above, so reload the return value
   mov eax,dest
   ret
xmemcpy ENDP
Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 21, 2013, 03:34:58 AM
Doesn't assemble with my version of JWasm. Where is your latest build?
And what does .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--) mean? "Much bigger or equal"??
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 05:48:21 AM
Hello JJ2007,
my latest build is as usual in the topic ".FOR built in JWasm" http://masm32.com/board/index.php?topic=402.0 (http://masm32.com/board/index.php?topic=402.0) :shock:
QuoteAnd what does .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--) mean? "Much bigger or equal"??
and >>=4 means shift right 4 time    it produces shr eax,4  :biggrin:
it means "Much much much much less" :lol:
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 06:59:51 AM
Hey qWord,
Cat got your tongue?  :icon_eek:
(https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcT6k8_OVV3GpA_eIvy8fUnCQ-1nGLI_RS1DuNQdCt0G9AsuOCYQ)
you have the same "Qosmio laptop" as me
did you test the speed? :bgrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: frktons on February 21, 2013, 10:16:22 AM
Quote from: habran on February 20, 2013, 09:37:06 AM
I thought maybe those who still have 32 bit old-timers machines can feel the blow of the lightning speed in they hair :biggrin:
but be careful, it can blow away that little bit of hair left on your head ;)
so I wrote a 32 bit for them 8)

xmemcpy uses ebx dest:DWORD,src:DWORD,count:DWORD
   mov ecx,dest
   mov edx,src
   mov ebx,count
   .if (ecx!=edx)
     .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)   
        movdqu xmm4,[edx]
        movdqu [ecx],xmm4
     .endfor
     shr ebx,1
     .if (CARRY?)
       mov al,[edx]
       mov [ecx],al
       inc ecx
       inc edx
     .endif
     shr ebx,1
     .if (CARRY?)
       mov ax,[edx]
       mov [ecx],ax
       add ecx,2
       add edx,2
     .endif
     shr ebx,1
     .if (CARRY?)
       mov eax,[edx]
       mov [ecx],eax
       add ecx,4
       add edx,4
     .endif
     shr ebx,1
     .if (CARRY?)
       movq xmm4,[edx]
       movq [ecx],xmm4
   .endif
  .endif
  mov eax,dest   
  ret             
xmemcpy ENDP


Habran, why do you use MOVDQU and not align the memory
pointers to 16 bytes addresses? MOVAPS/MOVDQA are faster.
Unrolling the MOV can be another good option to test.
And if the area to copy is big, > 4 MB , MOVNTDQ is the best
option. Have a look at the old forum and search for CLEARBUFFER.

REP MOVSQ is probably faster than your non-AVX solution; give it
a shot on the 64-bit version.
A last thing. You should post the results of your tests, if you like
to get the attention of somebody on these routines.

Frank
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 11:05:56 AM
Hi Frank, :biggrin:
Quotewhy do you use MOVDQU and not align the memory
because this routine is created particularly for unaligned data like text or something
I totally agree with you that MOVDQA is much faster  than MOVDQU  :t
however, if data is aligned to 32 byte I wouldn't need that routine I would just write in my source:


    ;count can be sizeof(buffer); r8 becomes count >> 5, the number of 32-byte blocks
    ;vmovdqa is the aligned form of the move: both src and dest must be 32-byte aligned
    ;only whole 32-byte blocks are copied; a remainder (count mod 32) is not handled here
    .for (rcx=dest,rdx=src,r8=count,r8>>=5¦r8¦rcx+=32,rdx+=32,r8--)
             vmovdqa ymm4,[rdx]
             vmovdqa [rcx],ymm4
    .endfor

or for for 16 byte xmm:


    ;count can be sizeof(buffer); r8 becomes count >> 4, the number of 16-byte blocks
    ;movdqa is the aligned form of the move: both src and dest must be 16-byte aligned
    ;only whole 16-byte blocks are copied; a remainder (count mod 16) is not handled here
    .for (rcx=dest,rdx=src,r8=count,r8>>=4¦r8¦rcx+=16,rdx+=16,r8--)
             movdqa xmm4,[rdx]
             movdqa [rcx],xmm4
    .endfor


QuoteA last thing. You should post the results of your tests

I left it to qWord to do that for me because he likes testing and arguing :P
and I like and appreciate him :biggrin:

Quoteif you like to get the attention of somebody on these routines.

I don't give a damn about attention, take it or leave it 8)
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 11:16:11 AM
Frank,
IMO it is not always advisable to align data to 16 or 32 bytes :(
if you have STRUCT in 32 bit program you align it to 4
in 64 bit logically is to align it to 8
however, when you work with big data transfers then it is logical to align it as big as your machine can afford :biggrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 11:44:16 AM
this version is even more optimized than the former one and it has a more logical order;
it can also be faster for less data than 32 bytes:

; xmemcpy, AVX variant with the tail handled first: copy `count` bytes from src
; to dest and return dest in rax.  The body uses rcx (dest), rdx (src) and
; r8 (count) directly; the automatic prologue/epilogue is switched off around
; the procedure and restored after it.
; Order of work: the low five bits of count are consumed one by one (1, 2, 4, 8,
; 16 bytes), after which r8 holds count >> 5 and drives the 32-byte loop.
; NOTE(review): legacy-SSE movdqu and VEX-encoded vmovdqu are mixed without a
; vzeroupper - this may cost an AVX/SSE transition penalty on some CPUs; confirm
; by measurement.
option win64:0
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
    ; rax = original dest (return value); rcx is advanced below
    mov rax,rcx
    ; nothing is copied when src and dest are the same address
    .if (rcx!=rdx)
; bit 0 of count -> one byte
shr r8,1
       .if (CARRY?)
     mov r9b,[rdx]
         mov [rcx],r9b
     inc rcx
     inc rdx
       .endif
       ; bit 1 of count -> one word
       shr r8,1
       .if (CARRY?)
      mov r9w,[rdx]
      mov [rcx],r9w
      add rcx,2
      add rdx,2
       .endif
       ; bit 2 of count -> one dword
       shr r8,1
       .if (CARRY?)
     mov r9d,[rdx]
     mov [rcx],r9d
     add rcx,4
     add rdx,4
       .endif
       ; bit 3 of count -> one qword
       shr r8,1
       .if (CARRY?)
            mov r9,[rdx]
            mov [rcx],r9
            add rcx,8
            add rdx,8
       .endif
       ; bit 4 of count -> one 16-byte block
       shr r8,1
       .if (CARRY?)
            movdqu xmm4,[rdx]
            movdqu [rcx],xmm4
            add rcx,16
            add rdx,16
.endif
; after five shifts r8 = count >> 5: copy that many 32-byte blocks
.for (¦r8¦rcx+=32,rdx+=32,r8--)   
    vmovdqu ymm4,[rdx]
    vmovdqu [rcx],ymm4
.endfor
    .endif     
; the label aexit is not referenced anywhere inside this procedure
aexit: ret                     
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 21, 2013, 11:46:57 AM
Quote from: habran on February 21, 2013, 05:48:21 AM
Hello JJ2007,
my latest build is as usual in the topic ".FOR built in JWasm" http://masm32.com/board/index.php?topic=402.0 (http://masm32.com/board/index.php?topic=402.0)

Doesn't work on XP: "Not a valid Win32 app", access denied.
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 12:08:50 PM
sorry JJ2007, :(
it doesn't work on XP
Quotebinaries need at least Windows version 6
(Japheth)
however, there is a workaround for that
source code is in the folder and you can compile it yourself if that is not too much hassle
just replace these two files in JW209s folder
if you don't have M$VC you can compile it with PelesC
but I don't believe you have enough energy to go through all that trouble :dazzled:
prove me wrong, I dare you :P
Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 21, 2013, 12:27:12 PM
The standard JWasm works just fine on XP, I use it every day. And, no, I won't try to compile it myself. It is not a question of energy, though. I am too wise to invest my time in trying to compile a major C app :biggrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 01:14:23 PM
Wise man JJ2007 :biggrin:
believe it or not that excellent JWasm is written in C
and Japheth had to create binaries from it
how do you think he created it, by laying on it for four weeks or something ::)
NO!!! he compiled it!!!! and it looks that he did not die of it
C is not a plug it is a programming language for Christ sake
don't be a chicken, roll your sleeves and get dirty
No pain no gain!!! :bgrin: 
Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 21, 2013, 01:31:28 PM
Quote from: habran on February 21, 2013, 01:14:23 PM
C is not a plug it is a programming language for Christ sake

It's spelled "plague", Habran.
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 01:49:47 PM
thanks wise man JJ2007 :t
what kind of spelling checker is that when it did not worn me!!! ;)
I will reward you for that and only you can use it, you deserved it! ;)
here is for you changed source:

; xmemcpy, 32-bit SSE2 variant with the tail handled first: copy `count` bytes
; from src to dest and return dest in eax.  ebx is preserved through the USES
; clause.  The low four bits of count are consumed one by one (1, 2, 4, 8 bytes),
; after which ebx holds count >> 4 and drives the 16-byte loop.
; Fix: the procedure header was missing the PROC keyword, so the closing
; "xmemcpy ENDP" had no matching PROC.
xmemcpy PROC uses ebx dest:DWORD,src:DWORD,count:DWORD
    mov ecx,dest
    mov edx,src
    mov ebx,count
    ; nothing is copied when src and dest are the same address
    .if (ecx!=edx)
       ; bit 0 of count -> one byte
       shr ebx,1
       .if (CARRY?)
          mov al,[edx]
          mov [ecx],al
          inc ecx
          inc edx
       .endif
       ; bit 1 of count -> one word
       shr ebx,1
       .if (CARRY?)
          mov ax,[edx]
          mov [ecx],ax
          add ecx,2
          add edx,2
       .endif
       ; bit 2 of count -> one dword
       shr ebx,1
       .if (CARRY?)
          mov eax,[edx]
          mov [ecx],eax
          add ecx,4
          add edx,4
       .endif
       ; bit 3 of count -> eight bytes through xmm4
       shr ebx,1
       .if (CARRY?)
          movq xmm4,[edx]
          movq [ecx],xmm4
          add ecx,8
          add edx,8
       .endif
       ; ebx = count >> 4 now: copy that many 16-byte blocks
       .while (ebx)
          movdqu xmm4,[edx]
          movdqu [ecx],xmm4
          add ecx,16
          add edx,16
          dec ebx
       .endw
    .endif
    ; eax was used as scratch above, so reload the return value
    mov eax,dest
    ret
xmemcpy ENDP

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 04:05:16 PM
hey 2007,
are you going to abandon me because of a little spelling mistake :icon_eek:
plug, plague, plug in,plug out, ear plug, plagiarism... who cares  :dazzled:
you are just trying to mask the main issue: compiling JWasm :bgrin:
those ENGLEZE have made mess with unnecessary complex spelling just to tease pure strangers :exclaim:
they messed it up so much that even they can not write "for sale" but use "4 sale" :icon_confused:
Title: Re: reason to switch to 64 Bit Assembler
Post by: japheth on February 21, 2013, 06:43:56 PM
Hello,

Quote from: habran on February 21, 2013, 12:08:50 PM
if you don't have M$VC you can compile it with PelesC

it's mentioned in jwasm's readme, but since nobody reads readmes, I'll repeat it here: better do NOT use PellesC to compile JWasm - the jwasm binary created by PellesC is unable to pass the regression tests supplied with the assembler. I haven't analyzed the problem too deeply, but judging from the part that fails I assume that floating-point constants don't have the values as they should.

Good compilers are: Open Watcom, MSVC, GCC (MinGW)
Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 21, 2013, 09:12:31 PM
Quote from: habran on February 21, 2013, 01:49:47 PM
thanks wise man JJ2007 :t
...
here is for you changed source:
xmemcpy uses ebx dest:DWORD,src:DWORD,count:DWORD
    mov ecx,dest
    mov edx,src
...
   ret                     
xmemcpy ENDP

Thanks, it looks competitive :t

AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
loop overhead is approx. 238/100 cycles

9458    cycles for 100 * xmemcpy
8056    cycles for 100 * MbCopy

9292    cycles for 100 * xmemcpy
7893    cycles for 100 * MbCopy

9289    cycles for 100 * xmemcpy
8072    cycles for 100 * MbCopy
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 10:43:26 PM
Hi Japheth,
Quotedo NOT use PellesC to compile JWasm
sorry for misunderstanding :bgrin:
I've read it but I thought that it applies only to 64 bit

jj2007,
thanks for testing it
this version is created for unaligned data as I mentioned before
can you please try to compare when not aligned at all? :biggrin:

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 21, 2013, 10:46:27 PM
jj2007,
this is what my machine produce from your test:

Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz (SSE4)
loop overhead is approx. 155/100 cycles

2242    cycles for 100 * xmemcpy
5356    cycles for 100 * MbCopy

2239    cycles for 100 * xmemcpy
5455    cycles for 100 * MbCopy

2243    cycles for 100 * xmemcpy
5166    cycles for 100 * MbCopy


--- ok ---


twice as fast as yours, wouldn't you say so :shock:
QuoteThanks, it looks competitive

I would say It looks downright stunning!!!! :t

Title: Re: reason to switch to 64 Bit Assembler
Post by: japheth on February 22, 2013, 12:14:38 AM
Quote from: habran on February 21, 2013, 10:46:27 PM
I would say It looks downright stunning!!!! :t

I fully agree! However - almost 100% faster than MB - which allegedly is already rocket-science? How is this possible? You must do something wrong...
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 12:15:53 AM
here is 64 bit without .for:

; xmemcpy, 64-bit AVX variant written without the .for extension: copy `count`
; bytes from src to dest and return dest in rax.  The body uses rcx (dest),
; rdx (src) and r8 (count) directly; the automatic prologue/epilogue is switched
; off around the procedure and restored after it.
; The low five bits of count are consumed one by one (1, 2, 4, 8, 16 bytes),
; after which r8 holds count >> 5 and drives the 32-byte loop.
; Fix: a stray "xmemcpy ENDP" in front of the PROC (paste residue with no
; matching PROC) was removed and the indentation was normalized; the
; instruction sequence itself is unchanged.
option win64:0
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
    ; rax = original dest (return value); rcx is advanced below
    mov rax,rcx
    ; nothing is copied when src and dest are the same address
    .if (rcx!=rdx)
        ; bit 0 of count -> one byte
        shr r8,1
        .if (CARRY?)
            mov r9b,[rdx]
            mov [rcx],r9b
            inc rcx
            inc rdx
        .endif
        ; bit 1 of count -> one word
        shr r8,1
        .if (CARRY?)
            mov r9w,[rdx]
            mov [rcx],r9w
            add rcx,2
            add rdx,2
        .endif
        ; bit 2 of count -> one dword
        shr r8,1
        .if (CARRY?)
            mov r9d,[rdx]
            mov [rcx],r9d
            add rcx,4
            add rdx,4
        .endif
        ; bit 3 of count -> one qword
        shr r8,1
        .if (CARRY?)
            mov r9,[rdx]
            mov [rcx],r9
            add rcx,8
            add rdx,8
        .endif
        ; bit 4 of count -> one 16-byte block
        shr r8,1
        .if (CARRY?)
            movdqu xmm4,[rdx]
            movdqu [rcx],xmm4
            add rcx,16
            add rdx,16
        .endif
        ; r8 = count >> 5 now: copy that many 32-byte blocks
        .while (r8)
            vmovdqu ymm4,[rdx]
            vmovdqu [rcx],ymm4
            add rcx,32
            add rdx,32
            dec r8
        .endw
    .endif
    ret
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 12:22:58 AM
Japheth,
Quote
I fully agree! However - almost 100% faster than MB - which allegedly is already rocket-science? How is this possible? You must do something wrong...
I did not touch code I just executed JJ's exe on my machine
and I can do it again now, let see: 

Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz (SSE4)
loop overhead is approx. 154/100 cycles

2237    cycles for 100 * xmemcpy
5243    cycles for 100 * MbCopy

2240    cycles for 100 * xmemcpy
5176    cycles for 100 * MbCopy

2233    cycles for 100 * xmemcpy
5163    cycles for 100 * MbCopy


--- ok ---


Japheth, why don't you try it in your machine?

Title: Re: reason to switch to 64 Bit Assembler
Post by: japheth on February 22, 2013, 12:35:02 AM
Quote from: habran on February 22, 2013, 12:22:58 AM
Japheth, why don't you try it in your machine?

The fastest machine that I have available is an 5 year old AMD 64 X2 5000+.
Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 22, 2013, 01:00:54 AM
Cool down. If it's faster than the MasmBasic algo, it just means it is faster on your CPU. Well optimised for your CPU.

In case you like it less superficially (d7=destination is align 16+7, s3=src is 16+3 etc):

AMD Athlon(tm) Dual Core Processor 4450B (SSE3)

Algo           memcpy   MemCo1   MemCo2  MemCoC3  MemCoP4  MemCoC2   MemCoL  xmemcpy
Description       CRT rep movs   movdqa  lps+hps   movdqa   movdqa   Masm32 Habran's
                       dest-al    psllq CeleronM  dest-al   src-al  library  Ferrari
Code size           ?       70      291      222      200      269       33      104
------------------------------------------------------------------------------------
2048, d0s0-0      561      549      360      439      424      361      547      541
2048, d1s1-0      720      597      410      473      473      421     1061      798
2048, d7s7-0      721      598      412      474      474      412     1060      798
2048, d7s8-1      809      851     1016      578      566      582      802      558
2048, d7s9-2      809      853     1016      567      566      567     1058      798
2048, d8s7+1      810      851      868      563      564      565      819      607
2048, d8s8-0      738      587      404      465      480      416      547      541
2048, d8s9-1      801      848      994      563      564      567      804      606
2048, d9s7+2      824      864      862      565      564      579     1060      798
2048, d9s8+1      808      853      862      564      567      565      803      543
2048, d9s9-0      721      595      411      472      472      409     1061      798
2048, d15s15      722      591      425      480      486      422     1072      798


Your algo is pretty good, but for the (frequent) aligned case, there are four algos that perform better on my AMD.
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 22, 2013, 02:32:43 AM
prescott w/htt
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)

Algo           memcpy   MemCo1   MemCo2  MemCoC3  MemCoP4  MemCoC2   MemCoL  xmemcpy
Description       CRT rep movs   movdqa  lps+hps   movdqa   movdqa   Masm32 Habran's
                       dest-al    psllq CeleronM  dest-al   src-al  library  Ferrari
Code size           ?       70      291      222      200      269       33      104
------------------------------------------------------------------------------------
2048, d0s0-0      717      719      608      609      904      610      718     1590
2048, d1s1-0     1100      846      651      651      650      650     4435     3945
2048, d7s7-0     1003      849      656      657      656      655     4437     3952
2048, d7s8-1     1368     1445     1223      868      611      613     4303     3799
2048, d7s9-2     1367     1446     1224      867      611      611     4454     3929
2048, d8s7+1     1338     1446     1188     1342      611     1023     1343     1748
2048, d8s8-0      976      849      656      657      657      656      977     1588
2048, d8s9-1     1332     1470     1212      873      611      612     1333     1733
2048, d9s7+2     1663     1440     1179     1342      611     1023     4150     4085
2048, d9s8+1     1660     1439     1182     1343      610     1023     4026     4014
2048, d9s9-0     1098      850      664      667      664      664     4135     4127
2048, d15s15      770      853      664      665      662      664     4136     4108
Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 22, 2013, 04:17:44 AM
Here the test results:


Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz (SSE4)

Algo           memcpy   MemCo1   MemCo2  MemCoC3  MemCoP4  MemCoC2   MemCoL  xmemcpy
Description       CRT rep movs   movdqa  lps+hps   movdqa   movdqa   Masm32 Habran's
                       dest-al    psllq CeleronM  dest-al   src-al  library  Ferrari
Code size           ?       70      291      222      200      269       33      104
------------------------------------------------------------------------------------
2048, d0s0-0      427      223      251      248      247      250      224      292
2048, d1s1-0      275      251      275      270      277      273      274      303
2048, d7s7-0      275      253      282      273      278      276      274      303
2048, d7s8-1      279      271      617      453      247      269      273      303
2048, d7s9-2      279      272      617      450      254      269      274      303
2048, d8s7+1      275      270      621      483      256      272      274      304
2048, d8s8-0      275      255      295      284      288      291      274      303
2048, d8s9-1      275      271      610      452      254      269      273      294
2048, d9s7+2      283      272      611      486      262      276      276      309
2048, d9s8+1      287      277      612      486      261      276      274      309
2048, d9s9-0      280      260      287      280      281      285      280      309
2048, d15s15      280      260      287      281      282      286      280      309


Gunther
Title: Re: reason to switch to 64 Bit Assembler
Post by: jj2007 on February 22, 2013, 05:30:19 AM
One more - not by accident, #4 was named "CeleronM" ;-)

Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3)

Algo           memcpy   MemCo1   MemCo2  MemCoC3  MemCoP4  MemCoC2   MemCoL  xmemcpy
Description       CRT rep movs   movdqa  lps+hps   movdqa   movdqa   Masm32 Habran's
                       dest-al    psllq CeleronM  dest-al   src-al  library  Ferrari
Code size           ?       70      291      222      200      269       33      104
------------------------------------------------------------------------------------
2048, d0s0-0      556      566      363      363      373      363      563     1051
2048, d1s1-0     1047      619      421      423      444      423     1683     1782
2048, d7s7-0      567      619      418      420      446      420     1699     1782
2048, d7s8-1     1677     1714     1090      441     1118     1123     1302     1337
2048, d7s9-2     1677     1713     1090      441     1118     1123     1716     1782
2048, d8s7+1     1655     1502     1090      857      979      975     1647     1245
2048, d8s8-0      556      619      420      422      448      422      563     1051
2048, d8s9-1     1664     1714     1083      441     1118     1123     1661     1241
2048, d9s7+2     1668     1502     1081      857      979      975     1762     1495
2048, d9s8+1     1668     1502     1081      857      979      975     1283     1052
2048, d9s9-0     1047      619      420      422      448      422     1686     1497
2048, d15s15      567      619      422      424      446      424     1678     1497
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 08:56:19 AM
as I said before this routine is PARTICULARLY made for UNALIGNED data
that is why I use MOVDQU command
there is no reason to create a  sophisticated algorithm for aligned data
you can just use fastest command to do that depending on  the ability of your machine

    ; Variant 1 (64 bit, AVX): copy 32 bytes per iteration through ymm4.
    ; The .for header reads init ¦ condition ¦ step: r8 = count >> 5 is the
    ; number of 32-byte chunks and the loop runs while r8 is non-zero.
    ; count can be sizeof(buffer).
    ; NOTE(review): vmovdqa expects 32-byte aligned addresses, and any
    ; count mod 32 tail bytes are not copied by this loop.
    .for (rcx=dest,rdx=src,r8=count,r8>>=5¦r8¦rcx+=32,rdx+=32,r8--)
             vmovdqa ymm4,[rdx]
             vmovdqa [rcx],ymm4
    .endfor
    ; Variant 2 (64 bit, SSE2): the same loop with 16-byte xmm moves.

    ; r8 = count >> 4 is the number of 16-byte chunks; count can be sizeof(buffer).
    ; NOTE(review): movdqa expects 16-byte aligned addresses.
    .for (rcx=dest,rdx=src,r8=count,r8>>=4¦r8¦rcx+=16,rdx+=16,r8--)
             movdqa xmm4,[rdx]
             movdqa [rcx],xmm4
    .endfor

    ; Variant 3 (32 bit): same 16-byte loop using ecx/edx as pointers.
    ; eax = count >> 4 is the chunk counter; count can be sizeof(buffer).
    .for (ecx=dest,edx=src,eax=count,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)
             movdqa xmm4,[edx]
             movdqa [ecx],xmm4
    .endfor
; Variant 4 (32 bit, for JJ2007): the same loop written with plain
; instructions and .while instead of the .for extension.
     mov ecx,dest
     mov edx,src
     mov eax,sizeof(buffer)
     shr eax,4
     .while (eax)
             movdqa xmm4,[edx]
             movdqa [ecx],xmm4
             add edx,16
             add ecx,16
             dec eax
      .endw     





Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 09:17:31 AM
we can also use this:
     mov ecx,dest
     mov edx,src
     mov eax,sizeof(buffer)
     sub eax,16
     .while (SDWORD eax > 0)
             movdqa xmm4,[edx+eax]
             movdqa [ecx+eax],xmm4
              sub eax,16
      .endw     
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 09:35:21 AM
we can use macros rather then subs
like this:

; xmcopy16 dest, src, size  (32 bit)
; Copies `size` bytes from src to dest in 16-byte chunks, walking from the
; last chunk down to offset 0.
;   dest - operand moved into ecx (destination address)
;   src  - operand moved into edx (source address)
;   size - operand moved into eax (byte count)
; Clobbers eax, ecx, edx and xmm4.
; NOTE(review): movdqa expects 16-byte aligned addresses; size is expected
; to be a multiple of 16.
; The second parameter is named `src` so that it matches the name used in
; the body; with any other parameter name the second argument is ignored.
xmcopy16 MACRO dest,src,size
     mov ecx,dest
     mov edx,src
     mov eax,size
     sub eax,16                     ; offset of the last 16-byte chunk
     .while (SDWORD eax >= 0)       ; signed test: stop once the offset goes below 0
         movdqa xmm4,[edx+eax]
         movdqa [ecx+eax],xmm4
         sub eax,16
     .endw
ENDM
; xmcopy32 dest, src, size  (64 bit)
; Copies `size` bytes from src to dest in 32-byte chunks, walking from the
; last chunk down to offset 0. Each chunk is moved as two 16-byte halves
; through xmm4.
;   dest - operand moved into rcx (destination address)
;   src  - operand moved into rdx (source address)
;   size - operand moved into rax (byte count)
; Clobbers rax, rcx, rdx and xmm4.
; NOTE(review): movdqa expects 16-byte aligned addresses; size is expected
; to be a multiple of 32.
; rcx is loaded from dest (not size), the second parameter is named `src`
; to match the body, and both halves of every 32-byte chunk are copied.
xmcopy32 MACRO dest,src,size
     mov rcx,dest
     mov rdx,src
     mov rax,size
     sub rax,32                     ; offset of the last 32-byte chunk
     .while (SQWORD rax >= 0)       ; signed test: stop once the offset goes below 0
         movdqa xmm4,[rdx+rax]      ; low 16 bytes of the chunk
         movdqa [rcx+rax],xmm4
         movdqa xmm4,[rdx+rax+16]   ; high 16 bytes of the chunk
         movdqa [rcx+rax+16],xmm4
         sub rax,32
     .endw
ENDM
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 10:19:03 AM
here is test on my computer for JJ's exe

Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz (SSE4)

Algo           memcpy   MemCo1   MemCo2  MemCoC3  MemCoP4  MemCoC2   MemCoL  xme
mcpy
Description       CRT rep movs   movdqa  lps+hps   movdqa   movdqa   Masm32 Habran's
                       dest-al    psllq CeleronM  dest-al   src-al  library  Ferrari
Code size           ?       70      291      222      200      269       33
104
------------------------------------------------------------------------------------
2048, d0s0-0      133      184      205      203      202      204      184    238
2048, d1s1-0      225      206      226      223      227      223      225    249
2048, d7s7-0      225      208      229      225      228      216      225    249
2048, d7s8-1      228      223      501      367      209      219      221    246
2048, d7s9-2      225      218      498      365      206      217      221    245
2048, d8s7+1      221      217      502      390      206      219      221    244
2048, d8s8-0      221      205      238      229      232      235      221    244
2048, d8s9-1      222      218      492      365      204      218      222    245
2048, d9s7+2      220      217      488      390      206      219      221    244
2048, d9s8+1      226      218      491      390      206      219      221    245
2048, d9s9-0      221      206      224      222      224      226      221    245
2048, d15s15      221      206      226      222      225      226      221    245


--- ok ---

It is interesting how my code has steady speed in different sizes
and it is interesting how older processors perform in different way than newer

thank you JJ for taking time to write testing programs :t
however, I suspect that you are puling my leg because I don't have time nor desire to learn your BSIC$  ;)
(for the reason I mentioned before)
when I talk about a beauty of the source code I talk about visual effect ,readability and functionality
sometime your programs can be maybe even faster than someone else's but no one will try to read it
because most of your MULTO IMPORTANTE routines are hidden either in $$$$$ macros or %$#% external functions
however, it is a pleasure to exchange opinions and diversity in programming technics  :biggrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 10:28:04 AM
Japheth,
     
QuoteThe fastest machine that I have available is an 5 year old AMD 64 X2 5000+.
I saw on Google that they are advertising new laptops for $249 dollars (probably with AVX) :biggrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 22, 2013, 11:03:36 AM
actually, Jochen's MasmBasic is a very productive library
you can bet many of the routines are quite fast
and - many of the functions aren't found in the masm32 library
i would use it more often, myself, except for one thing....

i am trying to learn assembler for windows
high-level constructs mask the assembler code i am trying to learn
the same may be said for many of your macros
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 01:24:53 PM
hi dedndave,

Quoteactually, Jochen's MasmBasic is a very productive library
you can bet many of the routines are quite fast
and - many of the functions aren't found in the masm32 library
there is no doubt about :t
we are talking here about readability of sources  :bgrin:
as soon as I look at his source code I feel like piercing my eyes  with a cactus torn
programs that look like this:"LET$!@#$%^&*@#$%^&*!"
who can have now-days enough patience and concentration to follow this code
"Mission Impossible 32" with JJ2007 as main actor (Tom Cruise refused the role because of the age) 
and he is hiding his most important sources from public eyes like double agent 007
another drawback is that Jochen's MasmBasic is 32 bit and I am programming only 64 bit

I love assembler that's why I joined to this forum otherwise I would be a member of some BASIC community

please don't tell to JJ about our conversation, I don't want him to feel bad because I like him and appreciate his brains

Macros are helpful to make programs more readable but they should be visible to programmers and named properly :biggrin:






Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 22, 2013, 01:51:55 PM
thanks Gunther for your contribution to this topic :t
QuoteIntel(R) Core(TM) i7-3770 CPU @ 3.40GHz (SSE4)
Speedy Gonzales like my "Ferrari Testarossa xmemcpy"
(AVX tires would make it even faster)
(https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRIvLk3BkI91urzhSOVGY3L48NHdusYFcSO4Q9XukpSpHx9A398uQ)
thanks to our God Father JJ Corleone for naming it so

Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 24, 2013, 09:04:41 AM
Japheth,
QuoteI fully agree! However - almost 100% faster than MB - which allegedly is already rocket-science? How is this possible? You must do something wrong...
I found this explanation in "INTEL® 64 AND IA-32 PROCESSOR ARCHITECTURES" manual
Quote
2.3.5.1 Efficient Handling of Alignment Hazards
The cache and memory subsystems handles a significant percentage of instructions
in every workload. Different address alignment scenarios will produce varying performance
impact for memory and cache operations. For example, 1-cycle throughput of
L1 (see Table 2-21) generally applies to naturally-aligned loads from L1 cache. But
using unaligned load instructions (e.g. MOVUPS, MOVUPD, MOVDQU, etc.) to access
data from L1 will experience varying amount of delays depending on specific microarchitectures
and alignment scenarios.
Table 2-21. Performance Impact of Address Alignments of MOVDQU from L1
Throughput (cycle) Intel Core i7           45 nm Intel Core                   65 nm Intel
                                Processor            Microarchitecture            CoreMicroarchitecture
________________________________________________________________________
Alignment Scenario     06_1AH                 06_17H                               06_0FH
16B aligned                   1                            2                                         2
________________________________________________________________________
Not-16B aligned, not
cache split
                                    1                          ~2                                       ~2
________________________________________________________________________
Split cache line
boundary                     ~4.5                     ~20                                      ~20
________________________________________________________________________

Because my procesor is 2.3 gig Core i7 with a lot of cashe
it takes only 1 cycle for ither MOVDQU or MOVDQA
Title: Re: reason to switch to 64 Bit Assembler
Post by: drifter on February 26, 2013, 05:49:48 PM
Intel(R) Core(TM) i7 CPU         860  @ 2.80GHz (SSE4)

Algo           memcpy   MemCo1   MemCo2  MemCoC3  MemCoP4  MemCoC2   MemCoL  xmemcpy
Description       CRT rep movs   movdqa  lps+hps   movdqa   movdqa   Masm32 Habran's
                       dest-al    psllq CeleronM  dest-al   src-al  library  Ferrari
Code size           ?       70      291      222      200      269       33      104
------------------------------------------------------------------------------------
2048, d0s0-0      196      257      252      252      583      235      600      444
2048, d1s1-0      460      274      687      690      284      277      704      444
2048, d7s7-0      468      277      286      286      289      281      293      444
2048, d7s8-1      302      299      732      521      240      253      705      444
2048, d7s9-2      302      300      867      607      256      253      294      445
2048, d8s7+1      294      726      640      551      239      247      265      444
2048, d8s8-0      471      280      700      288      287      282      293      443
2048, d8s9-1      295      303      637      522      272      253      704      444
2048, d9s7+2      300      301      634      553      289      593      703      444
2048, d9s8+1      301      724      633      552      277      247      292      444
2048, d9s9-0      469      670      694      269      696      282      294      446
2048, d15s15      415      280      289      251      289      284      293      447


--- ok ---


on: February 10, 2013, 11:57:14 PM Gunther wrote:
QuoteThere are a few applications which really need more than 4 GB RAM (large data bases for example), but others do not.

The transporters of the future will need to access 7,000,000,000,000,000,000,000,000,000 points of data - that's a 795,807,864,054,000.1 terrabyte address space  :icon_eek:
Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 26, 2013, 09:24:23 PM
Hi drifter,

Quote from: drifter on February 26, 2013, 05:49:48 PM
The transporters of the future will need to access 7,000,000,000,000,000,000,000,000,000 points of data - that's a 795,807,864,054,000.1 terrabyte address space  :icon_eek:

that might be, but that could be reached with a 64 bit architecture. But what's with the hole bunch of other applications? By the way, you'll find a few 64 bit applications in the forum, which I've written.

Gunther
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 26, 2013, 10:08:02 PM
hello drifter,
welcome to the forum :biggrin:
interesting to see the difference in speed with different processors
your is i7 2.8 gig and mine is i7 2.3 but speed is double
I am curies why is that?
Gunter, your is i7 3,4 gig and still slower than qWord's and mine


Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 26, 2013, 10:29:12 PM
here are specifications:

Intel® Core™ i7-3610QM Processor                         Intel® Core™ i7-3770 Processor                     
(6M Cache, up to 3.30 GHz)                                    (8M Cache, up to 3.90 GHz)     
Specifications                                                         Specifications                 
Essentials                                                              Essentials                                               
Status   Launched                                               Status       Launched                                           
Launch Date   Q2'12                                            Launch Date   Q2'12                                 
Processor Number   i7-3610QM                             Processor Number   i7-3770                                 
# of Cores   4                                                           # of Cores   4                                             
# of Threads   8                                                   # of Threads   8                                           
Clock Speed   2.3 GHz                                         Clock Speed   3.4 GHz                                 
Max Turbo Frequency   3.3 GHz                                 Max Turbo Frequency   3.9 GHz                         
Intel® Smart Cache   6 MB                                      Intel® Smart Cache   8 MB                                 
Bus/Core Ratio   23                                                  Bus/Core Ratio   34                                             
DMI   5 GT/s                                                            DMI   5 GT/s                                             
Instruction Set   64-bit                                             Instruction Set   64-bit                   
Instruction Set Extensions   AVX                                Instruction Set Extensions   SSE4.1/4.2, AVX       
Embedded Options Available   No                          Embedded Options Available   Yes                           
Lithography   22 nm                                             Lithography   22 nm                                           
Max TDP   45 W                                                       Max TDP   77 W                               
Recommended Customer Price   TRAY: $378.00        Recommended Customer Price   TRAY: $294.00                   
                                                                              BOX : $305.00                             
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 26, 2013, 11:55:54 PM
the number of clock cycles it takes for a processor to do something doesn't make a very good benchmark
you are comparing one algo to another
not comparing one cpu to another
Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 27, 2013, 01:07:02 AM
Hi habran,

Quote from: dedndave on February 26, 2013, 11:55:54 PM
the number of clock cycles it takes for a processor to do something doesn't make a very good benchmark

that's the answer.

Gunther
Title: Re: reason to switch to 64 Bit Assembler
Post by: drifter on February 27, 2013, 05:34:08 AM
on: February 26, 2013, 09:24:23 PM Gunther wrote:
Quotebut that could be reached with a 64 bit architecture

I thought 64 bit architecture could only reach 18,446,744,073,709,551,616 bits?

QuoteBy the way, you'll find a few 64 bit applications in the forum, which I've written.

Thanks!  I definately look forward to studying those (once I get up to speed on 32 bits).
Title: Re: reason to switch to 64 Bit Assembler
Post by: drifter on February 27, 2013, 05:45:28 AM
on: February 26, 2013, 10:08:02 PM habran wrote:
Quoteyour is i7 2.8 gig and mine is i7 2.3 but speed is double
I am curies why is that?

I probably didn't have my computer set up for an optimum running of the race - it was just my standard operating configuration.  I did close down all other applications, including the ones I didn't need in the toolbar - there's probably a lot more I could have shut down in task manager.
Title: Re: reason to switch to 64 Bit Assembler
Post by: Gunther on February 27, 2013, 05:53:51 AM
Hi drifter,

Quote from: drifter on February 27, 2013, 05:34:08 AM
I thought 64 bit architecture could only reach 18,446,744,073,709,551,616 bits?

please, have a look here: https://en.wikipedia.org/wiki/64-bit_architecture (https://en.wikipedia.org/wiki/64-bit_architecture)

Gunther
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 27, 2013, 07:02:09 AM
Frederick,
these tests are normally written to run with elevated prioriity
you shouldn't have to close down background apps to get decent results
provided the guy followed a few simple guidelines   :P
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 08:51:51 AM
Quote from: Gunther on February 27, 2013, 01:07:02 AM
Hi habran,

Quote from: dedndave on February 26, 2013, 11:55:54 PM
the number of clock cycles it takes for a processor to do something doesn't make a very good benchmark

that's the answer.
Gunther

.if (Gunther && dedndave)
    would please explain
    to habran
    jmp understand
    nop
    nop
understand:
   return knowlage
.endif
:bgrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: drifter on February 27, 2013, 08:55:13 AM
on: Feburary 26, 2013 at 07:02:09 AM dedndave wrote:
Quotethese tests are normally written to run with elevated prioriity
you shouldn't have to close down background apps to get decent results

I put this computer together a couple of years ago with and I've been very happy with it:

Computer:
MSI P55-GD80 Motherboard w/Intel Core i7CPU @ 2.8 Ghz
2 x 27" Samsung LCD monitors
2 x Seagate ST32000641AS - Barracuda XT 2 TB Hard drives
16 GB RAM
Windows 8 64-bit

Desk:
Apogee Rosetta 800 AD/DA 8-channel converter
Lexicon PCM80 Digital Effects Processor
Lexicon PCM90 Digital Reverberator
Focusrite ISA-110 Limited Edition pre-amp/equalizer
Avalon Vt-737sp Vacuum Tube pre-amp/compressor/equalizer

Miscellaneous:
Neumann U87 Ai Condenser microphone
M-Audio Keystation Pro88 keyboard

Software:
Cakewalk Sonar/Dimension Pro/Rapture (64-bit)
Sony Sound Forge/Acid/Vegas/CD Architect /DVD Architect
Spectrasonics Stylus RMX/Trilian/Omnishpere

and of course:
MASM32
IDA Pro 5.3
Visual Studio 2005
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 09:10:54 AM
hey drifter,
that looks impressive and neat :shock:
how did you pay your wife to arrange all this for you ;)
however, your screens are to high and you get probably quickly tired of looking at it :dazzled:
 
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 27, 2013, 09:16:14 AM
when we run these tests....

do:
compare algo A to algo B on computer 1
compare algo A to algo B on computer 2

do not:
compare algo A on computer 1 to algo A on computer 2
compare algo B on computer 1 to algo B on computer 2
Title: Re: reason to switch to 64 Bit Assembler
Post by: drifter on February 27, 2013, 09:21:49 AM
on: Feburary 26, 2013 at 09:10:54 AM habran wrote:
Quotehow did you pay your wife to arrange all this for you

I have a Russian wife - she still thinks it's her duty...

Quotehowever, your screens are to high and you get probably quickly tired of looking at it :dazzled:

If I have to read something, I take it to kinkos have it printed..
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 09:44:23 AM
drifter
QuoteI have a Russian wife - she still thinks it's her duty...
I am *much *much younger than you but I have learned in my life that
NOTHING is for free in this world ;)




*much = 1 year :lol:
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 09:48:54 AM
Quote from: dedndave on February 27, 2013, 09:16:14 AM
when we run these tests....

do:
compare algo A to algo B on computer 1
compare algo A to algo B on computer 2

do not:
compare algo A on computer 1 to algo A on computer 2
compare algo B on computer 1 to algo B on computer 2

is that how you test which processor is faster? :dazzled:
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 27, 2013, 10:06:51 AM
no - we don't want to know which processor is faster   :P

we want to know which algorithm is best, let's say, overall
Quotedo:
compare algo A to algo B on computer 1
compare algo A to algo B on computer 2

do not:
compare algo A on computer 1 to algo A on computer 2
compare algo B on computer 1 to algo B on computer 2

the real information is in (algo B)/(algo A) on a given processor
you might compare that ratio with the same ratio on another processor

we aren't here to measure cpu's   :t
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 11:03:43 AM
Quote from: dedndave on February 27, 2013, 10:06:51 AM
we aren't here to measure cpu's   :t
so, you think I was rude to be curious :shock:
Quote
interesting to see the difference in speed with different processors
your is i7 2.8 gig and mine is i7 2.3 but speed is double
I am curies why is that?
Gunter, your is i7 3,4 gig and still slower than qWord's and mine

that was not a provocation it was a curios technical question:
to put it that way:
you are a salesman and I ask you as a customer:
"why would I by i7 6.8 gig if i7 2.3 gig runs faster?"  :icon_confused:
and you tell me: "i7 6.8 gig sounds better, you moron" :icon_mrgreen:
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 27, 2013, 01:46:01 PM
a clock cycle on one cpu is not the same as a clock cycle on another (moron)   :lol:

someone posted the comparison of specs - then removed it
but, i suspect the ratio of internal to external bus clocks might have something to do with it

if you want to know which is fastest, run a real-time benchmark test
set up a test to measure minutes and seconds to accomplish some specific task
then run the same test on both machines
you want it to be fairly long, say a few minutes or so
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 03:05:13 PM
now you talking... :lol:
I thought bicycle, motorcycle, reversecycle, evercycle wtfcycle...  are the same :dazzled:
thanks dedndave :t
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 27, 2013, 09:55:25 PM
dedndave, how do you explain this: :greenclp:
QuoteClock Speed
Clock speed is the rate at which a processor can complete a processing cycle. It is typically measured in megahertz or gigahertz. One megahertz is equal to one million cycles per second, while one gigahertz equals one billion cycles per second. This means a 1.8 GHz processor has twice the clock speed of a 900 MHz processor.

However, it is important to note that a 1.8 GHz CPU is not necessarily twice as fast as a 900 MHz CPU. This is because different processors often use different architectures. For example, one processor may require more clock cycles to complete a multiplication instruction than another processor. If the 1.8 GHz CPU can complete a multiplication instruction in 4 cycles, while the 900 MHz CPU takes 7 cycles, the 1.8 GHz processor will be more than twice as fast as the 900 MHz processor. Conversely, if the 1.8 GHz processor takes more cycles to perform the instruction, it will be less than 2x as fast as the 900 MHz processor.

Other factors, such as a computer's bus speed, cache size, speed of the RAM, and hard drive speed also contribute to the overall performance of the machine. Therefore, while the processor's clock speed is a significant indicator of how fast a computer is, it is not the only factor that matters.

you can find it here:here (http://pc.net/glossary/definition/clockspeed)
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 28, 2013, 12:02:55 AM
that's what i've been trying to tell you - lol
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 28, 2013, 06:02:48 AM
are you sure?
QuoteClock speed is the rate at which a processor can complete a processing cycle. It is typically measured in megahertz or gigahertz.
so, if some computer needs 444 cycles and another computer needs 222 cycles  for the same job, does that mean that the one with 444 cycles is faster because it has bigger number? :exclaim:
Title: Re: reason to switch to 64 Bit Assembler
Post by: dedndave on February 28, 2013, 11:36:20 AM
write a test app to know the answer   :biggrin:
Title: Re: reason to switch to 64 Bit Assembler
Post by: habran on February 28, 2013, 01:07:54 PM
dedndave,
your motto is: "Never give up" :biggrin:
my motto is :  "Never give in" 8)
you and me together unbreakable   :t