reason to switch to 64 Bit Assembler

frktons · February 12, 2013, 11:11:03 AM

Quote from: qWord on February 12, 2013, 07:17:35 AM
Quote from: habran on February 12, 2013, 06:37:57 AMI can not believe that it takes only 2 ticks
you are obviously not able to configure your compiler! Also, looking in the code of my testbench (and yes ... you can't compile it because there are some dependencies I've not include) you will see that I've used a high loop count, which blends out memory access.
Code Select Expand
; MSVC 2010 output, laid out one instruction per line.
; Register roles as visible in the listing:
;   rcx  = destination (copied to rax on exit), rdx = source, r8d = byte count
;   r9   = running destination pointer
;   r10d = number of whole qwords (count >> 3), r8d = leftover bytes (count and 7)
sub_140008A60   proc near
                mov     r10d, r8d
                and     r8d, 7                          ; r8d = tail length, 0..7
                mov     r9, rcx
                shr     r10d, 3
                test    r10d, r10d
                jz      short loc_140008A94             ; no whole qword to copy
                db      66h, 66h, 66h, 66h              ; prefix bytes of the filler nop below
                nop     word ptr [rax+rax+00000000h]

loc_140008A80:                                          ; 8 bytes per iteration
                mov     rax, [rdx]
                add     r9, 8
                add     rdx, 8
                dec     r10d
                mov     [r9-8], rax                     ; mov leaves the flags from dec intact
                jnz     short loc_140008A80

loc_140008A94:
                test    r8b, 4
                jz      short loc_140008AAC
                mov     eax, [rdx]                      ; tail has bit 2 set: copy one dword
                add     r9, 4
                add     rdx, 4
                mov     [r9-4], eax
                add     r8d, 0FFFFFFFCh                 ; r8d -= 4

loc_140008AAC:
                test    r8d, r8d
                jz      short loc_140008AD1
                sub     rdx, r9                         ; rdx = src - dst, so [rdx+r9] addresses the source
                db      66h, 66h, 66h, 66h              ; prefix bytes of the filler nop below
                nop     dword ptr [rax+rax+00000000h]

loc_140008AC0:                                          ; remaining 1..3 bytes, one at a time
                movzx   eax, byte ptr [rdx+r9]
                inc     r9
                dec     r8d
                mov     [r9-1], al
                jnz     short loc_140008AC0

loc_140008AD1:
                mov     rax, rcx                        ; return the original destination
                retn
sub_140008A60   endp

in the attachment a testbench with loop count = 1

BTW: if you are not interested in a serious discussion, you may simply say that instead of this bullsh** parody.

The executable is quite big after unzipping = 190K. What's inside?
I can't believe a simple test on memory copy takes all that code.

habran · February 19, 2013, 03:54:12 PM

Here is the version I was talking about; IMO the fastest ever :t
please prove me wrong

I use xmm4 and ymm4 here because the first four registers (xmm0-xmm3) are used for floating-point values, and xmm4 is volatile as well,
so we don't have to preserve it

Code Select


option win64:0
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
;-----------------------------------------------------------------------
; xmemcpy - copy `count` bytes from `src` to `dest`, 32 bytes at a time.
; The body never references the named PROC parameters; it reads
;   rcx = dest, rdx = src, r8 = count
; Out:   rax = dest (the value rcx had on entry)
; Clobb: rcx, rdx, r8, r9, r10, ymm4, flags
; Returns immediately when dest == src.
; Written with explicit labels and jumps so that it mirrors, instruction
; for instruction, the disassembly shown after this routine.
; NOTE(review): the 16-byte tail uses legacy-SSE movdqu after ymm4 was
; written and no vzeroupper is executed before ret - presumably this
; costs an AVX/SSE transition penalty on some CPUs; confirm by measuring.
;-----------------------------------------------------------------------
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
        mov     rax, rcx                ; return value, taken before rcx advances
        cmp     rcx, rdx
        je      aexit                   ; same address: nothing to copy

        mov     r10, r8
        shr     r10, 5                  ; r10 = number of whole 32-byte blocks
        and     r10, r10                ; sets ZF only; kept as `and` to match the listing
        jz      tail_1
copy_32:
        vmovdqu ymm4, [rdx]
        vmovdqu [rcx], ymm4
        add     rcx, 32
        add     rdx, 32
        dec     r10
        jnz     copy_32

; Tail: bits 0..4 of count are shifted out of r8 one by one; a set bit
; means a chunk of that size (1, 2, 4, 8, 16 bytes) is still to be copied.
tail_1:
        shr     r8, 1
        jnc     tail_2
        mov     r9b, [rdx]
        mov     [rcx], r9b
        inc     rcx
        inc     rdx
tail_2:
        shr     r8, 1
        jnc     tail_4
        mov     r9w, [rdx]
        mov     [rcx], r9w
        add     rcx, 2
        add     rdx, 2
tail_4:
        shr     r8, 1
        jnc     tail_8
        mov     r9d, [rdx]
        mov     [rcx], r9d
        add     rcx, 4
        add     rdx, 4
tail_8:
        shr     r8, 1
        jnc     tail_16
        mov     r9, [rdx]
        mov     [rcx], r9
        add     rcx, 8
        add     rdx, 8
tail_16:
        shr     r8, 1
        jnc     aexit
        movdqu  xmm4, [rdx]             ; last chunk: pointers are not advanced
        movdqu  [rcx], xmm4
aexit:  ret
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

it creates this code:

Code Select


xmemcpy:
00000001`40034220 488bc1          mov     rax,rcx
00000001`40034223 483bca          cmp     rcx,rdx
00000001`40034226 747a            je      xmemcpy+0x82 (00000001`400342a2)
00000001`40034228 4d8bd0          mov     r10,r8
00000001`4003422b 49c1ea05        shr     r10,5
00000001`4003422f 4d23d2          and     r10,r10
00000001`40034232 7415            je      xmemcpy+0x29 (00000001`40034249)
00000001`40034234 c5fe6f22        vmovdqu ymm4,ymmword ptr [rdx]
00000001`40034238 c5fe7f21        vmovdqu ymmword ptr [rcx],ymm4
00000001`4003423c 4883c120        add     rcx,20h
00000001`40034240 4883c220        add     rdx,20h
00000001`40034244 49ffca          dec     r10
00000001`40034247 75eb            jne     xmemcpy+0x14 (00000001`40034234)
00000001`40034249 49d1e8          shr     r8,1
00000001`4003424c 730c            jae     xmemcpy+0x3a (00000001`4003425a)
00000001`4003424e 448a0a          mov     r9b,byte ptr [rdx]
00000001`40034251 448809          mov     byte ptr [rcx],r9b
00000001`40034254 48ffc1          inc     rcx
00000001`40034257 48ffc2          inc     rdx
00000001`4003425a 49d1e8          shr     r8,1
00000001`4003425d 7310            jae     xmemcpy+0x4f (00000001`4003426f)
00000001`4003425f 66448b0a        mov     r9w,word ptr [rdx]
00000001`40034263 66448909        mov     word ptr [rcx],r9w
00000001`40034267 4883c102        add     rcx,2
00000001`4003426b 4883c202        add     rdx,2
00000001`4003426f 49d1e8          shr     r8,1
00000001`40034272 730e            jae     xmemcpy+0x62 (00000001`40034282)
00000001`40034274 448b0a          mov     r9d,dword ptr [rdx]
00000001`40034277 448909          mov     dword ptr [rcx],r9d
00000001`4003427a 4883c104        add     rcx,4
00000001`4003427e 4883c204        add     rdx,4
00000001`40034282 49d1e8          shr     r8,1
00000001`40034285 730e            jae     xmemcpy+0x75 (00000001`40034295)
00000001`40034287 4c8b0a          mov     r9,qword ptr [rdx]
00000001`4003428a 4c8909          mov     qword ptr [rcx],r9
00000001`4003428d 4883c108        add     rcx,8
00000001`40034291 4883c208        add     rdx,8
00000001`40034295 49d1e8          shr     r8,1
00000001`40034298 7308            jae     xmemcpy+0x82 (00000001`400342a2)
00000001`4003429a f30f6f22        movdqu  xmm4,xmmword ptr [rdx]
00000001`4003429e f30f7f21        movdqu  xmmword ptr [rcx],xmm4
00000001`400342a2 c3              ret

and here is version for people without AVX

Code Select


;-----------------------------------------------------------------------
; xmemcpy (no AVX) - copy `count` bytes from `src` to `dest` in 16-byte
; SSE2 blocks, then finish the remaining 0..15 bytes.
; The body reads rcx = dest, rdx = src, r8 = count directly.
; Out:   rax = dest
; Clobb: rcx, rdx, r8, r9, r10, xmm4, flags
;-----------------------------------------------------------------------
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
        mov     rax, rcx                ; keep dest for the return value
        cmp     rcx, rdx
        je      finished                ; dest == src: nothing to do

        mov     r10, r8
        shr     r10, 4                  ; r10 = whole 16-byte blocks
        test    r10, r10
        jz      rest_byte
block_loop:
        movdqu  xmm4, [rdx]
        movdqu  [rcx], xmm4
        add     rcx, 16
        add     rdx, 16
        dec     r10
        jnz     block_loop

; Each shr moves the next low bit of count into CF; CF set means a chunk
; of the matching size is left to copy.
rest_byte:
        shr     r8, 1
        jnc     rest_word
        mov     r9b, [rdx]
        mov     [rcx], r9b
        inc     rcx
        inc     rdx
rest_word:
        shr     r8, 1
        jnc     rest_dword
        mov     r9w, [rdx]
        mov     [rcx], r9w
        add     rcx, 2
        add     rdx, 2
rest_dword:
        shr     r8, 1
        jnc     rest_qword
        mov     r9d, [rdx]
        mov     [rcx], r9d
        add     rcx, 4
        add     rdx, 4
rest_qword:
        shr     r8, 1
        jnc     finished
        mov     r9, [rdx]               ; final chunk: no pointer update needed
        mov     [rcx], r9
finished:
        ret
xmemcpy ENDP

habran · February 20, 2013, 12:08:41 AM

I commented this for the visitors only, not for the members of this forum

Code Select


option win64:0                    ;no extra Win64 code generation requested
OPTION PROLOGUE:NONE              ;no generated prologue/epilogue: just the code below
OPTION EPILOGUE:NONE
;-----------------------------------------------------------------------
; xmemcpy - copy `count` bytes from `src` to `dest` (unaligned data is fine).
; The body reads rcx = dest, rdx = src, r8 = count directly; the named
; PROC parameters are never referenced.
; Out:   rax = dest
; Clobb: rcx, rdx, r8, r9, r10, ymm4 (and the upper halves of all ymm
;        registers, through vzeroupper), flags
; Overlapping buffers are not handled specially (only dest == src is).
;-----------------------------------------------------------------------
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
   mov rax,rcx                    ;return value = dest, captured before rcx advances
   .if (rcx!=rdx)                 ;nothing to copy when src and dest are the same address
      ;bulk transfer: count/32 blocks of 32 bytes each, through ymm4
      .for (r10=r8,r10>>=5¦r10¦rcx+=32,rdx+=32,r10--)
         vmovdqu ymm4,[rdx]       ;unaligned 32-byte load
         vmovdqu [rcx],ymm4       ;unaligned 32-byte store
      .endfor
      ;count is usually not a multiple of 32: its low five bits (at most
      ;31 = 01Fh = 1 1111b) say what is left. Each shr moves one of those
      ;bits into the carry flag; carry set = a chunk of that size remains.
      shr r8,1                    ;bit 0: one byte left?
      .if (CARRY?)
         mov r9b,[rdx]
         mov [rcx],r9b
         inc rcx                  ;larger leftovers are handled by the
         inc rdx                  ;following shifts
      .endif
      shr r8,1                    ;bit 1: one word left?
      .if (CARRY?)
         mov r9w,[rdx]
         mov [rcx],r9w
         add rcx,2
         add rdx,2
      .endif
      shr r8,1                    ;bit 2: one dword left?
      .if (CARRY?)
         mov r9d,[rdx]
         mov [rcx],r9d
         add rcx,4
         add rdx,4
      .endif
      shr r8,1                    ;bit 3: one qword left?
      .if (CARRY?)
         mov r9,[rdx]
         mov [rcx],r9
         add rcx,8
         add rdx,8
      .endif
      shr r8,1                    ;bit 4: one oword (16 bytes) left?
      .if (CARRY?)
         vmovdqu xmm4,[rdx]       ;VEX-encoded on purpose: a legacy-SSE movdqu right
         vmovdqu [rcx],xmm4       ;after the ymm loop causes an AVX/SSE transition stall
      .endif                      ;last chunk, so no pointer update is needed
      vzeroupper                  ;leave clean upper ymm state for SSE code in the caller
   .endif
aexit: ret
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

habran · February 20, 2013, 09:37:06 AM

I thought maybe those who still have old-timer 32-bit machines could feel the blow of the lightning speed in their hair

but be careful, it can blow away that little bit of hair left on your head ;)
so I wrote a 32 bit for them 8)

Code Select


;-----------------------------------------------------------------------
; xmemcpy (32-bit, SSE2) - copy `count` bytes from `src` to `dest`.
; In:    dest, src, count as declared PROC parameters
; Out:   eax = dest
; Clobb: ecx, edx, xmm4, flags (ebx is saved and restored through USES)
; The header needs the PROC keyword: without it the line is not a
; procedure definition and the closing `xmemcpy ENDP` has nothing to end.
;-----------------------------------------------------------------------
xmemcpy PROC uses ebx dest:DWORD,src:DWORD,count:DWORD
   mov ecx,dest
   mov edx,src
   mov ebx,count
   .if (ecx!=edx)                  ;same address: nothing to copy
     ;bulk transfer: count/16 blocks of 16 bytes, eax is the block counter
     .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)
        movdqu xmm4,[edx]
        movdqu [ecx],xmm4
     .endfor
     ;leftover 0..15 bytes: each shr moves the next low bit of count
     ;into the carry flag; carry set = a chunk of that size remains
     shr ebx,1                     ;bit 0: one byte
     .if (CARRY?)
       mov al,[edx]
       mov [ecx],al
       inc ecx
       inc edx
     .endif
     shr ebx,1                     ;bit 1: one word
     .if (CARRY?)
       mov ax,[edx]
       mov [ecx],ax
       add ecx,2
       add edx,2
     .endif
     shr ebx,1                     ;bit 2: one dword
     .if (CARRY?)
       mov eax,[edx]
       mov [ecx],eax
       add ecx,4
       add edx,4
     .endif
     shr ebx,1                     ;bit 3: one qword, moved through xmm4
     .if (CARRY?)
       movq xmm4,[edx]             ;last chunk: no pointer update needed
       movq [ecx],xmm4
     .endif
   .endif
   mov eax,dest                    ;eax was used as scratch, so reload the return value
   ret
xmemcpy ENDP

jj2007 · February 21, 2013, 03:34:58 AM

Doesn't assemble with my version of JWasm. Where is your latest build?
And what does .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--) mean? "Much bigger or equal"??

habran · February 21, 2013, 05:48:21 AM

Hello JJ2007,
my latest build is as usual in the topic ".FOR built in JWasm" http://masm32.com/board/index.php?topic=402.0 :shock:

QuoteAnd what does .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--) mean? "Much bigger or equal"??

and >>=4 means "shift right 4 times"; it produces shr eax,4

it means "Much much much much less" :lol:

habran · February 21, 2013, 06:59:51 AM

Hey qWord,
Cat got your tongue? :icon_eek:

you have the same "Qosmio laptop" as me
did you test the speed?

frktons · February 21, 2013, 10:16:22 AM

Quote from: habran on February 20, 2013, 09:37:06 AM
I thought maybe those who still have 32 bit old-timers machines can feel the blow of the lightning speed in they hair
but be careful, it can blow away that little bit of hair left on your head ;)
so I wrote a 32 bit for them 8)
Code Select Expand
; 32-bit SSE2 copy routine: copies `count` bytes from `src` to `dest`
; and returns dest in eax. ebx is preserved through USES.
; The PROC keyword is required on the header line; without it the
; closing `xmemcpy ENDP` has no procedure to end.
xmemcpy PROC uses ebx dest:DWORD,src:DWORD,count:DWORD
   mov ecx,dest
   mov edx,src
   mov ebx,count
   .if (ecx!=edx)
     ;count/16 blocks of 16 bytes
     .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)
        movdqu xmm4,[edx]
        movdqu [ecx],xmm4
     .endfor
     ;leftover bytes: low four bits of count, one shr per bit
     shr ebx,1
     .if (CARRY?)                  ;1 byte
       mov al,[edx]
       mov [ecx],al
       inc ecx
       inc edx
     .endif
     shr ebx,1
     .if (CARRY?)                  ;2 bytes
       mov ax,[edx]
       mov [ecx],ax
       add ecx,2
       add edx,2
     .endif
     shr ebx,1
     .if (CARRY?)                  ;4 bytes
       mov eax,[edx]
       mov [ecx],eax
       add ecx,4
       add edx,4
     .endif
     shr ebx,1
     .if (CARRY?)                  ;8 bytes, last chunk
       movq xmm4,[edx]
       movq [ecx],xmm4
     .endif
   .endif
   mov eax,dest
   ret
xmemcpy ENDP

Habran, why do you use MOVDQU and not align the memory
pointers to 16 bytes addresses? MOVAPS/MOVDQA are faster.
Unrolling the MOV can be another good option to test.
And if the area to copy is big, > 4 MB , MOVNTDQ is the best
option. Have a look at the old forum and search for CLEARBUFFER.

REP STOSQ is probably faster than your non AVX solution, give it
a shot on 64 bit version.
A last thing. You should post the results of your tests, if you like
to get the attention of somebody on these routines.

Frank

habran · February 21, 2013, 11:05:56 AM

Hi Frank,

Quotewhy do you use MOVDQU and not align the memory

because this routine is created particularly for unaligned data like text or something
I totally agree with you that MOVDQA is much faster than MOVDQU :t
however, if data is aligned to 32 byte I wouldn't need that routine I would just write in my source:

Code Select


    ;Inline copy loop for buffers that are 32-byte aligned.
    ;dest, src and count are placeholders supplied by the surrounding code.
    ;r8 can contain sizeof(buffer)
    ;The loop runs count/32 times; a remainder of count mod 32 bytes is not copied.
    .for (rcx=dest,rdx=src,r8=count,r8>>=5¦r8¦rcx+=32,rdx+=32,r8--)
             vmovdqa ymm4,[rdx]     ;aligned 32-byte load: rdx must be 32-byte aligned
             vmovdqa [rcx],ymm4     ;aligned 32-byte store: rcx must be 32-byte aligned
    .endfor

or for for 16 byte xmm:

Code Select


    ;Inline copy loop for buffers that are 16-byte aligned.
    ;dest, src and count are placeholders supplied by the surrounding code.
    ;r8 can contain sizeof(buffer)
    ;The loop runs count/16 times; a remainder of count mod 16 bytes is not copied.
    .for (rcx=dest,rdx=src,r8=count,r8>>=4¦r8¦rcx+=16,rdx+=16,r8--)
             movdqa xmm4,[rdx]      ;aligned 16-byte load: rdx must be 16-byte aligned
             movdqa [rcx],xmm4      ;aligned 16-byte store: rcx must be 16-byte aligned
    .endfor

QuoteA last thing. You should post the results of your tests

I left it to qWord to do that for me because he likes testing and arguing :P
and I like and appreciate him

Quoteif you like to get the attention of somebody on these routines.

I don't give a damn about attention, take it or leave it 8)

habran · February 21, 2013, 11:16:11 AM

Frank,
IMO it is not always advisable to align data to 16 or 32 bytes :(
if you have STRUCT in 32 bit program you align it to 4
in 64 bit logically is to align it to 8
however, when you work with big data transfers, then it is logical to use the largest alignment your machine can afford

habran · February 21, 2013, 11:44:16 AM

this version is even more optimized than the former one and has a more logical order;
it can also be faster for less than 32 bytes of data:

Code Select


option win64:0
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
;-----------------------------------------------------------------------
; xmemcpy - copy `count` bytes from `src` to `dest`, small pieces first.
; The leftover (count mod 32) is copied first, one chunk per low bit of
; count; after those five shifts r8 holds count/32, which then drives
; the 32-byte block loop.
; The body reads rcx = dest, rdx = src, r8 = count directly.
; Out:   rax = dest
; Clobb: rcx, rdx, r8, r9, ymm4 (and the upper halves of all ymm
;        registers, through vzeroupper), flags
;-----------------------------------------------------------------------
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
    mov rax,rcx                     ;return value = dest
    .if (rcx!=rdx)                  ;same address: nothing to copy
        shr r8,1                    ;bit 0 of count -> carry: 1 byte
        .if (CARRY?)
            mov r9b,[rdx]
            mov [rcx],r9b
            inc rcx
            inc rdx
        .endif
        shr r8,1                    ;bit 1: 2 bytes
        .if (CARRY?)
            mov r9w,[rdx]
            mov [rcx],r9w
            add rcx,2
            add rdx,2
        .endif
        shr r8,1                    ;bit 2: 4 bytes
        .if (CARRY?)
            mov r9d,[rdx]
            mov [rcx],r9d
            add rcx,4
            add rdx,4
        .endif
        shr r8,1                    ;bit 3: 8 bytes
        .if (CARRY?)
            mov r9,[rdx]
            mov [rcx],r9
            add rcx,8
            add rdx,8
        .endif
        shr r8,1                    ;bit 4: 16 bytes
        .if (CARRY?)
            vmovdqu xmm4,[rdx]      ;VEX-encoded so AVX and legacy-SSE forms are not
            vmovdqu [rcx],xmm4      ;mixed inside this routine
            add rcx,16
            add rdx,16
        .endif
        ;r8 is now count/32: the number of 32-byte blocks still to copy
        .for (¦r8¦rcx+=32,rdx+=32,r8--)
            vmovdqu ymm4,[rdx]
            vmovdqu [rcx],ymm4
        .endfor
        vzeroupper                  ;leave clean upper ymm state for SSE code in the caller
    .endif
aexit: ret
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

jj2007 · February 21, 2013, 11:46:57 AM

Quote from: habran on February 21, 2013, 05:48:21 AM
Hello JJ2007,
my latest build is as usual in the topic ".FOR built in JWasm" http://masm32.com/board/index.php?topic=402.0

Doesn't work on XP: "Not a valid Win32 app", access denied.

habran · February 21, 2013, 12:08:50 PM

sorry JJ2007, :(
it doesn't work on XP

Quotebinaries need at least Windows version 6
(Japheth)

however, there is a workaround for that
the source code is in the folder and you can compile it yourself, if it's not too much hassle
just replace these two files in JW209s folder
if you don't have M$VC you can compile it with PelesC
but I don't believe you have enough energy to go through all that trouble

prove me wrong, I dare you :P

jj2007 · February 21, 2013, 12:27:12 PM

The standard JWasm works just fine on XP, I use it every day. And, no, I won't try to compile it myself. It is not a question of energy, though. I am too wise to invest my time in trying to compile a major C app

habran · February 21, 2013, 01:14:23 PM

Wise man JJ2007

believe it or not that excellent JWasm is written in C
and Japheth had to create binaries from it
how do you think he created it, by laying on it for four weeks or something ::)
NO!!! he compiled it!!!! and it looks like he did not die of it
C is not a plug it is a programming language for Christ sake
don't be a chicken, roll your sleeves and get dirty
No pain no gain!!!

The MASM Forum

News:

reason to switch to 64 Bit Assembler

frktons

habran

habran

habran

jj2007

habran

habran

frktons

habran

habran

habran

jj2007

habran

jj2007

habran