News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

reason to switch to 64 Bit Assembler

Started by habran, February 10, 2013, 08:03:46 PM

Previous topic - Next topic

frktons

Quote from: qWord on February 12, 2013, 07:17:35 AM
Quote from: habran on February 12, 2013, 06:37:57 AMI can not believe that it takes only 2 ticks
you are obviously not able to configure your compiler! Also, looking in the code of my testbench (and yes ... you can't compile it because there are some dependencies I've not included) you will see that I've used a high loop count, which blends out memory access.
; MSVC 2010
; memcpy-style routine (disassembly listing of compiler output).
; In:  rcx = destination, rdx = source, r8d = byte count (only the low 32 bits of r8 are read)
; Out: rax = the destination pointer that was passed in
; Strategy: copy count/8 qwords, then one dword if bit 2 of the remainder is set,
;           then the last 0..3 bytes one at a time.
sub_140008A60   proc near

                mov     r10d, r8d               ; r10d = count
                and     r8d, 7                  ; r8d  = count mod 8 (tail bytes after the qword loop)
                mov     r9, rcx                 ; r9   = running destination; rcx is kept for the return value
                shr     r10d, 3                 ; r10d = number of whole qwords
                test    r10d, r10d
                jz      short loc_140008A94     ; fewer than 8 bytes: skip the qword loop
                db      66h, 66h, 66h, 66h      ; prefix bytes + long NOP: padding in front of the loop
                nop     word ptr [rax+rax+00000000h] ; label (presumably to align the loop entry)

loc_140008A80:                                  ; qword loop: 8 bytes per iteration
                mov     rax, [rdx]
                add     r9, 8
                add     rdx, 8
                dec     r10d
                mov     [r9-8], rax             ; store at the pre-increment destination; MOV keeps ZF from DEC
                jnz     short loc_140008A80

loc_140008A94:
                test    r8b, 4                  ; at least 4 tail bytes left?
                jz      short loc_140008AAC
                mov     eax, [rdx]              ; copy one dword
                add     r9, 4
                add     rdx, 4
                mov     [r9-4], eax
                add     r8d, 0FFFFFFFCh         ; r8d -= 4 (adds -4), leaving 0..3 bytes

loc_140008AAC:
                test    r8d, r8d
                jz      short loc_140008AD1     ; nothing left to copy
                sub     rdx, r9                 ; rdx = src - dest, so [rdx+r9] addresses the source byte
                db      66h, 66h, 66h, 66h      ; padding in front of the byte loop (see above)
                nop     dword ptr [rax+rax+00000000h]

loc_140008AC0:                                  ; byte loop: only r9 has to be advanced
                movzx   eax, byte ptr [rdx+r9]
                inc     r9
                dec     r8d
                mov     [r9-1], al
                jnz     short loc_140008AC0

loc_140008AD1:
                mov     rax, rcx                ; return the original destination pointer
                retn
sub_140008A60   endp


in the attachment a testbench with loop count = 1

BTW: if you are not interested in a serious discussion, you may simply say that instead of this bullsh**  parody.

The executable is quite big after unzipping = 190K. What's inside?
I can't believe a simple test on memory copy takes all that code.
There are only two days a year when you can't do anything: one is called yesterday, the other is called tomorrow, so today is the right day to love, believe, do and, above all, live.

Dalai Lama

habran

here is the version that I was talking about, IMO the fastest ever  :t
please prove me wrong  :biggrin:
I use here xmm4 and ymm4 because first 4 registers are used in float calculation and this one is volatile as well
so we don't have to preserve it

;-----------------------------------------------------------------------
; xmemcpy - copy count bytes from src to dest using unaligned AVX moves.
; No prologue/epilogue is generated, so the arguments are used straight
; from their registers: rcx = dest, rdx = src, r8 = count.
; Out:      rax = dest
; Modifies: rcx, rdx, r8, r9, r10, ymm4, flags
; Method:   copy count/32 blocks of 32 bytes, then walk bits 0..4 of
;           count (1, 2, 4, 8, 16 bytes) to copy the remaining 0..31 bytes.
; NOTE(review): the tail uses legacy-SSE movdqu after VEX-encoded ymm
;           moves and there is no vzeroupper before ret - worth checking
;           for AVX/SSE transition penalties.
;-----------------------------------------------------------------------
option win64:0                  ; JWasm-specific option (presumably turns off automatic Win64 frame handling - confirm in the JWasm docs)
OPTION PROLOGUE:NONE            ; no automatic prologue ...
OPTION EPILOGUE:NONE            ; ... and no automatic epilogue: ret stays a plain ret
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
   mov rax,rcx                  ; return value = dest, captured before rcx is advanced
   .if (rcx!=rdx)               ; nothing to do when source and destination are the same address
         ; r10 = count / 32; loop while r10 != 0, advancing both pointers by 32
         .for (r10=r8,r10>>=5¦r10¦rcx+=32,rdx+=32,r10--)   
            vmovdqu ymm4,[rdx]  ; load 32 bytes (no alignment requirement)
            vmovdqu [rcx],ymm4  ; store 32 bytes
        .endfor
; r8 still holds the full count; each shr moves the next low bit into CF
shr r8,1
         .if (CARRY?)           ; bit 0 set: copy 1 byte
mov r9b,[rdx]
mov [rcx],r9b
inc rcx
inc rdx
.endif
         shr r8,1
.if (CARRY?)                    ; bit 1 set: copy 2 bytes
mov r9w,[rdx]
mov [rcx],r9w
add rcx,2
add rdx,2
.endif
         shr r8,1
         .if (CARRY?)           ; bit 2 set: copy 4 bytes
mov r9d,[rdx]
mov [rcx],r9d
add rcx,4
add rdx,4
.endif
         shr r8,1
         .if (CARRY?)           ; bit 3 set: copy 8 bytes
            mov r9,[rdx]
            mov [rcx],r9
    add rcx,8
    add rdx,8
.endif
         shr r8,1
         .if (CARRY?)           ; bit 4 set: copy 16 bytes; last step, so the pointers are not advanced
           movdqu xmm4,[rdx]
           movdqu [rcx],xmm4
.endif
   .endif   
aexit: ret             
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef     ; restore the default prologue/epilogue generation
OPTION EPILOGUE:EpilogueDef


it creates this code:


xmemcpy:
00000001`40034220 488bc1          mov     rax,rcx
00000001`40034223 483bca          cmp     rcx,rdx
00000001`40034226 747a            je      xmemcpy+0x82 (00000001`400342a2)
00000001`40034228 4d8bd0          mov     r10,r8
00000001`4003422b 49c1ea05        shr     r10,5
00000001`4003422f 4d23d2          and     r10,r10
00000001`40034232 7415            je      xmemcpy+0x29 (00000001`40034249)
00000001`40034234 c5fe6f22        vmovdqu ymm4,ymmword ptr [rdx]
00000001`40034238 c5fe7f21        vmovdqu ymmword ptr [rcx],ymm4
00000001`4003423c 4883c120        add     rcx,20h
00000001`40034240 4883c220        add     rdx,20h
00000001`40034244 49ffca          dec     r10
00000001`40034247 75eb            jne     xmemcpy+0x14 (00000001`40034234)
00000001`40034249 49d1e8          shr     r8,1
00000001`4003424c 730c            jae     xmemcpy+0x3a (00000001`4003425a)
00000001`4003424e 448a0a          mov     r9b,byte ptr [rdx]
00000001`40034251 448809          mov     byte ptr [rcx],r9b
00000001`40034254 48ffc1          inc     rcx
00000001`40034257 48ffc2          inc     rdx
00000001`4003425a 49d1e8          shr     r8,1
00000001`4003425d 7310            jae     xmemcpy+0x4f (00000001`4003426f)
00000001`4003425f 66448b0a        mov     r9w,word ptr [rdx]
00000001`40034263 66448909        mov     word ptr [rcx],r9w
00000001`40034267 4883c102        add     rcx,2
00000001`4003426b 4883c202        add     rdx,2
00000001`4003426f 49d1e8          shr     r8,1
00000001`40034272 730e            jae     xmemcpy+0x62 (00000001`40034282)
00000001`40034274 448b0a          mov     r9d,dword ptr [rdx]
00000001`40034277 448909          mov     dword ptr [rcx],r9d
00000001`4003427a 4883c104        add     rcx,4
00000001`4003427e 4883c204        add     rdx,4
00000001`40034282 49d1e8          shr     r8,1
00000001`40034285 730e            jae     xmemcpy+0x75 (00000001`40034295)
00000001`40034287 4c8b0a          mov     r9,qword ptr [rdx]
00000001`4003428a 4c8909          mov     qword ptr [rcx],r9
00000001`4003428d 4883c108        add     rcx,8
00000001`40034291 4883c208        add     rdx,8
00000001`40034295 49d1e8          shr     r8,1
00000001`40034298 7308            jae     xmemcpy+0x82 (00000001`400342a2)
00000001`4003429a f30f6f22        movdqu  xmm4,xmmword ptr [rdx]
00000001`4003429e f30f7f21        movdqu  xmmword ptr [rcx],xmm4
00000001`400342a2 c3              ret


and here is version for people without  AVX

;-----------------------------------------------------------------------
; xmemcpy (SSE2 variant) - copy count bytes from src to dest using
; unaligned 16-byte moves. The body reads the arguments from registers:
; rcx = dest, rdx = src, r8 = count.
; Out:      rax = dest
; Modifies: rcx, rdx, r8, r9, r10, xmm4, flags
; Method:   copy count/16 blocks of 16 bytes, then walk bits 0..3 of
;           count (1, 2, 4, 8 bytes) to copy the remaining 0..15 bytes.
; NOTE(review): the OPTION PROLOGUE/EPILOGUE lines are not shown for this
;           variant - presumably it is assembled under the same options
;           as the AVX version; confirm.
;-----------------------------------------------------------------------
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
   mov rax,rcx                  ; return value = dest, captured before rcx is advanced
   .if (rcx!=rdx)               ; nothing to do when source and destination are the same address
     ; r10 = count / 16; loop while r10 != 0, advancing both pointers by 16
     .for (r10=r8,r10>>=4¦r10¦rcx+=16,rdx+=16,r10--)   
        movdqu xmm4,[rdx]       ; load 16 bytes (no alignment requirement)
        movdqu [rcx],xmm4       ; store 16 bytes
     .endfor
     ; r8 still holds the full count; each shr moves the next low bit into CF
     shr r8,1
     .if (CARRY?)               ; bit 0 set: copy 1 byte
  mov r9b,[rdx]
  mov [rcx],r9b
  inc rcx
  inc rdx
     .endif
     shr r8,1
     .if (CARRY?)               ; bit 1 set: copy 2 bytes
  mov r9w,[rdx]
  mov [rcx],r9w
  add rcx,2
  add rdx,2
     .endif
     shr r8,1
     .if (CARRY?)               ; bit 2 set: copy 4 bytes
  mov r9d,[rdx]
  mov [rcx],r9d
  add rcx,4
  add rdx,4
     .endif
     shr r8,1
     .if (CARRY?)               ; bit 3 set: copy 8 bytes; last step, so the pointers are not advanced
       mov r9,[rdx]
       mov [rcx],r9
   .endif
  .endif   
  ret             
xmemcpy ENDP
Cod-Father

habran

#47
I commented this for the visitors only, not for the members of this forum  :bgrin:

option win64:0                    ;JWasm-specific option (presumably: no automatic Win64 frame handling - confirm in the JWasm docs)
OPTION PROLOGUE:NONE              ;no generated prologue: the body is exactly the code below
OPTION EPILOGUE:NONE              ;no generated epilogue: ret stays a plain ret
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
   mov rax,rcx                    ;return value = dest, saved before rcx is advanced
   .if (rcx!=rdx)                 ;skip everything when src and dest are the same address
         ;bulk copy: r10 = count / 32 blocks, both pointers advance by 32 per block
         .for (r10=r8,r10>>=5¦r10¦rcx+=32,rdx+=32,r10--)   
            vmovdqu ymm4,[rdx]    ;load 32 bytes at once (unaligned load is allowed)
            vmovdqu [rcx],ymm4    ;store them through the ymm4 AVX register
        .endfor                   ;loop ends when r10 reaches zero
  ;count is usually not a multiple of 32, so up to 31 bytes can be left over;
  ;e.g. a remainder of 31 = 01Fh = 1 1111b in the low five bits of r8 (count)
   shr r8,1                       ;shift bit 0 of count into the carry flag
   .if (CARRY?)                   ;carry set: one single byte is left over
    mov r9b,[rdx]                 ;copy that byte to dest
    mov [rcx],r9b                 ;(larger remainders are handled
    inc rcx                       ;by the following shifts)
    inc rdx                       ;advance both pointers by 1
  .endif
  shr r8,1                        ;bit 1 of count: is there a word left over?
  .if (CARRY?)                    ;carry set: copy 2 bytes
    mov r9w,[rdx]                 ;copy the word to dest
    mov [rcx],r9w
    add rcx,2                     ;advance dest by 2
    add rdx,2                     ;and src
   .endif
    shr r8,1                      ;bit 2 of count: is there a dword left over?
   .if (CARRY?)                   ;carry set: copy 4 bytes
    mov r9d,[rdx]                 ;copy the dword to dest
    mov [rcx],r9d
    add rcx,4                     ;advance both src and dest by 4
    add rdx,4
  .endif
  shr r8,1                       ;bit 3 of count: is there a qword left over?
  .if (CARRY?)                   ;carry set: copy 8 bytes
     mov r9,[rdx]               
     mov [rcx],r9
     add rcx,8
     add rdx,8                   ;advance both pointers by 8
   .endif
   shr r8,1                      ;bit 4 of count: is there an oword (16 bytes) left over?
   .if (CARRY?)                   
      movdqu xmm4,[rdx]         
      movdqu [rcx],xmm4          ;last piece of the tail
   .endif                        ;no pointer update needed, nothing follows
   .endif                        ;end of the rcx != rdx block
aexit: ret                       ;rax = dest
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef


Cod-Father

habran

I thought maybe those who still have 32 bit old-timers machines can feel the blow of the lightning speed in their hair :biggrin:
but be careful, it can blow away that little bit of hair left on your head ;)
so I wrote a 32 bit for them 8)

;-----------------------------------------------------------------------
; xmemcpy (32-bit, SSE2) - copy count bytes from src to dest using
; unaligned 16-byte moves.
; In:       dest, src, count (stack arguments)
; Out:      eax = dest
; Modifies: ecx, edx, xmm4, flags (ebx is saved/restored via USES)
; Method:   copy count/16 blocks of 16 bytes, then walk bits 0..3 of
;           count (1, 2, 4, 8 bytes) to copy the remaining 0..15 bytes.
;-----------------------------------------------------------------------
xmemcpy PROC uses ebx dest:DWORD,src:DWORD,count:DWORD
   mov ecx,dest                 ; ecx = running destination
   mov edx,src                  ; edx = running source
   mov ebx,count                ; ebx = count, consumed bit by bit in the tail
   .if (ecx!=edx)               ; nothing to do when source and destination are the same address
     ; eax = count / 16; loop while eax != 0, advancing both pointers by 16
     .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)
        movdqu xmm4,[edx]       ; load 16 bytes (no alignment requirement)
        movdqu [ecx],xmm4       ; store 16 bytes
     .endfor
     ; eax is free again here and serves as scratch for the tail copies
     shr ebx,1
     .if (CARRY?)               ; bit 0 set: copy 1 byte
       mov al,[edx]
       mov [ecx],al
       inc ecx
       inc edx
     .endif
     shr ebx,1
     .if (CARRY?)               ; bit 1 set: copy 2 bytes
       mov ax,[edx]
       mov [ecx],ax
       add ecx,2
       add edx,2
     .endif
     shr ebx,1
     .if (CARRY?)               ; bit 2 set: copy 4 bytes
       mov eax,[edx]
       mov [ecx],eax
       add ecx,4
       add edx,4
     .endif
     shr ebx,1
     .if (CARRY?)               ; bit 3 set: copy 8 bytes; last step, so the pointers are not advanced
       movq xmm4,[edx]
       movq [ecx],xmm4
     .endif
   .endif
   mov eax,dest                 ; return the original destination pointer
   ret
xmemcpy ENDP
Cod-Father

jj2007

Doesn't assemble with my version of JWasm. Where is your latest build?
And what does .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--) mean? "Much bigger or equal"??

habran

Hello JJ2007,
my latest build is as usual in the topic ".FOR built in JWasm" http://masm32.com/board/index.php?topic=402.0 :shock:
QuoteAnd what does .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--) mean? "Much bigger or equal"??
and >>=4 means shift right 4 time    it produces shr eax,4  :biggrin:
it means "Much much much much less" :lol:
Cod-Father

habran

Hey qWord,
Cat got your tongue?  :icon_eek:

you have the same "Qosmio laptop" as me
did you test the speed? :bgrin:
Cod-Father

frktons

Quote from: habran on February 20, 2013, 09:37:06 AM
I thought maybe those who still have 32 bit old-timers machines can feel the blow of the lightning speed in their hair :biggrin:
but be careful, it can blow away that little bit of hair left on your head ;)
so I wrote a 32 bit for them 8)

;-----------------------------------------------------------------------
; xmemcpy (32-bit, SSE2) - copy count bytes from src to dest using
; unaligned 16-byte moves.
; In:       dest, src, count (stack arguments)
; Out:      eax = dest
; Modifies: ecx, edx, xmm4, flags (ebx is saved/restored via USES)
; Method:   copy count/16 blocks of 16 bytes, then walk bits 0..3 of
;           count (1, 2, 4, 8 bytes) to copy the remaining 0..15 bytes.
;-----------------------------------------------------------------------
xmemcpy PROC uses ebx dest:DWORD,src:DWORD,count:DWORD
   mov ecx,dest                 ; ecx = running destination
   mov edx,src                  ; edx = running source
   mov ebx,count                ; ebx = count, consumed bit by bit in the tail
   .if (ecx!=edx)               ; nothing to do when source and destination are the same address
     ; eax = count / 16; loop while eax != 0, advancing both pointers by 16
     .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)
        movdqu xmm4,[edx]       ; load 16 bytes (no alignment requirement)
        movdqu [ecx],xmm4       ; store 16 bytes
     .endfor
     ; eax is free again here and serves as scratch for the tail copies
     shr ebx,1
     .if (CARRY?)               ; bit 0 set: copy 1 byte
       mov al,[edx]
       mov [ecx],al
       inc ecx
       inc edx
     .endif
     shr ebx,1
     .if (CARRY?)               ; bit 1 set: copy 2 bytes
       mov ax,[edx]
       mov [ecx],ax
       add ecx,2
       add edx,2
     .endif
     shr ebx,1
     .if (CARRY?)               ; bit 2 set: copy 4 bytes
       mov eax,[edx]
       mov [ecx],eax
       add ecx,4
       add edx,4
     .endif
     shr ebx,1
     .if (CARRY?)               ; bit 3 set: copy 8 bytes; last step, so the pointers are not advanced
       movq xmm4,[edx]
       movq [ecx],xmm4
     .endif
   .endif
   mov eax,dest                 ; return the original destination pointer
   ret
xmemcpy ENDP


Habran, why do you use MOVDQU and not align the memory
pointers to 16 bytes addresses? MOVAPS/MOVDQA are faster.
Unrolling the MOV can be another good option to test.
And if the area to copy is big, > 4 MB , MOVNTDQ is the best
option. Have a look at the old forum and search for CLEARBUFFER.

REP MOVSQ is probably faster than your non AVX solution, give it
a shot on 64 bit version.
A last thing. You should post the results of your tests, if you like
to get the attention of somebody on these routines.

Frank
There are only two days a year when you can't do anything: one is called yesterday, the other is called tomorrow, so today is the right day to love, believe, do and, above all, live.

Dalai Lama

habran

#53
Hi Frank, :biggrin:
Quotewhy do you use MOVDQU and not align the memory
because this routine is created particularly for unaligned data like text or something
I totally agree with you that MOVDQA is much faster  than MOVDQU  :t
however, if data is aligned to 32 byte I wouldn't need that routine I would just write in my source:


    ;Inline copy loop for buffers that are known to be 32-byte aligned.
    ;r8 can contain sizeof(buffer)
    ;Loads rcx = dest, rdx = src, r8 = count / 32, then copies one 32-byte
    ;block per iteration until r8 reaches zero. Only whole 32-byte blocks are
    ;copied: a remainder of count mod 32 bytes is not handled here.
    .for (rcx=dest,rdx=src,r8=count,r8>>=5¦r8¦rcx+=32,rdx+=32,r8--)
             vmovdqa ymm4,[rdx]   ;aligned 32-byte load  (address must be 32-byte aligned)
             vmovdqa [rcx],ymm4   ;aligned 32-byte store (address must be 32-byte aligned)
    .endfor

or for 16-byte xmm:


    ;Inline copy loop for buffers that are known to be 16-byte aligned.
    ;r8 can contain sizeof(buffer)
    ;Loads rcx = dest, rdx = src, r8 = count / 16, then copies one 16-byte
    ;block per iteration until r8 reaches zero. Only whole 16-byte blocks are
    ;copied: a remainder of count mod 16 bytes is not handled here.
    .for (rcx=dest,rdx=src,r8=count,r8>>=4¦r8¦rcx+=16,rdx+=16,r8--)
             movdqa xmm4,[rdx]    ;aligned 16-byte load  (address must be 16-byte aligned)
             movdqa [rcx],xmm4    ;aligned 16-byte store (address must be 16-byte aligned)
    .endfor


QuoteA last thing. You should post the results of your tests

I left it to qWord to do that for me because he likes testing and arguing :P
and I like and appreciate him :biggrin:

Quoteif you like to get the attention of somebody on these routines.

I don't give a damn about attention, take it or leave it 8)
Cod-Father

habran

Frank,
IMO it is not always advisable to align data to 16 or 32 bytes :(
if you have STRUCT in 32 bit program you align it to 4
in 64 bit logically is to align it to 8
however, when you work with big data transfers then it is logical to align it as big as your machine can afford :biggrin:
Cod-Father

habran

this version is even more optimized than the former and it has a more logical order
as well as it can be faster for less data than 32 bytes:

;-----------------------------------------------------------------------
; xmemcpy - copy count bytes from src to dest (no alignment required).
; No prologue/epilogue is generated, so the arguments are used straight
; from their registers: rcx = dest, rdx = src, r8 = count.
; Out:      rax = dest
; Modifies: rcx, rdx, r8, r9, ymm4 (upper halves of all ymm registers
;           are zeroed by vzeroupper), flags
; Requires: AVX
; Method:   walk bits 0..4 of count first (1, 2, 4, 8, 16 bytes), which
;           leaves r8 = count / 32; then copy that many 32-byte blocks.
;           Short copies therefore never touch the 32-byte loop.
;-----------------------------------------------------------------------
option win64:0
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
    mov rax,rcx                     ; return value = dest, captured before rcx is advanced
    .if (rcx!=rdx)                  ; nothing to do when source and destination are the same address
        shr r8,1                    ; each shr moves the next low bit of count into CF
        .if (CARRY?)                ; bit 0 set: copy 1 byte
            mov r9b,[rdx]
            mov [rcx],r9b
            inc rcx
            inc rdx
        .endif
        shr r8,1
        .if (CARRY?)                ; bit 1 set: copy 2 bytes
            mov r9w,[rdx]
            mov [rcx],r9w
            add rcx,2
            add rdx,2
        .endif
        shr r8,1
        .if (CARRY?)                ; bit 2 set: copy 4 bytes
            mov r9d,[rdx]
            mov [rcx],r9d
            add rcx,4
            add rdx,4
        .endif
        shr r8,1
        .if (CARRY?)                ; bit 3 set: copy 8 bytes
            mov r9,[rdx]
            mov [rcx],r9
            add rcx,8
            add rdx,8
        .endif
        shr r8,1
        .if (CARRY?)                ; bit 4 set: copy 16 bytes
            vmovdqu xmm4,[rdx]      ; VEX-encoded form: avoids mixing legacy SSE with the AVX loop below
            vmovdqu [rcx],xmm4
            add rcx,16
            add rdx,16
        .endif
        ; r8 now equals count / 32: copy the remaining whole 32-byte blocks
        .for (¦r8¦rcx+=32,rdx+=32,r8--)
            vmovdqu ymm4,[rdx]
            vmovdqu [rcx],ymm4
        .endfor
        vzeroupper                  ; clean upper ymm state so SSE code in the caller pays no transition penalty
    .endif
aexit: ret
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Cod-Father

jj2007

Quote from: habran on February 21, 2013, 05:48:21 AM
Hello JJ2007,
my latest build is as usual in the topic ".FOR built in JWasm" http://masm32.com/board/index.php?topic=402.0

Doesn't work on XP: "Not a valid Win32 app", access denied.

habran

sorry JJ2007, :(
it doesn't work on XP
Quotebinaries need at least Windows version 6
(Japheth)
however, there is a workaround for that
source code is in the folder and you can compile it yourself if it's not too much hassle
just replace these two files in JW209s folder
if you don't have M$VC you can compile it with PelesC
but I don't believe you have enough energy to go through all that trouble :dazzled:
prove me wrong, I dare you :P
Cod-Father

jj2007

The standard JWasm works just fine on XP, I use it every day. And, no, I won't try to compile it myself. It is not a question of energy, though. I am too wise to invest my time in trying to compile a major C app :biggrin:

habran

Wise man JJ2007 :biggrin:
believe it or not that excellent JWasm is written in C
and Japheth had to create binaries from it
how do you think he created it, by laying on it for four weeks or something ::)
NO!!! he compiled it!!!! and it looks like he did not die of it
C is not a plug it is a programming language for Christ sake
don't be a chicken, roll your sleeves and get dirty
No pain no gain!!! :bgrin: 
Cod-Father