This version is even more optimized than the former one and has a more logical order;
it can also be faster for data smaller than 32 bytes:
; Assembler options for the hand-written procedure that follows.
option win64:0          ; NOTE(review): presumably turns off the assembler's automatic Win64 frame/shadow-space handling - confirm against the assembler manual
OPTION PROLOGUE:NONE    ; do not generate prologue code for the following PROC
OPTION EPILOGUE:NONE    ; do not generate epilogue code; the PROC issues its own ret
;-----------------------------------------------------------------------
; xmemcpy - copy `count` bytes from `src` to `dest`.
;
; The arguments are read straight from registers (no prologue code):
;   In:    rcx = dest, rdx = src, r8 = count
;   Out:   rax = dest (as passed in)
;   Clobb: rcx, rdx, r8, r9, xmm4/ymm4, flags
;
; The low five bits of count each select one fixed-size copy of
; 1, 2, 4, 8 and 16 bytes (performed in that order, at the start of the
; buffers). What is left in r8 afterwards is count / 32, the number of
; 32-byte blocks copied by the AVX loop.
;
; Nothing is copied when dest == src. Data is copied front to back with
; no overlap handling. Only the 32-byte loop uses AVX instructions, so
; copies shorter than 32 bytes execute no AVX instruction at all.
;-----------------------------------------------------------------------
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
        mov     rax, rcx                        ; return value = dest
    .if (rcx != rdx)
        shr     r8, 1                           ; CF = count bit 0
        .if (CARRY?)
            movzx   r9d, byte ptr [rdx]         ; full-register load, no partial write of r9
            mov     [rcx], r9b
            inc     rcx
            inc     rdx
        .endif

        shr     r8, 1                           ; CF = count bit 1
        .if (CARRY?)
            movzx   r9d, word ptr [rdx]
            mov     [rcx], r9w
            add     rcx, 2
            add     rdx, 2
        .endif

        shr     r8, 1                           ; CF = count bit 2
        .if (CARRY?)
            mov     r9d, [rdx]
            mov     [rcx], r9d
            add     rcx, 4
            add     rdx, 4
        .endif

        shr     r8, 1                           ; CF = count bit 3
        .if (CARRY?)
            mov     r9, [rdx]
            mov     [rcx], r9
            add     rcx, 8
            add     rdx, 8
        .endif

        shr     r8, 1                           ; CF = count bit 4; r8 = count / 32
        .if (CARRY?)
            movdqu  xmm4, xmmword ptr [rdx]     ; unaligned-safe 16-byte copy
            movdqu  xmmword ptr [rcx], xmm4
            add     rcx, 16
            add     rdx, 16
        .endif

        ; r8 = number of 32-byte blocks still to copy.
        test    r8, r8
        jz      aexit                           ; none: leave without touching AVX state
copy32:
        vmovdqu ymm4, ymmword ptr [rdx]
        vmovdqu ymmword ptr [rcx], ymm4
        add     rcx, 32
        add     rdx, 32
        dec     r8
        jnz     copy32

        ; Clear the upper halves of the YMM registers so that legacy SSE
        ; code in the caller does not pay the AVX/SSE transition penalty.
        vzeroupper
    .endif
aexit:  ret
xmemcpy ENDP
; Switch back to the assembler's default prologue/epilogue generation
; for any procedures defined after this point.
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef