Here is the 64-bit version without .for:
xmemcpy ENDP
option win64:0
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
;-----------------------------------------------------------------------
; xmemcpy - forward block copy of `count` bytes from `src` to `dest`.
;
; Assembled with PROLOGUE:NONE / EPILOGUE:NONE, so no frame is built and
; the arguments are read straight from registers:
;   In:    rcx = dest, rdx = src, r8 = count (bytes)
;   Out:   rax = dest (the value passed in rcx)
;   Clobb: rcx, rdx, r8, r9, xmm4/ymm4, flags
;
; The count is consumed one bit at a time, low bit first: bits 0..4
; select a single 1, 2, 4, 8 and 16 byte move, and whatever remains
; after five shifts (count >> 5) is the number of 32-byte blocks.
; Data is always copied in ascending address order, so overlapping
; regions are not handled specially. When dest == src nothing is copied.
;-----------------------------------------------------------------------
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
    mov     rax, rcx                    ; return value = original dest
    .if (rcx != rdx)
        shr     r8, 1                   ; CF = count bit 0 -> 1 byte
        .if (CARRY?)
            movzx   r9d, byte ptr [rdx] ; full-register write, no partial-reg merge
            mov     [rcx], r9b
            inc     rcx
            inc     rdx
        .endif
        shr     r8, 1                   ; CF = count bit 1 -> 2 bytes
        .if (CARRY?)
            movzx   r9d, word ptr [rdx]
            mov     [rcx], r9w
            add     rcx, 2
            add     rdx, 2
        .endif
        shr     r8, 1                   ; CF = count bit 2 -> 4 bytes
        .if (CARRY?)
            mov     r9d, [rdx]
            mov     [rcx], r9d
            add     rcx, 4
            add     rdx, 4
        .endif
        shr     r8, 1                   ; CF = count bit 3 -> 8 bytes
        .if (CARRY?)
            mov     r9, [rdx]
            mov     [rcx], r9
            add     rcx, 8
            add     rdx, 8
        .endif
        shr     r8, 1                   ; CF = count bit 4 -> 16 bytes
        .if (CARRY?)
            movdqu  xmm4, [rdx]
            movdqu  [rcx], xmm4
            add     rcx, 16
            add     rdx, 16
        .endif
        ; r8 = number of 32-byte blocks still to copy
        .if (r8)
            .repeat
                vmovdqu ymm4, [rdx]
                vmovdqu [rcx], ymm4
                add     rcx, 32
                add     rdx, 32
                dec     r8              ; must stay last: ZF drives .until
            .until (ZERO?)
            ; The upper half of ymm4 is now dirty. Clear it before
            ; returning so legacy-SSE code in the caller does not pay
            ; an AVX/SSE transition penalty. Kept inside the AVX path
            ; so copies shorter than 32 bytes still execute no
            ; VEX-encoded instruction.
            vzeroupper
        .endif
    .endif
    ret
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef