I have tested two separate memory copy algorithms: the first is a simple REP MOVSQ version, the second an unaligned XMM version. The XMM version is faster, about 20% on this Haswell, but not appreciably so, and I wondered if anyone has done any work on a genuinely fast version in 64 bit.
These are the two versions so far, along with a couple of rough, untested sketches of other approaches that may be worth timing. The XMM version does not yet have the tail trimmer for uneven byte counts, but that does not matter for testing purposes.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
mcopy64 proc
; rcx = source address
; rdx = destination address
; r8 = byte count
; --------------
; save rsi & rdi
; --------------
mov r11, rsi
mov r10, rdi
cld ; copy in the forward direction
mov rsi, rcx ; source
mov rdi, rdx ; destination
mov rcx, r8
shr rcx, 3 ; qword count = byte count / 8
rep movsq ; copy 8 bytes at a time
mov rcx, r8
and rcx, 7 ; remaining 0 to 7 tail bytes
rep movsb ; copy the tail a byte at a time
; -----------------
; restore rsi & rdi
; -----------------
mov rdi, r10
mov rsi, r11
ret
mcopy64 endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
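Out of interest, Haswell also reports the ERMSB (enhanced REP MOVSB) feature, so a plain REP MOVSB variant with no qword/byte split may be worth timing against mcopy64. The following is only a rough sketch in the same register convention, untested here, and the name bcopy64 is just a placeholder.
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
bcopy64 proc
; rcx = source address
; rdx = destination address
; r8 = byte count
; --------------
; save rsi & rdi
; --------------
mov r11, rsi
mov r10, rdi
cld ; copy in the forward direction
mov rsi, rcx ; source
mov rdi, rdx ; destination
mov rcx, r8 ; whole byte count, no separate tail needed
rep movsb ; ERMSB fast path handles the full copy
; -----------------
; restore rsi & rdi
; -----------------
mov rdi, r10
mov rsi, r11
ret
bcopy64 endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤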
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
xmmcopyu proc
; *********************
; unaligned memory copy
; *********************
; rcx = source address
; rdx = destination address
; r8 = byte count (assumed here to be a non-zero multiple of 16, no tail handling yet)
mov r11, r8
shr r11, 4 ; div by 16 for loop count
xor r10, r10 ; zero r10 to use as index
lpst:
movdqu xmm0, [rcx+r10] ; unaligned 16-byte load through the cache
movntdq [rdx+r10], xmm0 ; non-temporal store, bypasses the cache (destination must be 16 byte aligned)
add r10, 16
sub r11, 1
jnz lpst
sfence ; make the non-temporal stores globally visible before returning
ret
xmmcopyu endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
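The next obvious step on this hardware would be 32-byte YMM moves. Here is a rough sketch along the same lines as xmmcopyu - untested, assumes AVX is available, a 32 byte aligned destination for the non-temporal stores, and a byte count that is a non-zero multiple of 32 (no tail handling). The name ymmcopyu is just a placeholder.
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
ymmcopyu proc
; *********************
; unaligned source copy
; *********************
; rcx = source address
; rdx = destination address
; r8 = byte count (assumed non-zero multiple of 32)
mov r11, r8
shr r11, 5 ; div by 32 for loop count
xor r10, r10 ; zero r10 to use as index
ylp:
vmovdqu ymm0, ymmword ptr [rcx+r10] ; unaligned 32-byte load through the cache
vmovntdq ymmword ptr [rdx+r10], ymm0 ; non-temporal store, needs 32 byte aligned destination
add r10, 32
sub r11, 1
jnz ylp
sfence ; make the non-temporal stores globally visible
vzeroupper ; clear the upper YMM state before returning
ret
ymmcopyu endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤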