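Both source and destination memory must be 32-byte aligned, because the loop uses aligned 256-bit non-temporal loads and stores. I wrote this for porting to PowerBASIC, and it copies half a gigabyte in 156 ms on my middle-aged Haswell.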
;-----------------------------------------------------------------------
; aligned_block_copy - copy whole 32-byte blocks with non-temporal
;                      256-bit loads and stores (32-bit MASM syntax)
;
; ABI:   custom register convention, no stack arguments
; In:    eax = src, must be 32-byte aligned
;        edx = dst, must be 32-byte aligned
;        ecx = cnt, byte count; only cnt / 32 whole blocks are copied,
;              the remaining cnt MOD 32 tail bytes are NOT copied
; Out:   eax = src advanced past the copied blocks
;        edx = dst advanced past the copied blocks
;        ecx = 0
; Clobb: flags; when at least one block is copied also ymm0 and the
;        upper 128 bits of every ymm register (vzeroupper)
; Stack: none (prologue/epilogue generation is disabled for this proc)
;
; Notes: the 256-bit form of vmovntdqa requires AVX2.
;-----------------------------------------------------------------------
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
aligned_block_copy proc
        shr     ecx, 5                          ; ecx = number of whole 32-byte blocks
        jz      copy_done                       ; cnt < 32: nothing to copy; entering the loop
                                                ; with ecx = 0 would wrap and run 2^32 times

copy_loop:
        ; Invariant: ecx = blocks remaining (> 0), eax/edx = next src/dst block.
        vmovntdqa ymm0, YMMWORD PTR [eax]       ; aligned 32-byte load
        vmovntdq YMMWORD PTR [edx], ymm0        ; aligned 32-byte non-temporal store
        add     eax, 32
        add     edx, 32
        sub     ecx, 1
        jnz     copy_loop

        sfence                                  ; non-temporal stores are weakly ordered: make
                                                ; them globally visible before any later store
        vzeroupper                              ; avoid AVX/SSE transition penalty in the caller

copy_done:
        ret
aligned_block_copy endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef