Something consistent on the 3.3 gig Haswell is that a simple "movsb" copy is faster by a slight amount than a combined "movsq/movsb" algo. The example was originally a test of the register preservation macros but I thought I may as well make something useful out of it as well. I would be interested to see if there is any real difference on different processors.
These are the typical results I get with this test piece.
Warming up . . . .
mcopy = 1407
bcopy = 1313
mcopy = 1375
bcopy = 1344
mcopy = 1375
bcopy = 1313
mcopy = 1406
bcopy = 1359
mcopy = 1391
bcopy = 1328
mcopy = 1375
bcopy = 1313
mcopy = 1375
bcopy = 1312
mcopy = 1422
bcopy = 1312
1390 mcopy average
1324 bcopy average
The test piece source.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
include \masm32\include64\masm64rt.inc
.code
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; ---------------------------------------------------------------------
; Benchmark parameters.
; BENCH_AVG_SHIFT must equal log2(BENCH_RUNS): the averages are produced
; with a right shift rather than a division.
; ---------------------------------------------------------------------
BENCH_BYTES     equ 1024*1024*1024      ; size of each buffer / each copy
BENCH_REPS      equ 10                  ; copies per timed measurement
BENCH_RUNS      equ 8                   ; timed measurements per routine
BENCH_AVG_SHIFT equ 3                   ; log2(BENCH_RUNS)

; ---------------------------------------------------------------------
; entry_point
;
; Program entry. Allocates two BENCH_BYTES buffers, runs one untimed
; warm-up pass with bcopy2, then alternately times mcopy2 and bcopy2
; (BENCH_REPS copies per measurement, BENCH_RUNS measurements each),
; printing every measurement and finally the per-routine average.
; Times are GetTickCount deltas, i.e. milliseconds.
;
; ABI: Windows x64. Never returns to its caller; the process is ended
; with ExitProcess (exit code 0 on success, 1 if an allocation fails).
; ---------------------------------------------------------------------
entry_point proc

    LOCAL pMem1 :QWORD                  ; source buffer
    LOCAL pMem2 :QWORD                  ; destination buffer
    LOCAL cntr  :QWORD                  ; inner repetition counter
    LOCAL tcnt  :QWORD                  ; tick count at start of a measurement
    LOCAL ocnt  :QWORD                  ; outer measurement counter
    LOCAL cnt1  :QWORD                  ; accumulated mcopy2 time
    LOCAL cnt2  :QWORD                  ; accumulated bcopy2 time

    mov cnt1, 0
    mov cnt2, 0

  ; Both buffers must exist before any copy is attempted.
  ; NOTE(review): assumes the SDK alloc() macro yields 0 on failure,
  ; as the underlying Win32 allocators do - confirm against the macro.
    mov pMem1, alloc(BENCH_BYTES)
    cmp pMem1, 0
    je  alloc_failed
    mov pMem2, alloc(BENCH_BYTES)
    cmp pMem2, 0
    je  free_first

    conout " Warming up . . . .",lf,lf

  ; Untimed warm-up pass so that every page of both buffers has been
  ; touched before the first measurement.
    mov cntr, BENCH_REPS
  @@:
    rcall bcopy2,pMem1,pMem2,BENCH_BYTES
    sub cntr, 1
    jnz @B

    mov ocnt, BENCH_RUNS

  reloop:
  ; ---------------------------------------
  ; time BENCH_REPS copies with mcopy2
  ; ---------------------------------------
    invoke SleepEx,10,0                 ; give up the rest of the time slice
    xor eax, eax                        ; select CPUID leaf 0 explicitly
    cpuid                               ; serialise before reading the timer;
                                        ; overwrites rax, rbx, rcx and rdx (rbx is
                                        ; nonvolatile, but this proc never returns)
    mov tcnt, rv(GetTickCount)
    mov cntr, BENCH_REPS
  @@:
    rcall mcopy2,pMem1,pMem2,BENCH_BYTES
    sub cntr, 1
    jnz @B
    invoke GetTickCount
    sub eax, DWORD PTR tcnt             ; 32-bit delta stays correct if the DWORD
                                        ; tick counter wraps; result zero-extends
    add cnt1, rax
    conout " mcopy = ",str$(rax),lf

  ; ---------------------------------------
  ; time BENCH_REPS copies with bcopy2
  ; ---------------------------------------
    invoke SleepEx,10,0
    xor eax, eax
    cpuid
    mov tcnt, rv(GetTickCount)
    mov cntr, BENCH_REPS
  @@:
    rcall bcopy2,pMem1,pMem2,BENCH_BYTES
    sub cntr, 1
    jnz @B
    invoke GetTickCount
    sub eax, DWORD PTR tcnt
    add cnt2, rax
    conout " bcopy = ",str$(rax),lf

  ; ---------------------------------------
    sub ocnt, 1
    jnz reloop

  ; average = total / BENCH_RUNS
    shr cnt1, BENCH_AVG_SHIFT
    shr cnt2, BENCH_AVG_SHIFT

    conout lf
    conout " ",str$(cnt1)," mcopy average",lf
    conout " ",str$(cnt2)," bcopy average",lf

    waitkey

    mfree pMem1
    mfree pMem2

    invoke ExitProcess,0

  ; ---------------------------------------
  ; allocation failure: release what was obtained, report, exit code 1
  ; ---------------------------------------
  free_first:
    mfree pMem1
  alloc_failed:
    conout " Memory allocation failed",lf
    waitkey
    invoke ExitProcess,1

    ret                                 ; not reached, ExitProcess does not return

entry_point endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
NOSTACKFRAME

mcopy2 proc

  ; ---------------------------------------------------------------
  ; Forward memory copy in two stages: the bulk of the data is moved
  ; 8 bytes at a time, then the remaining 0-7 bytes one at a time.
  ;
  ;   rcx = source address
  ;   rdx = destination address
  ;   r8  = byte count
  ;
  ; rsi and rdi are saved and restored with the sav/rst macros.
  ; rcx is used as the repeat counter; rdx and r8 are only read.
  ; No special handling for overlapping buffers.
  ; ---------------------------------------------------------------

    sav rsi
    sav rdi

    mov rdi, rdx                        ; rdi = destination
    mov rsi, rcx                        ; rsi = source
    cld                                 ; copy towards higher addresses

  ; stage 1 : whole qwords
    mov rcx, r8
    shr rcx, 3                          ; rcx = byte count / 8
    rep movsq

  ; stage 2 : leftover bytes
    mov ecx, r8d
    and ecx, 7                          ; rcx = byte count mod 8
    rep movsb

    rst rsi
    rst rdi

    ret

mcopy2 endp

STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
NOSTACKFRAME

bcopy2 proc

  ; ---------------------------------------------------------------
  ; Forward memory copy done entirely with a single "rep movsb".
  ;
  ;   rcx = source address
  ;   rdx = destination address
  ;   r8  = byte count
  ;
  ; rsi and rdi are saved and restored with the sav/rst macros.
  ; rcx is used as the repeat counter; rdx and r8 are only read.
  ; No special handling for overlapping buffers.
  ; ---------------------------------------------------------------

  ; --------------
  ; save rsi & rdi
  ; --------------
    sav rsi
    sav rdi

  ; Clear the direction flag, exactly as mcopy2 does, so that both
  ; benchmarked routines copy upwards regardless of the caller's DF
  ; state and carry the same fixed overhead.
    cld

    mov rsi, rcx                        ; rsi = source
    mov rdi, rdx                        ; rdi = destination
    mov rcx, r8                         ; rcx = number of bytes to move
    rep movsb

  ; -----------------
  ; restore rsi & rdi
  ; -----------------
    rst rsi
    rst rdi

    ret

bcopy2 endp

STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end