Most programs these days handle some kind of compressed data (sound, video, images), and "optimizing" the memcpy() function causes problems in exactly this case. Compressed data is often expanded with an overlapping copy, memcpy(data+size, data, count), which relies on the copy proceeding one byte at a time. If data is 'abcd', size is 1, and count is 4, the expected output is 'aaaaa'. Using movsd to improve speed is the hack that breaks this: it moves four bytes at once, so the result is 'aabcd' instead.
I think it would be better to create different versions of memcpy[w|d|q], since the user usually knows the type of data being copied. Making one version that handles all cases gets complicated, and all the test code it needs makes it rather big and, in some cases, slower.
In the test I made above I was wondering whether movdqu was faster than movdqa on aligned data, which seemed a bit odd. I rewrote the test code, but the results still vary randomly from run to run. The test (at least in this case) shows how little there is to gain from using movsd to improve speed. Using SSE to copy aligned data has some benefit, but it is not a huge improvement.
; MEMCPY2.ASM--
; http://www.masm32.com/
;
; make:
; jwasm /coff memcpy2.asm
; link /subsystem:console memcpy2.obj
;
.xlist
include \masm32\include\masm32rt.inc
.686
.xmm
include \masm32\macros\timers.asm
.list
; --- benchmark parameters ---------------------------------------------
MAIN_COUNT = 2 ; number of times the whole table of tests is run (main_loop passes)
LOOP_COUNT = 100 ; calls made inside each timed section; also handed to counter_begin
MAXMEMORY = 40000h ; bytes copied by every call (40000h = 256 KB)
; --- copy routines under test: (destination, source, byte count) --------
memcpy proto :ptr byte, :ptr byte, :dword
memcpyd proto :ptr byte, :ptr byte, :dword
memcpyxmmA proto :ptr byte, :ptr byte, :dword
memcpyxmmU proto :ptr byte, :ptr byte, :dword
.data
a1 dd ? ; pointer returned by GlobalAlloc; released with GlobalFree at exit
m1 dd ? ; 128-byte-aligned address derived from a1 (buffer for the "A" tests)
u1 dd ? ; a1 + 1, i.e. a misaligned address (buffer for the "U" test)
.code
start:
; Program entry point.
; Allocates one buffer of MAXMEMORY+128 bytes, derives an aligned pointer
; (m1) and a misaligned pointer (u1) from it, prints the CPU brand string,
; then runs every copy routine MAIN_COUNT times and prints the cycle count
; reported by counter_end for each one.  Source and destination are the
; same address in every call, so only the copy speed is exercised.
    invoke GlobalAlloc,GMEM_FIXED,MAXMEMORY+128
    mov a1,eax
    test eax,eax
    jnz @F
    exit                            ; allocation failed: nothing to test
@@:
    mov edx,eax
    ; Round UP to the next 128-byte boundary.  The 128 spare bytes requested
    ; above guarantee that m1 .. m1+MAXMEMORY-1 stays inside the block.
    ; (Masking without adding 127 first rounds DOWN and can place m1 before
    ; the start of the allocation.)
    add eax,128-1
    and eax,not (128-1)
    mov m1,eax
    inc edx                         ; a1+1: deliberately misaligned
    mov u1,edx
    push 1
    call ShowCpu ; print brand string and SSE level
    print "---------------------------------------------------------", 13, 10
    mov ecx,MAIN_COUNT
main_loop:
    push ecx                        ; pass counter survives the macros below

; test_start: let the system settle, then open a timed section
test_start macro
    invoke Sleep, 100
    counter_begin LOOP_COUNT, HIGH_PRIORITY_CLASS
endm

; test_end: close the timed section and print "<cycles> TAB <text>"
test_end macro text
    counter_end
    print str$(eax), 9, text, 13, 10
endm

;----------------------------------------------
    test_start
    mov esi,LOOP_COUNT
@@:
    invoke crt_memcpy,m1,m1,MAXMEMORY
    dec esi
    jnz @b
    test_end "cycles for crt_memcpy A"
;----------------------------------------------
    test_start
    mov esi,LOOP_COUNT
@@:
    invoke memcpy,m1,m1,MAXMEMORY
    dec esi
    jnz @b
    test_end "cycles for memcpy A"
;----------------------------------------------
    test_start
    mov esi,LOOP_COUNT
@@:
    invoke memcpyd,m1,m1,MAXMEMORY
    dec esi
    jnz @b
    test_end "cycles for memcpyd A"
;----------------------------------------------
    test_start
    mov esi,LOOP_COUNT
@@:
    invoke memcpyxmmA,m1,m1,MAXMEMORY
    dec esi
    jnz @b
    test_end "cycles for memcpy movdqa A"
;----------------------------------------------
    test_start
    mov esi,LOOP_COUNT
@@:
    invoke memcpyxmmU,m1,m1,MAXMEMORY
    dec esi
    jnz @b
    test_end "cycles for memcpy movdqu A"
;----------------------------------------------
    test_start
    mov esi,LOOP_COUNT
@@:
    invoke memcpyxmmU,u1,u1,MAXMEMORY
    dec esi
    jnz @b
    test_end "cycles for memcpy movdqu U"
;----------------------------------------------
    print "---------------------------------------------------------", 13, 10
    pop ecx
    dec ecx
    jnz main_loop                   ; run the whole table MAIN_COUNT times
    invoke GlobalFree,a1            ; free the original (unadjusted) pointer
    inkey chr$(13, 10, "--- ok ---", 13)
    exit
align 16
;-----------------------------------------------------------------------
; memcpy(s1, s2, count) - byte-wise forward copy using REP MOVSB
; In:    s1 = destination, s2 = source, count = number of bytes
; Out:   eax = s1
; Saves: esi, edi (via USES); ecx is left at zero
;-----------------------------------------------------------------------
memcpy proc uses esi edi s1:ptr byte, s2:ptr byte, count:dword
    mov     eax,s1                  ; eax doubles as the return value
    mov     ecx,count               ; ecx = bytes to move
    mov     esi,s2                  ; esi -> source
    mov     edi,eax                 ; edi -> destination
    rep     movsb
    ret
memcpy endp
align 16
;-----------------------------------------------------------------------
; memcpyd(s1, s2, count) - forward copy, one dword at a time (REP MOVSD)
; In:    s1 = destination, s2 = source, count = number of BYTES
; Out:   eax = s1
; Saves: esi, edi (via USES); clobbers ecx, edx
; Note:  the count mod 4 bytes left after the dword copy are moved with
;        REP MOVSB, so counts that are not a multiple of 4 are copied
;        completely.
;-----------------------------------------------------------------------
memcpyd proc uses esi edi s1:ptr byte, s2:ptr byte, count:dword
    mov edi,s1
    mov esi,s2
    mov ecx,count
    mov eax,edi                     ; return value = destination
    mov edx,ecx                     ; keep the byte count for the tail
    shr ecx,2                       ; ecx = number of whole dwords
    rep movsd
    mov ecx,edx
    and ecx,3                       ; ecx = 0..3 leftover bytes
    rep movsb
    ret
memcpyd endp
align 16
;-----------------------------------------------------------------------
; memcpyxmmA(s1, s2, count) - forward copy with aligned SSE2 moves (MOVDQA)
; In:    s1 = destination, s2 = source; BOTH must be 16-byte aligned
;        count = number of bytes (any value, including 0)
; Out:   eax = s1
; Saves: ebx (via USES), esi, edi (restored after the byte tail)
; Clobb: ecx, edx, xmm0-xmm7, flags
;
; eax holds a biased, negated byte counter (127 - count at the start);
; ADD sets the carry flag when it crosses zero, which ends a loop.
; The copy runs in 128-byte blocks, then 16-byte blocks, then single bytes,
; so counts below 128 and counts that are not a multiple of 128 are handled.
;-----------------------------------------------------------------------
memcpyxmmA proc uses ebx s1:ptr byte, s2:ptr byte, count:dword
    mov edx,s1
    mov ebx,s2
    mov eax,count
    test eax,eax
    jz memcpyxmmA_end               ; nothing to copy
    neg eax
    add eax,127                     ; eax = 127 - count
    jbe memcpyxmmA_16               ; count <= 127: skip the 128-byte loop
    align 16
@@:
    movdqa xmm0,[ebx]
    movdqa xmm1,[ebx+16]
    movdqa xmm2,[ebx+32]
    movdqa xmm3,[ebx+48]
    movdqa xmm4,[ebx+64]
    movdqa xmm5,[ebx+80]
    movdqa xmm6,[ebx+96]
    movdqa xmm7,[ebx+112]
    movdqa [edx],xmm0
    movdqa [edx+16],xmm1
    movdqa [edx+32],xmm2
    movdqa [edx+48],xmm3
    movdqa [edx+64],xmm4
    movdqa [edx+80],xmm5
    movdqa [edx+96],xmm6
    movdqa [edx+112],xmm7
    add ebx,128
    add edx,128
    add eax,128
    jnc @B                          ; carry = fewer than 128 bytes remain
memcpyxmmA_16:
    sub eax,127-15                  ; re-bias: eax = 15 - bytes remaining
    jns memcpyxmmA_tail             ; fewer than 16 bytes remain
@@:
    movdqa xmm0,[ebx]               ; still aligned: pointers moved in 128s
    movdqa [edx],xmm0
    add ebx,16
    add edx,16
    add eax,16
    jnc @B
memcpyxmmA_tail:
    sub eax,15                      ; eax = -(bytes remaining), 0..-15
    jz memcpyxmmA_end
    neg eax
    mov ecx,eax
    xchg esi,ebx                    ; esi -> source, old esi parked in ebx
    xchg edi,edx                    ; edi -> dest,   old edi parked in edx
    rep movsb
    mov esi,ebx                     ; restore the caller's esi/edi
    mov edi,edx
memcpyxmmA_end:
    mov eax,s1
    ret
memcpyxmmA endp
align 16
;-----------------------------------------------------------------------
; memcpyxmmU(s1, s2, count) - forward copy with unaligned SSE2 moves (MOVDQU)
; In:    s1 = destination, s2 = source (no alignment requirement)
;        count = number of bytes (any value, including 0)
; Out:   eax = s1
; Saves: ebx (via USES), esi, edi (restored after the byte tail)
; Clobb: ecx, edx, xmm0-xmm7, flags
;
; eax holds a biased, negated byte counter (127 - count at the start);
; ADD sets the carry flag when it crosses zero, which ends a loop.
; The copy runs in 128-byte blocks, then 16-byte blocks, then single bytes.
;-----------------------------------------------------------------------
memcpyxmmU proc uses ebx s1:ptr byte, s2:ptr byte, count:dword
    mov edx,s1
    mov ebx,s2
    mov eax,count
    ; count = 0 must be filtered out here: NEG 0 / ADD 127 leaves CF=0 and
    ; ZF=0, so the JBE below would fall into the 128-byte loop.
    test eax,eax
    jz memcpyxmmU_end
    neg eax
    add eax,127                     ; eax = 127 - count
    jbe memcpyxmmU_16               ; count <= 127: skip the 128-byte loop
    align 16
@@:
    movdqu xmm0,[ebx]
    movdqu xmm1,[ebx+16]
    movdqu xmm2,[ebx+32]
    movdqu xmm3,[ebx+48]
    movdqu xmm4,[ebx+64]
    movdqu xmm5,[ebx+80]
    movdqu xmm6,[ebx+96]
    movdqu xmm7,[ebx+112]
    movdqu [edx],xmm0
    movdqu [edx+16],xmm1
    movdqu [edx+32],xmm2
    movdqu [edx+48],xmm3
    movdqu [edx+64],xmm4
    movdqu [edx+80],xmm5
    movdqu [edx+96],xmm6
    movdqu [edx+112],xmm7
    add ebx,128
    add edx,128
    add eax,128
    jnc @B                          ; carry = fewer than 128 bytes remain
memcpyxmmU_16:
    sub eax,127-15                  ; re-bias: eax = 15 - bytes remaining
    jns memcpyxmmU_tail             ; fewer than 16 bytes remain
@@:
    movdqu xmm0,[ebx]
    movdqu [edx],xmm0
    add ebx,16
    add edx,16
    add eax,16
    jnc @B
memcpyxmmU_tail:
    sub eax,15                      ; eax = -(bytes remaining), 0..-15
    jz memcpyxmmU_end
    neg eax
    mov ecx,eax
    xchg esi,ebx                    ; esi -> source, old esi parked in ebx
    xchg edi,edx                    ; edi -> dest,   old edi parked in edx
    rep movsb
    mov esi,ebx                     ; restore the caller's esi/edi
    mov edi,edx
memcpyxmmU_end:
    mov eax,s1
    ret
memcpyxmmU endp
ShowCpu proc ; mode:DWORD
;-----------------------------------------------------------------------
; Usage:
;   push 0, call ShowCpu ; simple, no printing, just returns SSE level
;   push 1, call ShowCpu ; prints the brand string and returns SSE level
;
; In:    one dword on the stack (mode); removed by RET 4
; Out:   eax = SSE level = number of these CPUID.1 feature bits that are
;        set: EDX.25 (SSE), EDX.26 (SSE2), ECX.0 (SSE3), ECX.19 (SSE4.1)
; Saves: every other general register (PUSHAD/POPAD)
; Stack: the proc has no frame; the argument is addressed relative to esp
;        as [esp + 80 (buffer) + 32 (PUSHAD) + 4 (return address)].
;-----------------------------------------------------------------------
    pushad
    sub esp, 80 ; create a buffer for the brand string
    mov edi, esp ; point edi to it
    mov eax, 80000000h
    db 0Fh, 0A2h ; cpuid 80000000h: eax = highest extended leaf
    .if eax < 80000004h
        mov byte ptr [edi], 0 ; no brand string leaves: take the "pre-P4" path below
    .else
        xor ebp, ebp
        .Repeat
            lea eax, [ebp+80000002h]
            db 0Fh, 0A2h ; cpuid 80000002h-80000004h
            stosd ; 16 bytes of brand string per leaf: eax, ebx, ecx, edx
            mov eax, ebx
            stosd
            mov eax, ecx
            stosd
            mov eax, edx
            stosd
            inc ebp
        .Until ebp>=3
    .endif
    push 1
    pop eax
    db 0Fh, 0A2h ; cpuid 1
    xor ebx, ebx ; CpuSSE
    xor esi, esi ; add zero plus the carry flag
    bt edx, 25 ; edx bit 25, SSE1
    adc ebx, esi
    bt edx, 26 ; edx bit 26, SSE2
    adc ebx, esi
    bt ecx, esi ; ecx bit 0, SSE3
    adc ebx, esi
    bt ecx, 19 ; ecx bit 19, SSE4.1 (bit 9 is SSSE3, not SSE4)
    adc ebx, esi
    dec dword ptr [esp+4+32+80] ; dec mode in stack
    .if Zero?
        mov edi, esp ; restore pointer to brand string
        .Repeat
            .Break .if byte ptr [edi]!=32 ; mode was 1, so show a string but skip leading blanks
            inc edi
        .Until 0
        .if byte ptr [edi]<32
            print chr$("pre-P4")
        .else
            print edi ; CpuBrand
        .endif
        .if ebx
            print chr$(32, 40, "SSE") ; info on SSE level, 40=(
            print str$(ebx), 41, 13, 10 ; 41=)
        .endif
    .endif
    add esp, 80 ; discard brand buffer (after printing!)
    mov [esp+32-4], ebx ; move ebx into eax stack position - returns eax to main for further use
    ifdef MbBufferInit
        call MbBufferInit
    endif
    popad
    ret 4
ShowCpu endp
end start
AMD Athlon(tm) II X2 245 Processor (SSE3)
---------------------------------------------------------
5436621 cycles for crt_memcpy A
5451494 cycles for memcpy A
5430749 cycles for memcpyd A
5130181 cycles for memcpy movdqa A
5137260 cycles for memcpy movdqu A
9398746 cycles for memcpy movdqu U
---------------------------------------------------------
5424911 cycles for crt_memcpy A
5429803 cycles for memcpy A
5424371 cycles for memcpyd A
5147542 cycles for memcpy movdqa A
5139047 cycles for memcpy movdqu A
9419693 cycles for memcpy movdqu U
---------------------------------------------------------
Intel(R) Core(TM) i3 CPU 540 @ 3.07GHz (SSE4)
---------------------------------------------------------
3768758 cycles for crt_memcpy A
3601358 cycles for memcpy A
3611729 cycles for memcpyd A
3665437 cycles for memcpy movdqa A
3527944 cycles for memcpy movdqu A
4053850 cycles for memcpy movdqu U
---------------------------------------------------------
3910008 cycles for crt_memcpy A
3616456 cycles for memcpy A
3675379 cycles for memcpyd A
4250390 cycles for memcpy movdqa A
3348694 cycles for memcpy movdqu A
4051784 cycles for memcpy movdqu U
---------------------------------------------------------