My conclusion from the last test was to keep three versions:
- one for machines without SIMD instructions
- one using SIMD instructions
- and one for AVX and newer using rep movsb (a runtime dispatch sketch follows).
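Selecting among the three would be done once at startup. A minimal sketch, assuming a function-pointer variable memcpy_ptr and variant names memcpy_plain, memcpy_sse and memcpy_movsb (all assumed names, not from the test code), and assuming the CPU supports CPUID leaf 7:

.data
memcpy_ptr dd memcpy_plain ; assumed dispatch variable

.code
select_memcpy proc uses ebx ; CPUID clobbers EBX
mov eax,1
cpuid
test edx,1 shl 26 ; CPUID.01H:EDX bit 26 = SSE2
jz no_simd
mov eax,7
xor ecx,ecx
cpuid ; leaf 7 (check the max leaf first on old CPUs)
test ebx,1 shl 9 ; CPUID.07H:EBX bit 9 = ERMSB (fast rep movsb)
jz sse_only
mov memcpy_ptr,offset memcpy_movsb
ret
sse_only:
mov memcpy_ptr,offset memcpy_sse
ret
no_simd:
mov memcpy_ptr,offset memcpy_plain
ret
select_memcpy endp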
This version uses SIMD instructions and handles overlapping data:

memcpy proc dst, src, count
push esi
push edi
push edx
mov eax,[esp+16] ; dst
mov esi,[esp+20] ; src
mov ecx,[esp+24] ; count
test ecx,-32 ; need at least 32 bytes for the SIMD path
jz copy_0_31
movdqu xmm1,[esi] ; save head bytes (in bounds: count >= 32)
movdqu xmm2,[esi+ecx-16] ; save tail bytes
mov edi,eax
neg edi
and edi,16-1 ; EDI = bytes needed to align dst to 16
mov edx,esi ; get direction of copy
sub edx,eax ; EDX = src - dst
cmp edx,ecx
lea edx,[eax+ecx-16] ; EDX = tail store address (LEA leaves flags intact)
jbe overlapped ; src - dst <= count: dst is below src, copy forwards
add esi,edi ; advance src past the alignment bytes
sub ecx,edi ; count left after alignment
add edi,eax ; EDI = 16-aligned dst
and ecx,dword ptr -16 ; round count down to a multiple of 16
align 16
loop_L:
REPEAT 0 ; unroll factor: REPEAT n emits n extra copies of the block
sub ecx,16
movdqu xmm0,[esi+ecx]
movdqa [edi+ecx],xmm0
jz done
ENDM
sub ecx,16
movdqu xmm0,[esi+ecx]
movdqa [edi+ecx],xmm0
jnz loop_L
align 16
done:
movdqu [eax],xmm1 ; store the saved head bytes
movdqu [edx],xmm2 ; and the saved tail bytes
toend:
pop edx
pop edi
pop esi
ret 12
;----------------------------------------------------------------------
; Overlapping buffers
;----------------------------------------------------------------------
align 16
overlapped:
sub ecx,edi ; count left after the alignment bytes
and ecx,dword ptr -16 ; round down to a multiple of 16
add edi,ecx ; alignment bytes + rounded count
add esi,edi ; ESI = end of the aligned span in src
add edi,eax ; EDI = end of the aligned span in dst
neg ecx ; negative offset, counts up to zero
align 16
loop_R:
movdqu xmm0,[esi+ecx] ; copy forwards (safe: dst is below src)
movdqa [edi+ecx],xmm0
add ecx,16
jnz loop_R
jmp done
;----------------------------------------------------------------------
; Copy 0..31 bytes
;----------------------------------------------------------------------
align 4
copy_0_31:
test ecx,ecx
jz toend ; count == 0
test ecx,-2
jz copy_1 ; count == 1
test ecx,-4
jz copy_2_3 ; count == 2..3
test ecx,-8
jz copy_4_7 ; count == 4..7
test ecx,-16
jz copy_8_15 ; count == 8..15
movdqu xmm1,[esi] ; count == 16..31: one XMM pair covers it
movdqu xmm2,[esi+ecx-16]
lea edx,[eax+ecx-16]
jmp done
align 4
copy_8_15:
movq xmm0,[esi+ecx-8] ; last 8 bytes
movq xmm1,[esi] ; first 8 bytes (loaded before the stores, overlap-safe)
movq [eax],xmm1
movq [eax+ecx-8],xmm0
jmp toend
align 4
copy_4_7:
mov edi,[esi]
mov esi,[esi+ecx-4]
mov [eax],edi
mov [eax+ecx-4],esi
jmp toend
align 4
copy_2_3:
mov di,[esi]
mov si,[esi+ecx-2]
mov [eax+ecx-2],si
mov [eax],di
jmp toend
align 4
copy_1:
mov cl,[esi]
mov [eax],cl
jmp toend
memcpy endp
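Because the proc tolerates overlap, it can be called like memmove. A small usage sketch, shifting a buffer up by one byte in place (buf is an assumed label):

push 100 ; count
push offset buf ; src
push offset buf+1 ; dst overlaps src, one byte higher
call memcpy ; callee cleans the stack (ret 12)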
However, this was faster on newer machines:

memcpy proc dst, src, count
push esi
push edi
mov eax,[esp+12] ; dst
mov esi,[esp+16] ; src
mov ecx,[esp+20] ; count
mov edi,eax ; EDI = dst
cmp eax,esi
ja @F ; dst above src: copy backwards to survive overlap
rep movsb ; forward copy (DF = 0)
pop edi
pop esi
ret 12
@@:
lea esi,[esi+ecx-1] ; point at the last byte of src
lea edi,[edi+ecx-1] ; and of dst
std ; copy downwards
rep movsb
cld ; restore the direction flag
pop edi
pop esi
ret 12
memcpy endp
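The gain on newer machines comes from the fast-string / enhanced rep movsb (ERMSB) support reported in CPUID.07H:EBX bit 9, which the dispatch sketch above tests for. As far as I know only the forward (cld) direction is accelerated, so the std path still moves one byte at a time, and on pre-ERMSB CPUs like the Athlon below the whole routine falls back to slow byte copies.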
I added the function copying 128 bytes to the test:

AMD Athlon(tm) II X2 245 Processor (SSE3)
----------------------------------------------
--------- short: 31
560692 cycles - ( 0) proc_0: ??? crt_memcpy *
1040554 cycles - ( 44) proc_1: AVX - 1 byte *
620459 cycles - (242) proc_2: 386 - 4 byte *
381270 cycles - (245) proc_3: SSE - 16 byte *
1822737 cycles - (164) proc_4: SSE - 128 byte
--------- short: 271
506692 cycles - ( 0) proc_0: ??? crt_memcpy *
1101216 cycles - ( 44) proc_1: AVX - 1 byte *
455698 cycles - (242) proc_2: 386 - 4 byte *
210824 cycles - (245) proc_3: SSE - 16 byte *
394439 cycles - (164) proc_4: SSE - 128 byte
--------- short: 2014
931445 cycles - ( 0) proc_0: ??? crt_memcpy *
3310250 cycles - ( 44) proc_1: AVX - 1 byte *
1262714 cycles - (242) proc_2: 386 - 4 byte *
448062 cycles - (245) proc_3: SSE - 16 byte *
574491 cycles - (164) proc_4: SSE - 128 byte
--------- aligned: 262159
1314077 cycles - ( 0) proc_0: ??? crt_memcpy *
3795774 cycles - ( 44) proc_1: AVX - 1 byte *
1363916 cycles - (242) proc_2: 386 - 4 byte *
992821 cycles - (245) proc_3: SSE - 16 byte *
1061304 cycles - (164) proc_4: SSE - 128 byte
--------- unaligned: 262159
1312767 cycles - ( 0) proc_0: ??? crt_memcpy *
3794592 cycles - ( 44) proc_1: AVX - 1 byte *
1352670 cycles - (242) proc_2: 386 - 4 byte *
995327 cycles - (245) proc_3: SSE - 16 byte *
1326086 cycles - (164) proc_4: SSE - 128 byte
result: * memcpy = memmove
3028304 cycles - proc_3: SSE - 16 byte *
4625673 cycles - proc_0: ??? crt_memcpy *
5055457 cycles - proc_2: 386 - 4 byte *
5179057 cycles - proc_4: SSE - 128 byte
13042386 cycles - proc_1: AVX - 1 byte *
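For reference, a cycle count like the ones above can be taken with a serializing rdtsc pair. A minimal sketch, not the actual test harness (dst_buf and src_buf are assumed labels, and only the low 32 bits of the TSC are kept):

xor eax,eax
cpuid ; serialize; clobbers EAX..EDX
rdtsc
mov esi,eax ; low 32 bits of the start count
push 2014 ; count
push offset src_buf
push offset dst_buf
call memcpy ; callee cleans the stack (ret 12)
rdtsc
sub eax,esi ; approximate elapsed cycles in EAX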