This is a very suboptimal AVX-512 memmove, based on Nidud's previous AVX-64 memmove. It was tested and works correctly, but slowly, as mentioned.
I wrote it in UASM because ASMC currently has a problem with AVX-512 instructions.
.xmm
OPTION EVEX:1
option SWITCHSTYLE : ASMSTYLE
option win64 : 6
option casemap:none
.code
switchAVX512_64 proc
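; rcx = destination, rdx = source, r8 = byte count (win64 convention)
; r10 keeps the original destination pointer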
mov r10,rcx
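; counts of 128 bytes or less: dispatch on the exact length and copy with
; at most two (possibly overlapping) loads and stores of the widest suitable size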
.if r8 <= 128
.switch r8
.case 0
ret
.case 1
mov cl,[rdx]
mov [r10],cl
ret
.case 2,3,4
mov cx,[rdx]
mov dx,[rdx+r8-2]
mov [r10+r8-2],dx
mov [r10],cx
ret
.case 5,6,7,8
mov ecx,[rdx]
mov edx,[rdx+r8-4]
mov [r10+r8-4],edx
mov [r10],ecx
ret
.case 9,10,11,12,13,14,15,16
mov rcx,[rdx]
mov rdx,[rdx+r8-8]
mov [r10],rcx
mov [r10+r8-8],rdx
ret
.case 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32
vmovdqu xmm0,[rdx]
vmovdqu xmm1,[rdx+r8-16]
vmovups [r10],xmm0
vmovups [r10+r8-16],xmm1
ret
.case 33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,\
49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
vmovdqu ymm0,[rdx]
vmovdqu ymm1,[rdx+r8-32]
vmovups [r10],ymm0
vmovups [r10+r8-32],ymm1
ret
.case 65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,\
88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,\
110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128
vmovdqu8 zmm0, [rdx]
vmovdqu8 zmm1,[rdx+r8-64]
vmovups [r10],zmm0
vmovups [r10+r8-64],zmm1
ret
.endswitch
.endif
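; more than 128 bytes: load the first and the last 128 bytes up front and
; store them last, so the edges stay correct even if the buffers overlap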
vmovdqu8 zmm2,[rdx]
vmovdqu8 zmm3,[rdx+64]
vmovdqu8 zmm4,[rdx+r8-64]
vmovdqu8 zmm5,[rdx+r8-128]
.if r8 > 256
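; advance source/destination so the copy destination in rcx is 128-byte
; aligned, and round the remaining count in r9 down to a multiple of 128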
mov ecx,r10d
neg ecx
and ecx,128-1
add rdx,rcx
mov r9,r8
sub r9,rcx
add rcx,r10
and r9b,-128
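; if the destination lies above the source, copy the 128-byte blocks
; backwards so an overlapping region is not overwritten before it is read;
; the loop exits when sub sets ZF (the vector moves do not alter the flags)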
.if rcx > rdx
.while 1
sub r9,128
vmovdqu8 zmm0,[rdx+r9]
vmovdqu8 zmm1,[rdx+r9+64]
vmovdqu8 [rcx+r9],zmm0
vmovdqu8 [rcx+r9+64],zmm1
.if ZERO?
.break
.endif
.endw
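; finally store the saved first and last 128 bytes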
vmovdqu8 [r10],zmm2
vmovdqu8 [r10+64],zmm3
vmovdqu8 [r10+r8-64],zmm4
vmovdqu8 [r10+r8-128],zmm5
ret
;db 13 dup(0x90)
.endif
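; otherwise the destination is at or below the source: copy the blocks forwards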
lea rcx,[rcx+r9]
lea rdx,[rdx+r9]
neg r9
.while 1
vmovdqu8 zmm0,[rdx+r9]
vmovdqu8 zmm1,[rdx+r9+64]
vmovdqu8 [rcx+r9],zmm0
vmovdqu8 [rcx+r9+64],zmm1
add r9,128
.if ZERO?
.break
.endif
.endw
.endif
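; store the saved head and tail; for counts of 129..256 these four stores
; do the whole copy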
vmovdqu8 [r10],zmm2
vmovdqu8 [r10+64],zmm3
vmovdqu8 [r10+r8-64],zmm4
vmovdqu8 [r10+r8-128],zmm5
ret
switchAVX512_64 endp
end
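For anyone who wants to try the routine from C on an AVX-512 machine, it can be declared like an ordinary memmove. The prototype and the test values below are my own assumptions, derived only from the register usage above (rcx = destination, rdx = source, r8 = count under the Windows x64 convention); they are not taken from the attached test program.

#include <stddef.h>
#include <stdio.h>

/* Assumed (hypothetical) prototype: dest in rcx, src in rdx, count in r8,
   matching the register usage of the routine above. */
extern void switchAVX512_64(void *dest, const void *src, size_t count);

int main(void)
{
    char buf[512];
    for (int i = 0; i < 512; i++)
        buf[i] = (char)i;

    /* Overlapping move with the destination above the source,
       which exercises the backward 128-byte loop. */
    switchAVX512_64(buf + 5, buf, 300);

    printf("%d %d\n", buf[5], buf[304]);  /* expected: 0 43 */
    return 0;
}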
I attach the test program (no source code at this time). The program runs only the tests that are compatible with the computer: people without AVX will run 5 tests, people with AVX 7 tests, and people with AVX-512 8 tests.
There is a lot to say, but only a few notes for now:
- I added the rep movsb, modified to support overlapping buffers. This modification makes it not competitive against the others.
- The AVX memmoves from nidud are normally faster.
- Agner Fog's memmove supports AVX and AVX-512 and has the advantage of falling back to older instruction sets when the system does not support AVX or above.
- memmove has decent performance and does not use AVX, only SSE.
- This test also includes apex-memmove, which claims to be the fastest memcpy/memmove on x86/x64 .. EVER, written in C:
https://www.codeproject.com/Articles/1110153/Apex-memmove-the-fastest-memcpy-memmove-on-x-x-EVE
It does not shine and does not live up to its claims.
I also attach the results of the test performed on the Xeon with AVX-512. The times are slow because of the conditions under which the test was run, so what is meaningful is how the various implementations stack up against one another.