As I said before, this routine is made SPECIFICALLY for UNALIGNED data;
that is why I use the MOVDQU instruction.
There is no reason to create a sophisticated algorithm for aligned data:
you can simply use the fastest aligned move instruction your machine supports, as in the loops below.
; 64-bit copy loop for ALIGNED data, 32 bytes per iteration through ymm4.
; rcx = destination pointer, rdx = source pointer, r8 = byte count.
; vmovdqa is the aligned move: with ymm operands both addresses must be
; 32-byte aligned, otherwise the instruction faults.
; r8>>=5 converts the byte count into a count of whole 32-byte blocks, so
; any remainder (count mod 32) is NOT copied by this loop.
; Overwrites rcx, rdx, r8 and ymm4.
; NOTE(review): if legacy (non-VEX) SSE code runs afterwards, a vzeroupper
; after the loop may be wanted - confirm against the surrounding code.
;r8 can contain sizeof(buffer)
.for (rcx=dest,rdx=src,r8=count,r8>>=5¦r8¦rcx+=32,rdx+=32,r8--)
; load one 32-byte block from the source ...
vmovdqa ymm4,[rdx]
; ... and store it to the destination
vmovdqa [rcx],ymm4
.endfor
; or, for 16-byte xmm:
; 64-bit copy loop for ALIGNED data, 16 bytes per iteration through xmm4.
; rcx = destination pointer, rdx = source pointer, r8 = byte count.
; movdqa is the aligned move: both addresses must be 16-byte aligned,
; otherwise the instruction faults.
; r8>>=4 converts the byte count into a count of whole 16-byte blocks, so
; any remainder (count mod 16) is NOT copied by this loop.
; Overwrites rcx, rdx, r8 and xmm4.
;r8 can contain sizeof(buffer)
.for (rcx=dest,rdx=src,r8=count,r8>>=4¦r8¦rcx+=16,rdx+=16,r8--)
; load one 16-byte block from the source ...
movdqa xmm4,[rdx]
; ... and store it to the destination
movdqa [rcx],xmm4
.endfor
; for a 32-bit machine:
; Same aligned 16-byte copy loop using 32-bit registers.
; ecx = destination pointer, edx = source pointer, eax = byte count.
; movdqa is the aligned move: both addresses must be 16-byte aligned,
; otherwise the instruction faults.
; eax>>=4 converts the byte count into a count of whole 16-byte blocks, so
; any remainder (count mod 16) is NOT copied by this loop.
; Overwrites ecx, edx, eax and xmm4.
;eax can contain sizeof(buffer)
.for (ecx=dest,edx=src,eax=count,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)
; load one 16-byte block from the source ...
movdqa xmm4,[edx]
; ... and store it to the destination
movdqa [ecx],xmm4
.endfor
; or, for JJ2007:
; The aligned 16-byte copy loop spelled out with explicit instructions and
; MASM-style .if/.repeat directives instead of .for.
;   ecx = destination pointer, edx = source pointer
;   eax = number of whole 16-byte blocks still to copy
; movdqa requires both addresses to be 16-byte aligned.
; Bytes beyond the last whole block (size mod 16) are not copied.
; Overwrites ecx, edx, eax and xmm4.
        mov     edx, src
        mov     ecx, dest
        mov     eax, sizeof(buffer)
        shr     eax, 4                  ; bytes -> whole 16-byte blocks
        .if (eax)                       ; skip entirely when there is no full block
            .repeat
                movdqa  xmm4, [edx]     ; fetch one block from the source
                add     edx, 16
                movdqa  [ecx], xmm4     ; write it to the destination
                add     ecx, 16
                dec     eax
            .until (eax == 0)
        .endif