This algorithm is suitable for copying large blocks of memory (aligned or not). It can also be used to copy long strings when you already know their length.
; memcpy_SSE: fast forward memory copy. It can also be used as a fast string
; copy when the string length is already known.
;
; Arguments:
;   pDest   = destination buffer that receives the copy
;   pSource = source buffer
;   Length  = number of bytes to copy from pSource
;
; How it works:
;   The bulk of the data is moved 16 bytes (128 bits = 4 dwords) per iteration
;   through XMM1 using unaligned loads/stores, so neither buffer needs any
;   particular alignment. The remaining 0 to 15 bytes are copied with rep movsb.
;
; Notes:
;   - Same copy semantics as memcpy from msvcrt.dll: the buffers must not
;     overlap (the copy always runs from low to high addresses).
;   - eax is listed in Uses, so it is restored on exit; unlike the C memcpy,
;     no value is returned in eax.
;   - XMM1 is overwritten.
;   - rep movsb copies upward only when the direction flag is clear, which is
;     assumed here (the usual state under the Win32 calling conventions).
Proc memcpy_SSE:
    Arguments @pDest, @pSource, @Length
    Uses esi, edi, ecx, eax

    mov edi D@pDest
    mov esi D@pSource
    mov ecx D@Length

    mov eax ecx | and eax 15    ; eax = tail byte count (Length mod 16): 0 to 15
    shr ecx 4                   ; ecx = number of whole 16-byte chunks (Length / 16)
    jz L2>                      ; fewer than 16 bytes: there is only a tail to copy

L1:
    ; Invariant: ecx = chunks still to copy (always > 0 here);
    ; esi / edi point to the next chunk to read / write.
    movupd XMM1 X$esi           ; load 16 bytes from the source (unaligned allowed)
    movupd X$edi XMM1           ; store them to the destination
    add esi 16 | add edi 16     ; step both pointers to the next chunk
    dec ecx
    jnz L1<

    ; No emms here: the loop only uses XMM registers, which do not alias the
    ; x87/MMX register stack. emms would mark every x87 register as empty and
    ; silently destroy any FPU values the caller still has live.

L2:
    ; Copy the 0 to 15 bytes that did not fill a whole chunk.
    ; With ecx = 0, rep movsb copies nothing.
    mov ecx eax
    rep movsb
EndP
Example of usage:
; Destination buffer for the example: 2048 bytes, all initialised to 0.
[OutputBuffer: B$ 0 #2048]
; esi = address of an inline, zero-terminated test string; it is used below as the source buffer (pSource).
mov esi {B$ "Hello, my name is g works as expected, since i´ tryoing to give a update of here. Hello, my name is guga, i´m 41 years old. Brazilian. I am testing this 128 bit operation to see if it works ok ? I hope works as expected, since i´ tryoing to give a update of here. Hello, my name is guga, i´m 41 years old. Brazilian. I am testing this 128 bit operation to see if it works ok ? I hope works as expected, since i´ tryoing to ", 0}
; eax = length of the string in bytes, not counting the terminating 0.
c_call 'msvcrt.strlen' esi
; Copy eax bytes from the string into OutputBuffer. The terminating 0 is not copied;
; the result is still zero-terminated only because OutputBuffer was zero-filled above.
call memcpy_SSE OutputBuffer, esi, eax