Next challenge on the way to enlightenment is:
What is the fastest way to move a 3-byte variable
into a dword variable/register?
And the reverse, of course.
Let's see what shows up this time :P
I bet Dave will enjoy this one. :lol:
mov eax,dword ptr var3bytes
and eax,0FFFFFFh
:P
Quote from: dedndave on January 21, 2013, 04:40:08 AM
mov eax,dword ptr var3bytes
and eax,0FFFFFFh
:P
Good idea Dave, I like it. :t
The reverse operation is still missing. Dave, give us the light.
I'm very curious to see if more solutions come up & what kind
of bit-imagination assembly programmers have developed :lol:
Quote from: Siekmanski on January 21, 2013, 07:30:41 AM
24bit to 32 bit,
* code modified,
.data
; 24-bit -> 32-bit expansion. Requires SSSE3 (PSHUFB), despite the "SSE3" in
; the mask name. The low 12 bytes of each 16-byte load hold four packed 3-byte
; values; one PSHUFB spreads them into four dwords whose top byte is zero.
align 16
Buffer32            db 32*1024 dup (0)      ; destination, 16-byte aligned for the MOVDQA stores
; No longer used by the loop below (PSHUFB does the zeroing itself); retained
; only so that any existing reference to the symbol still assembles.
ByteAndMask32       db -1,-1,-1,0,-1,-1,-1,0,-1,-1,-1,0,-1,-1,-1,0
; PSHUFB control bytes: 0..11 pick a source byte; a control byte with bit 7
; set (80h) forces that destination byte to zero, so the fourth byte of every
; dword is cleared by the shuffle and no separate PAND is needed.
ByteMask24BitSSE3   db 0,1,2,80h, 3,4,5,80h, 6,7,8,80h, 9,10,11,80h
Bytes24bit          db 1,2,3,4,5,6,7,8,9,10,11,12 ; etc.
; MOVDQU always reads 16 bytes although only 12 are payload: keep at least
; 4 readable bytes after the last packed triple (mirrors the +4 on Buffer24
; in the reverse routine).
                    db 4 dup (0)
.code
    lea     eax, ByteMask24BitSSE3
    movdqa  xmm1, [eax]             ; xmm1 = shuffle control, loop-invariant
    lea     esi, Bytes24bit         ; esi -> packed 24-bit source
    lea     edi, Buffer32           ; edi -> dword destination
;   mov     ecx, 1024               ; pass count for the (disabled) loop below
align 16
conversion_loop:
    movdqu  xmm0, [esi]             ; unaligned load: source is stepped in 12-byte units
    pshufb  xmm0, xmm1              ; 4 x 3 bytes -> 4 x dword, high byte forced to 0
    movdqa  [edi], xmm0
;   movdqu  xmm0, [esi+12]
;   pshufb  xmm0, xmm1
;   movdqa  [edi+16], xmm0
;   add     esi, 24
;   add     edi, 32
;   dec     ecx
;   jnz     conversion_loop
1,2,3,4,5,6,7,8,9,10,11,12
results in:
00030201h 00060504h 00090807h 000c0b0ah
Quote from: Siekmanski on January 21, 2013, 07:43:45 AM
.data
; 24-bit -> 32-bit expansion. Requires SSSE3 (PSHUFB), despite the "SSE3" in
; the mask name. The low 12 bytes of each 16-byte load hold four packed 3-byte
; values; one PSHUFB spreads them into four dwords whose top byte is zero.
align 16
Buffer32            db 32*1024 dup (0)      ; destination, 16-byte aligned for the MOVDQA stores
; No longer used by the loop below (PSHUFB does the zeroing itself); retained
; only so that any existing reference to the symbol still assembles.
ByteAndMask32       db -1,-1,-1,0,-1,-1,-1,0,-1,-1,-1,0,-1,-1,-1,0
; PSHUFB control bytes: 0..11 pick a source byte; a control byte with bit 7
; set (80h) forces that destination byte to zero, so the fourth byte of every
; dword is cleared by the shuffle and no separate PAND is needed.
ByteMask24BitSSE3   db 0,1,2,80h, 3,4,5,80h, 6,7,8,80h, 9,10,11,80h
Bytes24bit          db 1,2,3,4,5,6,7,8,9,10,11,12 ; etc.
; MOVDQU always reads 16 bytes although only 12 are payload: keep at least
; 4 readable bytes after the last packed triple (mirrors the +4 on Buffer24
; in the reverse routine).
                    db 4 dup (0)
.code
    lea     eax, ByteMask24BitSSE3
    movdqa  xmm1, [eax]             ; xmm1 = shuffle control, loop-invariant
    lea     esi, Bytes24bit         ; esi -> packed 24-bit source
    lea     edi, Buffer32           ; edi -> dword destination
;   mov     ecx, 1024               ; pass count for the (disabled) loop below
align 16
conversion_loop:
    movdqu  xmm0, [esi]             ; unaligned load: source is stepped in 12-byte units
    pshufb  xmm0, xmm1              ; 4 x 3 bytes -> 4 x dword, high byte forced to 0
    movdqa  [edi], xmm0
;   movdqu  xmm0, [esi+12]
;   pshufb  xmm0, xmm1
;   movdqa  [edi+16], xmm0
;   add     esi, 24
;   add     edi, 32
;   dec     ecx                     ; DEC takes a single operand ("dec ecx,1" does not assemble)
;   jnz     conversion_loop
1,2,3,4,5,6,7,8,9,10,11,12
results in:
00030201h 00060504h 00090807h 000c0b0ah
Very interesting Siekmanski, I want to test it to see the performance
of SSE code against traditional 32 bit code. :t
32bit to 24 bit,
:biggrin:
.data
; 32-bit -> 24-bit packing (PSHUFB, so SSSE3 is required).
align 16
Bytes32bit          dd 00030201h, 00060504h, 00090807h, 000c0b0ah ; etc.
; Control bytes 0..11 gather the low three bytes of each dword; the last four
; lanes just replicate source byte 3 as filler.
ByteMask24BitSSE3_2 db 0,1,2, 4,5,6, 8,9,10, 12,13,14, 3,3,3,3
; +4: every store writes 16 bytes while only 12 of them are packed output.
Buffer24            db 24*1024+4 dup (0)
.code
    mov     eax, offset ByteMask24BitSSE3_2
    movdqa  xmm1, [eax]             ; xmm1 = shuffle control, loop-invariant
    mov     esi, offset Bytes32bit  ; esi -> dword source (16-byte aligned)
    mov     edi, offset Buffer24    ; edi -> packed destination
;   mov     ecx, 1024               ; pass count for the (disabled) loop below
align 16
conversion_loop2:
    movdqa  xmm0, [esi]
    pshufb  xmm0, xmm1              ; 4 x dword -> 12 packed bytes + 4 filler bytes
    movdqu  [edi], xmm0             ; unaligned: destination is stepped in 12-byte units
;   movdqa  xmm0, [esi+16]
;   pshufb  xmm0, xmm1
;   movdqu  [edi+12], xmm0          ; overwrites the previous store's 4 filler bytes
;   add     esi, 32
;   add     edi, 24
;   dec     ecx
;   jnz     conversion_loop2
00030201h 00060504h 00090807h 000c0b0ah
results in:
1,2,3,4,5,6,7,8,9,10,11,12
Quote from: Siekmanski on January 21, 2013, 09:05:40 AM
32bit to 24 bit,
:biggrin:
.data
; 32-bit -> 24-bit packing (PSHUFB, so SSSE3 is required).
align 16
Bytes32bit          dd 00030201h, 00060504h, 00090807h, 000c0b0ah ; etc.
; Control bytes 0..11 gather the low three bytes of each dword; the last four
; lanes just replicate source byte 3 as filler.
ByteMask24BitSSE3_2 db 0,1,2, 4,5,6, 8,9,10, 12,13,14, 3,3,3,3
; +4: every store writes 16 bytes while only 12 of them are packed output.
Buffer24            db 24*1024+4 dup (0)
.code
    mov     eax, offset ByteMask24BitSSE3_2
    movdqa  xmm1, [eax]             ; xmm1 = shuffle control, loop-invariant
    mov     esi, offset Bytes32bit  ; esi -> dword source (16-byte aligned)
    mov     edi, offset Buffer24    ; edi -> packed destination
;   mov     ecx, 1024               ; pass count for the (disabled) loop below
align 16
conversion_loop2:
    movdqa  xmm0, [esi]
    pshufb  xmm0, xmm1              ; 4 x dword -> 12 packed bytes + 4 filler bytes
    movdqu  [edi], xmm0             ; unaligned: destination is stepped in 12-byte units
;   movdqa  xmm0, [esi+16]
;   pshufb  xmm0, xmm1
;   movdqu  [edi+12], xmm0          ; overwrites the previous store's 4 filler bytes
;   add     esi, 32
;   add     edi, 24
;   dec     ecx                     ; DEC takes a single operand ("dec ecx,1" does not assemble)
;   jnz     conversion_loop2
00030201h 00060504h 00090807h 000c0b0ah
results in:
1,2,3,4,5,6,7,8,9,10,11,12
Yes, this was the missing part. Very nice. I'll test them ASAP. :t
:icon_redface: Made a stupid mistake,
dec ecx,1
must be: dec ecx
sources modified.... :biggrin:
To get faster results, unroll the conversion loop 3 times so that it still fits in one L1 cache line (64 bytes).
This 24 bit to 32 bit routine is now 60 bytes long and fits in a single cache line.
( The 32 bit to 24 bit routine unrolled 3 times is 65 bytes, so it is 1 byte too big to fit in one cache line.)
; 3x unrolled 32-bit -> 24-bit packing loop.
; In:  esi -> 16-byte-aligned dword source, edi -> packed destination,
;      xmm1 = PSHUFB control mask (all set up by the caller).
; Each pass consumes 48 source bytes and advances the destination by 36.
    mov     ecx, 128                ; number of passes
align 16
conversion_loop2:
    ; block 0
    movdqa  xmm0, [esi]
    pshufb  xmm0, xmm1
    movdqu  [edi], xmm0
    ; block 1 - lands 12 bytes further on, over block 0's 4 filler bytes
    movdqa  xmm0, [esi+16]
    pshufb  xmm0, xmm1
    movdqu  [edi+12], xmm0
    ; block 2
    movdqa  xmm0, [esi+32]
    pshufb  xmm0, xmm1
    movdqu  [edi+24], xmm0          ; 16-byte store ends at edi+40, 4 bytes past this pass's 36
    add     esi, 48
    add     edi, 36
    dec     ecx
    jnz     conversion_loop2
Quote from: Siekmanski on January 21, 2013, 10:02:06 AM
To get faster results, unroll the conversion loop 3 times so that it still fits in one L1 cache line (64 bytes).
This 24 bit to 32 bit routine is now 60 bytes long and fits in a single cache line.
( The 32 bit to 24 bit routine unrolled 3 times is 65 bytes, so it is 1 byte too big to fit in one cache line.)
; 3x unrolled 32-bit -> 24-bit packing loop.
; In:  esi -> 16-byte-aligned dword source, edi -> packed destination,
;      xmm1 = PSHUFB control mask (all set up by the caller).
; Each pass consumes 48 source bytes and advances the destination by 36.
    mov     ecx, 128                ; number of passes
align 16
conversion_loop2:
    ; block 0
    movdqa  xmm0, [esi]
    pshufb  xmm0, xmm1
    movdqu  [edi], xmm0
    ; block 1 - lands 12 bytes further on, over block 0's 4 filler bytes
    movdqa  xmm0, [esi+16]
    pshufb  xmm0, xmm1
    movdqu  [edi+12], xmm0
    ; block 2
    movdqa  xmm0, [esi+32]
    pshufb  xmm0, xmm1
    movdqu  [edi+24], xmm0          ; 16-byte store ends at edi+40, 4 bytes past this pass's 36
    add     esi, 48
    add     edi, 36
    dec     ecx
    jnz     conversion_loop2
I'm preparing the test program, but I'm not sure it'll be ready soon.
It is night and I'm almost sleeping. :dazzled:
This is the structure of the test program:
;==============================================================================
; Test_mov3todw.asm
; ------------------------------------------------------------------------
; Example to test instructions that move 3-byte variables into dwords.
; The test uses a 48-byte string that is read 3 bytes at a time into 16 dwords.
; ------------------------------------------------------------------------
; Frktons 20-jan-2013 @Masm32 Forum
;==============================================================================
include \masm32\include\masm32rt.inc
;==============================================================================
.nolist
.686
.xmm
include \masm32\macros\timers.asm
; get them from the
;[url=http://www.masm32.com/board/index.php?topic=770.0]Masm32 Laboratory[/url]
AxCPUid_Print PROTO
LOOP_COUNT EQU 1000
include \masm32\include\MyLib.inc
;==============================================================================
.data
; --- Initialised data for the 3-byte -> dword timing test --------------------
align 16
; Source: 48 characters followed by a zero terminator (49 bytes in total).
Area DB "Here it is a string with 48 characters inside me",0
; Assemble-time length of Area without its terminating zero.
AreaLn = ($ - Area - 1)
; NOTE(review): "Four" is not defined in the visible source - presumably an
; EQU for 4 supplied by MyLib.inc; confirm.
align Four
; AreaLen and Counter are not referenced by any code visible here.
AreaLen dd 0
Counter dd 0
; Pointers loaded by MovProc / DisplayArrayDW instead of taking offsets inline.
PtrSource dd Area
PtrDest dd ArrayDW
align Four
; 72 dashes used as a separator line; the trailing zero bytes terminate it.
LineSep db 72 dup("-"),0,0,0,0
align Four
PtrLineSep dd LineSep
.data?
; --- Uninitialised data ------------------------------------------------------
align 16
; Destination: one dword per 3-byte group of Area (16 x 3 = 48 bytes).
ArrayDW dd 16 DUP (?)
align Four
CPU_Count DD ? ; Number of Cycles elapsed
.code
;==============================================================================
align Four
; MovProc - "MOV 4 bytes / AND" method.
; Repeats 1000 times: walk the source at PtrSource in 3-byte steps, load a
; full dword each time, clear its top byte and store it to the dword array at
; PtrDest (16 elements per pass).
; In:    nothing (pointers are read from PtrSource / PtrDest)
; Out:   16 dwords at PtrDest, each holding 3 source bytes and a zero top byte
; Clobb: eax, ecx, edx, esi, edi, flags
; NOTE(review): esi/edi are not saved here; fine for the caller shown in this
; file, but confirm before calling from code that expects them preserved.
MovProc proc
    mov     edx, 1000               ; outer repetitions of the whole pass
    align Four
outer_pass:
    mov     esi, PtrSource          ; restart at the beginning of the source
    mov     edi, PtrDest
    mov     ecx, 16                 ; dwords produced per pass
    align Four
next_triplet:
    mov     eax, [esi]              ; reads 4 bytes; the 4th belongs to the next group
    and     eax, 0FFFFFFh           ; keep only the low 3 bytes
    mov     [edi], eax
    add     esi, 3
    add     edi, Four
    dec     ecx
    jnz     next_triplet
    dec     edx
    jnz     outer_pass
    ret
MovProc endp
;==============================================================================
align Four
; DisplayArrayDW - print the 16 dwords at PtrDest, one after another.
; Each dword's address is handed to the print macro, so an element is shown as
; the text it contains up to its first zero byte (after MovProc that is the
; three source characters - the top byte was masked to zero).
; Clobb: ecx, edx, flags
DisplayArrayDW proc
    xor     ecx, ecx                ; element index = 0
    mov     edx, PtrDest            ; edx -> current element
show_next:
    pushad                          ; keep ecx/edx intact across the print macro
    print DWORD PTR edx
    popad
    add     edx, Four
    inc     ecx
    cmp     ecx, 16
    jnz     show_next
    ret
DisplayArrayDW endp
;==============================================================================
align Four
; Main - console setup, CPU identification banner, then four timed runs of
; MovProc with the cycle count printed after each run.
; Tsep, FillMyArray, FillMyArray0, ConsoleSize, PtrFmtNum16, InitString and
; FormatNumDW are not defined in the visible source (presumably MyLib.inc).
Main proc
; Fetch the user's thousands separator into Tsep and convert it to the OEM
; code page in place.
invoke GetLocaleInfo,LOCALE_USER_DEFAULT,LOCALE_STHOUSAND,offset Tsep,Four
invoke CharToOem,offset Tsep,offset Tsep
CALL FillMyArray
CALL FillMyArray0
INVOKE ConsoleSize, 40, 100
print PtrLineSep, 13, 10
invoke AxCPUid_Print
print PtrLineSep, 13, 10
; Assemble-time repetition: the timing block below is emitted "Four" times.
REPEAT Four
;---------------------------------------------------------------------------------
invoke Sleep, 100
; counter_begin/counter_end come from timers.asm and bracket the code under
; test, repeating it LOOP_COUNT times at the requested priority class.
counter_begin LOOP_COUNT, HIGH_PRIORITY_CLASS
CALL MovProc
counter_end
; Copy 16 bytes from InitString to the buffer at PtrFmtNum16 (both must be
; 16-byte aligned for MOVDQA). eax is left untouched by these four
; instructions, so FormatNumDW receives whatever counter_end left in it -
; presumably the measured cycle count.
mov edi, PtrFmtNum16
lea esi, InitString
movdqa xmm0, [esi]
movdqa [edi], xmm0
INVOKE FormatNumDW, eax, PtrFmtNum16
print PtrFmtNum16, 9, "cycles for Dave - MOV 4 bytes / AND", 13, 10
;---------------------------------------------------------------------------------
print PtrLineSep, 13, 10
ENDM
; Optional dump of the converted array, currently disabled.
; CALL DisplayArrayDW
ret
Main endp
;-------------------------------------------------------------
include AxCPUid.inc
;-------------------------------------------------------------
;==============================================================================
; Program entry point (named by the END directive below): run the tests, wait
; for a key press via the inkey macro, then terminate via the exit macro.
start:
;==============================================================================
;==============================================================================
call Main
inkey
exit
;==============================================================================
end start
If you are still awake, try to adapt your code for the task.
Attached the files you need.
Frank
Inserted my routines.
------------------------------------------------------------------------
Intel(R) Core(TM)2 Quad CPU Q6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
49.992 cycles for Dave - 48 bytes MOV 4 bytes / AND
------------------------------------------------------------------------
48.133 cycles for Dave - 48 bytes MOV 4 bytes / AND
------------------------------------------------------------------------
48.143 cycles for Dave - 48 bytes MOV 4 bytes / AND
------------------------------------------------------------------------
48.144 cycles for Dave - 48 bytes MOV 4 bytes / AND
------------------------------------------------------------------------
23.103 cycles for Siekmanski - 48 bytes SSSE3_24_32
------------------------------------------------------------------------
23.152 cycles for Siekmanski - 48 bytes SSSE3_24_32
------------------------------------------------------------------------
23.137 cycles for Siekmanski - 48 bytes SSSE3_24_32
------------------------------------------------------------------------
23.136 cycles for Siekmanski - 48 bytes SSSE3_24_32
------------------------------------------------------------------------
20.138 cycles for Siekmanski - 48 bytes SSSE3_24_32 unroled
------------------------------------------------------------------------
20.124 cycles for Siekmanski - 48 bytes SSSE3_24_32 unroled
------------------------------------------------------------------------
20.130 cycles for Siekmanski - 48 bytes SSSE3_24_32 unroled
------------------------------------------------------------------------
20.137 cycles for Siekmanski - 48 bytes SSSE3_24_32 unroled
------------------------------------------------------------------------
23.137 cycles for Siekmanski - 48 bytes SSSE3_32_24
------------------------------------------------------------------------
23.126 cycles for Siekmanski - 48 bytes SSSE3_32_24
------------------------------------------------------------------------
23.137 cycles for Siekmanski - 48 bytes SSSE3_32_24
------------------------------------------------------------------------
23.137 cycles for Siekmanski - 48 bytes SSSE3_32_24
------------------------------------------------------------------------
19.139 cycles for Siekmanski - 48 bytes SSSE3_32_24 unroled
------------------------------------------------------------------------
19.130 cycles for Siekmanski - 48 bytes SSSE3_32_24 unroled
------------------------------------------------------------------------
19.138 cycles for Siekmanski - 48 bytes SSSE3_32_24 unroled
------------------------------------------------------------------------
19.138 cycles for Siekmanski - 48 bytes SSSE3_32_24 unroled
------------------------------------------------------------------------
I looked at some of this code and noticed some new instructions that I had never seen before, and then it dawned on me, I don't have the manuals for my new quad A8 3520M cpu, I only have the manuals for my old dual core system.
Does anyone have a good link to AMD to get the CORRECT manuals for this CPU?
Dave.
Try here: http://developer.amd.com/resources/documentation-articles/developer-guides-manuals/
i have a prescott that supports SSE3, Marinus
crashes at PSHUFB XMM0,XMM1 :P
87,665 cycles for the first test, though
Quote from: sinsi on January 21, 2013, 02:44:47 PM
Try here: http://developer.amd.com/resources/documentation-articles/developer-guides-manuals/
Sinsi,
Thank you, exactly what I was looking for.
Dave.
My results so far:
Quote
------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
72,205 cycles for Dave - 48 bytes MOV 4 bytes / AND
72,142 cycles for Dave - 48 bytes MOV 4 bytes / AND
72,138 cycles for Dave - 48 bytes MOV 4 bytes / AND
------------------------------------------------------------------------
34,586 cycles for Siekmanski - 48 bytes SSSE3_24_32
34,571 cycles for Siekmanski - 48 bytes SSSE3_24_32
34,575 cycles for Siekmanski - 48 bytes SSSE3_24_32
------------------------------------------------------------------------
30,106 cycles for Siekmanski - 48 bytes SSSE3_24_32 unrolled
30,067 cycles for Siekmanski - 48 bytes SSSE3_24_32 unrolled
30,111 cycles for Siekmanski - 48 bytes SSSE3_24_32 unrolled
------------------------------------------------------------------------
34,651 cycles for Siekmanski - 48 bytes SSSE3_32_24
34,758 cycles for Siekmanski - 48 bytes SSSE3_32_24
34,736 cycles for Siekmanski - 48 bytes SSSE3_32_24
------------------------------------------------------------------------
28,661 cycles for Siekmanski - 48 bytes SSSE3_32_24 unrolled
28,645 cycles for Siekmanski - 48 bytes SSSE3_32_24 unrolled
28,637 cycles for Siekmanski - 48 bytes SSSE3_32_24 unrolled
------------------------------------------------------------------------
I slightly modified the source to avoid having too many separator lines.
Siekmanski is apparently new on the forum and not used to our "traditional"
way of doing tests. :lol:
Compliments Siekmanski, you did a very good job. :t
Waiting for our masters' versions, though. :P
Attached the modified source.
Quote from: dedndave on January 21, 2013, 02:55:21 PM
i have a prescott that supports SSE3, Marinus
crashes at PSHUFB XMM0,XMM1 :P
87,665 cycles for the first test, though
You need one more "S" in your SSE level, otherwise no PSHUFB for you.
Somebody messed up my string :eusa_naughty:
----------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
----------------------------------------------------------------------------
48,058 cycles for Dave - 48 bytes MOV 4 bytes / AND
48,069 cycles for Dave - 48 bytes MOV 4 bytes / AND
48,036 cycles for Dave - 48 bytes MOV 4 bytes / AND
----------------------------------------------------------------------------
54,914 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
58,549 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
60,832 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
----------------------------------------------------------------------------
55,680 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
55,653 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
55,674 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
----------------------------------------------------------------------------
34,575 cycles for Siekmanski - 48 bytes SSSE3_24_32
34,625 cycles for Siekmanski - 48 bytes SSSE3_24_32
34,540 cycles for Siekmanski - 48 bytes SSSE3_24_32
----------------------------------------------------------------------------
30,030 cycles for Siekmanski - 48 bytes SSSE3_24_32 unrolled
30,036 cycles for Siekmanski - 48 bytes SSSE3_24_32 unrolled
30,037 cycles for Siekmanski - 48 bytes SSSE3_24_32 unrolled
----------------------------------------------------------------------------
34,534 cycles for Siekmanski - 48 bytes SSSE3_32_24
34,533 cycles for Siekmanski - 48 bytes SSSE3_32_24
34,575 cycles for Siekmanski - 48 bytes SSSE3_32_24
----------------------------------------------------------------------------
28,543 cycles for Siekmanski - 48 bytes SSSE3_32_24 unrolled
28,533 cycles for Siekmanski - 48 bytes SSSE3_32_24 unrolled
28,538 cycles for Siekmanski - 48 bytes SSSE3_32_24 unrolled
----------------------------------------------------------------------------
Here it is a string with 48 characters inside me
Her itis stingwit 48chaactrs nsie meeeearaitis stingwit 48chaactrs nsie meeeeara stingwit 48chaac
trs nsie meeeearaingwit 48chaactrs nsie meeeearait 48chaactrs nsie meeeeara8chaactrs nsie meeeearaac
trs nsie meeeearas nsie meeeearaie meeeearaeeeearaaracte
Was it Mr. Siekmanski? ::)
Well Siekmanski, this report can help you find the bugs:
----------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
----------------------------------------------------------------------------
48,125 cycles for Dave - 48 bytes MOV 4 bytes / AND
48,093 cycles for Dave - 48 bytes MOV 4 bytes / AND
48,180 cycles for Dave - 48 bytes MOV 4 bytes / AND
Destination data Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
54,307 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
57,137 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
57,148 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
Destination data Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
54,605 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
44,780 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
55,807 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
Destination data Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
34,573 cycles for Siekmanski - 48 bytes SSSE3_24_32
34,576 cycles for Siekmanski - 48 bytes SSSE3_24_32
34,565 cycles for Siekmanski - 48 bytes SSSE3_24_32
Destination data strinaracte
----------------------------------------------------------------------------
30,059 cycles for Siekmanski - 48 bytes SSSE3_24_32 unrolled
30,071 cycles for Siekmanski - 48 bytes SSSE3_24_32 unrolled
30,095 cycles for Siekmanski - 48 bytes SSSE3_24_32 unrolled
Destination data aracte
----------------------------------------------------------------------------
34,573 cycles for Siekmanski - 48 bytes SSSE3_32_24
34,611 cycles for Siekmanski - 48 bytes SSSE3_32_24
34,572 cycles for Siekmanski - 48 bytes SSSE3_32_24
Destination data HHHHHHHHHHHHiiiiiiiiiiiiaaaaaaaaaaaaaaaaaraHHHHHHHHiiiiiiiiiiiiaaaaaaaaaaaaaaaaara
HHHHiiiiiiiiiiiiaaaaaaaaaaaaaaaaaraiiiiiiiiiiiiaaaaaaaaaaaaaaaaaraiiiiiiiiaaaaaaaaaaaaaaaaaraiiiiaaa
aaaaaaaaaaaaaaraaaaaaaaaaaaaaaaaaraaaaaaaaaaaaaaraaaaaaaaaaraaaaaaraaracte
----------------------------------------------------------------------------
28,563 cycles for Siekmanski - 48 bytes SSSE3_32_24 unrolled
28,565 cycles for Siekmanski - 48 bytes SSSE3_32_24 unrolled
28,603 cycles for Siekmanski - 48 bytes SSSE3_32_24 unrolled
Destination data Her itis stingwit 48chaactrs nsie meeeearaitis stingwit 48chaactrs nsie meeeeara
stingwit 48chaactrs nsie meeeearaingwit 48chaactrs nsie meeeearait 48chaactrs nsie meeeeara8chaact
rs nsie meeeearaactrs nsie meeeearas nsie meeeearaie meeeearaeeeearaaracte
----------------------------------------------------------------------------
Attached the program to test the correctness of destination data.
mov eax,[esi]
and eax,0FFFFFFh
AND is dependent on MOV being completed
it would help if you could put a "do something else" instruction in there
it would also help if 0FFFFFFh is in a register
Quote from: dedndave on January 22, 2013, 02:50:45 AM
mov eax,[esi]
and eax,0FFFFFFh
AND is dependent on MOV being completed
it would help if you could put a "do something else" instruction in there
it would also help if 0FFFFFFh is in a register
Well, that's right, why don't you use the keyboard to do that? :P
It's not easy to understand someone else's code :lol:
I made it work OK now and exchanged the in and output.
Quote from: Siekmanski on January 22, 2013, 04:49:37 AM
It's not easy to understand someone else's code :lol:
I made it work OK now and exchanged the in and output.
Grab the code I posted in previous post and test your code, something is
still buggy, as it appears by the output.
Added my routine with XMM/SSE instructions. It requires a SSSE3
capable machine.
Quote
----------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
----------------------------------------------------------------------------
70,248 cycles for Dave - 48 bytes MOV 4 bytes / AND
48,078 cycles for Dave - 48 bytes MOV 4 bytes / AND
48,171 cycles for Dave - 48 bytes MOV 4 bytes / AND
Destination data:Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
50,346 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
59,081 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
56,535 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
Destination data:Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
47,055 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
57,040 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
57,420 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
Destination data:Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
25,662 cycles for Frank - 48 bytes XMM/SSSE3 - Unrolled
38,100 cycles for Frank - 48 bytes XMM/SSSE3 - Unrolled
38,110 cycles for Frank - 48 bytes XMM/SSSE3 - Unrolled
Destination data:Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
----------------------------------------------------------------------------
I skipped Siekmanski's code for the time being, waiting for his
tests with the attached program.
Frank
And the last entry:
Quote
----------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
----------------------------------------------------------------------------
50,769 cycles for Dave - 48 bytes MOV 4 bytes / AND
51,653 cycles for Dave - 48 bytes MOV 4 bytes / AND
52,787 cycles for Dave - 48 bytes MOV 4 bytes / AND
Destination data:Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
43,982 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
43,484 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
44,191 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled
Destination data:Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
44,852 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
48,817 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
52,075 cycles for Dave - 48 bytes MOV 4 bytes / AND - Unrolled 2
Destination data:Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
30,254 cycles for Frank - 48 bytes XMM/SSSE3 - Unrolled
25,419 cycles for Frank - 48 bytes XMM/SSSE3 - Unrolled
29,016 cycles for Frank - 48 bytes XMM/SSSE3 - Unrolled
Destination data:Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
25,407 cycles for Frank - 48 bytes XMM/SSSE3 - Unrolled II
28,221 cycles for Frank - 48 bytes XMM/SSSE3 - Unrolled II
28,249 cycles for Frank - 48 bytes XMM/SSSE3 - Unrolled II
Destination data:Here it is a string with 48 characters inside me
----------------------------------------------------------------------------
----------------------------------------------------------------------------