News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

Sorting strings

Started by RuiLoureiro, May 29, 2014, 06:15:48 AM

Previous topic - Next topic

nidud

#105
deleted

dedndave

i would modify that code so that it aligns itself, first
should be faster for large blocks   :t

prescott w/htt
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
------------------------------------------------------
47730      cycles - a   1..256  (  0) crt_memset
131030     cycles - a   1..256  ( 22) stosb
41845      cycles - a   1..256  ( 67) memzero
54881      cycles - a   1..256  ( 80) ZeroMem_SSE

55975      cycles - u   1..256  (  0) crt_memset
148928     cycles - u   1..256  ( 22) stosb
62356      cycles - u   1..256  ( 67) memzero
77576      cycles - u   1..256  ( 80) ZeroMem_SSE

13075466   cycles - a 400..8192 (  0) crt_memset
14596370   cycles - a 400..8192 ( 22) stosb
12471946   cycles - a 400..8192 ( 67) memzero
23698809   cycles - a 400..8192 ( 80) ZeroMem_SSE

24246850   cycles - u 400..8192 (  0) crt_memset
94611193   cycles - u 400..8192 ( 22) stosb
22922458   cycles - u 400..8192 ( 67) memzero
71302961   cycles - u 400..8192 ( 80) ZeroMem_SSE

jj2007

Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3)
------------------------------------------------------
25333      cycles - a   1..256  (  0) crt_memcpy
23268      cycles - a   1..256  ( 98) memcpy
23663      cycles - a   1..256  (157) memcpy SSE2
24829      cycles - a   1..256  ( 89) regcopy
24889      cycles - a   1..256  (172) memcpyxmmU SSE
34273      cycles - a   1..256  ( 87) memcpy_SSE_V2
34119      cycles - a   1..256  ( 84) memcpy_SSE_V4

94394      cycles - u   1..256  (  0) crt_memcpy
69785      cycles - u   1..256  ( 98) memcpy
47876      cycles - u   1..256  (157) memcpy SSE2
56750      cycles - u   1..256  ( 89) regcopy
45682      cycles - u   1..256  (172) memcpyxmmU SSE
54575      cycles - u   1..256  ( 87) memcpy_SSE_V2
54696      cycles - u   1..256  ( 84) memcpy_SSE_V4

2127610    cycles - a 400..4000 (  0) crt_memcpy
2070954    cycles - a 400..4000 ( 98) memcpy
3257495    cycles - a 400..4000 (157) memcpy SSE2
2717699    cycles - a 400..4000 ( 89) regcopy
3279839    cycles - a 400..4000 (172) memcpyxmmU SSE
3782636    cycles - a 400..4000 ( 87) memcpy_SSE_V2
3785678    cycles - a 400..4000 ( 84) memcpy_SSE_V4

20412269   cycles - u 400..4000 (  0) crt_memcpy
12605121   cycles - u 400..4000 ( 98) memcpy
5820491    cycles - u 400..4000 (157) memcpy SSE2
8852658    cycles - u 400..4000 ( 89) regcopy
4978994    cycles - u 400..4000 (172) memcpyxmmU SSE
7896637    cycles - u 400..4000 ( 87) memcpy_SSE_V2
7894145    cycles - u 400..4000 ( 84) memcpy_SSE_V4

guga

This ?




; ZeroMem_SSE - fill a memory block with zero bytes (RosAsm syntax).
;
; Arguments:
;   @pMem    address of the first byte to clear. No alignment is required, the
;            16-byte stores are done with MOVDQU.
;   @Length  number of bytes to clear. 0 is allowed and stores nothing.
;
; Returns nothing. edi, ecx and eax are saved and restored through Uses.
; XMM1 is overwritten with zero and is not restored.
Proc ZeroMem_SSE:
    Arguments @pMem, @Length
    Uses edi, ecx, eax

    mov edi D@pMem
    mov ecx D@Length
    mov eax ecx | and eax 15    ; eax = Length mod 16: tail bytes left after the 16-byte stores
    shr ecx 4                   ; ecx = Length / 16: number of 16-byte (128-bit) stores
    jz L2>                      ; fewer than 16 bytes: only the tail loop has work to do

        PREFETCHNTA B$edi
        pxor XMM1 XMM1          ; XMM1 = 16 zero bytes
        align 16
        L1:
            movdqu X$edi XMM1   ; clear 16 bytes; unaligned store, so any address works
            add edi 16
            dec ecx | jnz L1<

L2:
    ; Here edi points at the first byte not yet cleared and eax (0 to 15) is the
    ; number of bytes still to clear.
    test eax eax | jz L4>       ; No remainder ? Exit

L3:  mov B$edi+eax-1 0 | dec eax | jnz L3<   ; clear the tail from its last byte down to its first

L4:

EndP

Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

dedndave

that's GoAsm syntax, Gustavo
Dave <--- Masm guy   :biggrin:

guga

Hmm... I don't remember the proper syntax for MASM.

But it should be something like:


; ZeroMem_SSE - fill a memory block with zero bytes (MASM syntax).
;
; Parameters:
;   pMem      address of the first byte to clear. No alignment is required,
;             the 16-byte stores are done with MOVDQU.
;   cbLength  number of bytes to clear. 0 is allowed and stores nothing.
;             (Named cbLength because LENGTH is a MASM operator and cannot be
;             used as a parameter name.)
;
; Returns nothing. "uses" plays the role of the RosAsm Uses macro: it pushes
; the listed registers on entry and pops them before the ret. XMM1 is
; overwritten with zero and is not restored.
; The SSE2 instructions need the .xmm processor directive to be in effect.
ZeroMem_SSE proc uses edi ecx eax pMem:DWORD, cbLength:DWORD

    mov     edi, pMem
    mov     ecx, cbLength
    mov     eax, ecx
    and     eax, 15                 ; eax = cbLength mod 16: tail bytes left after the 16-byte stores
    shr     ecx, 4                  ; ecx = cbLength / 16: number of 16-byte (128-bit) stores
    jz      L2                      ; fewer than 16 bytes: only the tail loop has work to do

    prefetchnta byte ptr [edi]
    pxor    xmm1, xmm1              ; xmm1 = 16 zero bytes
    align 16
L1:
    movdqu  [edi], xmm1             ; clear 16 bytes; unaligned store, so any address works
    add     edi, 16
    dec     ecx
    jnz     L1

L2:
    ; Here edi points at the first byte not yet cleared and eax (0 to 15) is
    ; the number of bytes still to clear.
    test    eax, eax
    jz      L4                      ; no remainder: exit

L3:
    mov     byte ptr [edi+eax-1], 0 ; clear the tail from its last byte down to its first
    dec     eax
    jnz     L3

L4:
    ret                             ; MASM does not emit the return for us at endp

ZeroMem_SSE endp


Although it has been some years since I last coded using MASM syntax, the two are basically not that different (except for the macro usage). The main difference I see is that in RosAsm we must write the size directive explicitly, which makes it easier to "see" what is a dword, a word, a byte, etc.
D$ = dword ptr etc
W$ = word ptr etc
B$ = byte ptr
X$ = any type size used mainly in SSE
T$ = terabyte ptr
F$ = floating ptr
R$ = real floating ptr
Q$ = quadword ptr

Those are the main differences.

The rest is basically from the usage or not of the macros (I like to use due to readability)
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

dedndave

T is probably TenByte Ptr   :P

guga

 :biggrin: my bad  :icon_mrgreen: TenByte not "Terabyte" :icon_mrgreen: :icon_mrgreen:
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

Gunther

Hi nidud,

results for memcpy:

Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz (SSE4)
------------------------------------------------------
14554      cycles - a   1..256  (  0) crt_memcpy
14385      cycles - a   1..256  ( 98) memcpy
12017      cycles - a   1..256  (157) memcpy SSE2
13916      cycles - a   1..256  ( 89) regcopy
12463      cycles - a   1..256  (172) memcpyxmmU SSE
39451      cycles - a   1..256  ( 87) memcpy_SSE_V2
46031      cycles - a   1..256  ( 84) memcpy_SSE_V4

44040      cycles - u   1..256  (  0) crt_memcpy
41462      cycles - u   1..256  ( 98) memcpy
29933      cycles - u   1..256  (157) memcpy SSE2
37559      cycles - u   1..256  ( 89) regcopy
30145      cycles - u   1..256  (172) memcpyxmmU SSE
41172      cycles - u   1..256  ( 87) memcpy_SSE_V2
45774      cycles - u   1..256  ( 84) memcpy_SSE_V4

757294     cycles - a 400..4000 (  0) crt_memcpy
767330     cycles - a 400..4000 ( 98) memcpy
668195     cycles - a 400..4000 (157) memcpy SSE2
2342388    cycles - a 400..4000 ( 89) regcopy
643145     cycles - a 400..4000 (172) memcpyxmmU SSE
1147039    cycles - a 400..4000 ( 87) memcpy_SSE_V2
1286232    cycles - a 400..4000 ( 84) memcpy_SSE_V4

2441358    cycles - u 400..4000 (  0) crt_memcpy
2334729    cycles - u 400..4000 ( 98) memcpy
1461821    cycles - u 400..4000 (157) memcpy SSE2
2672089    cycles - u 400..4000 ( 89) regcopy
1289715    cycles - u 400..4000 (172) memcpyxmmU SSE
1159220    cycles - u 400..4000 ( 87) memcpy_SSE_V2
1288160    cycles - u 400..4000 ( 84) memcpy_SSE_V4
--- ok ---

Results for memzero:

Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz (SSE4)
------------------------------------------------------
39303      cycles - a   1..256  (  0) crt_memset
22441      cycles - a   1..256  ( 22) stosb
32617      cycles - a   1..256  ( 67) memzero
22725      cycles - a   1..256  ( 80) ZeroMem_SSE

41195      cycles - u   1..256  (  0) crt_memset
22984      cycles - u   1..256  ( 22) stosb
35609      cycles - u   1..256  ( 67) memzero
23456      cycles - u   1..256  ( 80) ZeroMem_SSE

2369821    cycles - a 400..8192 (  0) crt_memset
2143114    cycles - a 400..8192 ( 22) stosb
2355755    cycles - a 400..8192 ( 67) memzero
4063180    cycles - a 400..8192 ( 80) ZeroMem_SSE

2739486    cycles - u 400..8192 (  0) crt_memset
2944992    cycles - u 400..8192 ( 22) stosb
3212215    cycles - u 400..8192 ( 67) memzero
4038264    cycles - u 400..8192 ( 80) ZeroMem_SSE
--- ok ---


Gunther
You have to know the facts before you can distort them.

jj2007

#114
Quote from: guga on July 05, 2014, 10:52:33 AM
This ?

I've given it a try, Gustavo - version 0.01 of Ros2Masm is attached below.
It expects a text file in the commandline; if there is no argument, RosAsmTest.asm is assumed. Use Console Build All in qEditor.

dedndave

you have to test aligned and unaligned starting addresses, of course   :P

guga

WOW..great work, JJ :t :t
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

guga

 :icon_mrgreen: Sure, dave. The aligned version works faster :):):)

I tested it here and it works like a gem . Many tks :t
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

RuiLoureiro

Hi guga

I couldn't do anything this weekend. I was sick (and...).
About your memcpy_SSE_V4, you should improve it

    1. If it jumps to L0, ECX=0.
        So we don't need to do this: mov edx ecx | shl edx 4
    2. L2 is not used
    3. If we do «sub eax edx» we don't need: test eax eax
       but only jz L4>
   
Quote
; memcpy_SSE_V4 - copy @Length bytes from @pSource to @pDest (RosAsm syntax).
;
; Arguments:
;   @pDest    address of the destination buffer
;   @pSource  address of the source buffer
;   @Length   number of bytes to copy. 0 is allowed and copies nothing.
;
; The copy runs forward, 16 bytes (128 bits) at a time through XMM1 with
; unaligned loads and stores, then byte by byte for the 0 to 15 bytes left over.
; The byte loop uses movsb, so it follows the direction flag; it is expected to
; be clear on entry (TODO confirm the callers guarantee this).
;
; Returns nothing. esi, edi, ecx and eax are saved and restored through Uses.
; XMM1 is overwritten and is not restored.
Proc memcpy_SSE_V4:
    Arguments @pDest, @pSource, @Length
    Uses esi, edi, ecx, eax

    mov edi D@pDest
    mov esi D@pSource
    mov ecx D@Length
    mov eax ecx | and eax 15    ; eax = Length mod 16: tail bytes left after the 16-byte copies
    shr ecx 4                   ; ecx = Length / 16: number of 16-byte (128-bit) copies
    jz L2>                      ; fewer than 16 bytes: only the tail loop has work to do

        L1:
            lddqu XMM1 X$esi    ; load 16 bytes from the source (unaligned load)
            movups X$edi XMM1   ; store them to the destination (unaligned store)
            add esi 16 | add edi 16
            dec ecx | jnz L1<

L2:
    ; Here esi/edi point at the first byte not yet copied and eax (0 to 15) is
    ; the number of bytes still to copy.
    test eax eax | jz L4>       ; No remainder ? Exit
L3:  movsb | dec eax | jnz L3<
L4:
EndP

RuiLoureiro

#119
Hi,
        These are my results.
       
        MOVEAtoB_SSEG is a macro with your V4 (+/-)
        (MOVEAtoB_SSEG srcName, dstName, cpyLen)
       
        Could you post your results ?

        Gunther, could you run this CopyString47, and CopyString48, please ?
        Thanks
     
**replace 128 BYTES by 128 BITS

Quote
NOT ALIGNED
-----------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
-----------------------------------------------------
***** Time table *****

  5 milliseconds, MOVEAtoB_SSEE-  13 bytes- copy 128 BYTES+MOVZX
  5 milliseconds, MOVEAtoB_XZZF-  13 bytes- copy lenght DWORDS+MOVZX
  5 milliseconds, MOVEAtoB_XZZC-  13 bytes- copy lenght DWORDS+MOVZX
  5 milliseconds, MOVEAtoB_XZE-   13 bytes- copy lenght DWORDS+MOVZX
  5 milliseconds, MOVEAtoB_XZD-   13 bytes- copy lenght DWORDS+MOVZX
  5 milliseconds, MOVEAtoB_XZC-   13 bytes- copy lenght DWORDS+MOVZX
  6 milliseconds, MOVEAtoB_XZZA-  13 bytes- copy lenght DWORDS+MOVZX
  6 milliseconds, MOVEAtoB_XZZD-  13 bytes- copy lenght DWORDS+MOVZX
  6 milliseconds, MOVEAtoB_XZZE-  13 bytes- copy lenght DWORDS+MOVZX
  6 milliseconds, MOVEAtoB_XZZB-  13 bytes- copy lenght DWORDS+MOVZX
  6 milliseconds, MOVEAtoB_XZB-   13 bytes- copy lenght DWORDS+MOVZX
  8 milliseconds, MOVEAtoB_XZA-   13 bytes- copy lenght DWORDS+MOVZX
12 milliseconds, MOVEAtoB_XZE-   53 bytes- copy lenght DWORDS+MOVZX
12 milliseconds, MOVEAtoB_XZZF-  53 bytes- copy lenght DWORDS+MOVZX
12 milliseconds, MOVEAtoB_XZD-   53 bytes- copy lenght DWORDS+MOVZX
12 milliseconds, MOVEAtoB_XZZB-  53 bytes- copy lenght DWORDS+MOVZX
12 milliseconds, MOVEAtoB_XZC-   53 bytes- copy lenght DWORDS+MOVZX
12 milliseconds, MOVEAtoB_XZZD-  53 bytes- copy lenght DWORDS+MOVZX
12 milliseconds, MOVEAtoB_XZB-   53 bytes- copy lenght DWORDS+MOVZX
12 milliseconds, MOVEAtoB_XZZA-  53 bytes- copy lenght DWORDS+MOVZX
12 milliseconds, MOVEAtoB_XZA-   53 bytes- copy lenght DWORDS+MOVZX
12 milliseconds, MOVEAtoB_XZZE-  53 bytes- copy lenght DWORDS+MOVZX
18 milliseconds, MOVEAtoB_XZZC-  53 bytes- copy lenght DWORDS+MOVZX
34 milliseconds, MOVEAtoB_XZZF- 103 bytes- copy lenght DWORDS+MOVZX
34 milliseconds, MOVEAtoB_XZE-  103 bytes- copy lenght DWORDS+MOVZX
35 milliseconds, MOVEAtoB_XZZA- 103 bytes- copy lenght DWORDS+MOVZX
35 milliseconds, MOVEAtoB_XZZB- 103 bytes- copy lenght DWORDS+MOVZX
35 milliseconds, MOVEAtoB_XZB-  103 bytes- copy lenght DWORDS+MOVZX
35 milliseconds, MOVEAtoB_XZD-  103 bytes- copy lenght DWORDS+MOVZX
35 milliseconds, MOVEAtoB_XZZE- 103 bytes- copy lenght DWORDS+MOVZX
35 milliseconds, MOVEAtoB_XZZD- 103 bytes- copy lenght DWORDS+MOVZX
35 milliseconds, MOVEAtoB_XZC-  103 bytes- copy lenght DWORDS+MOVZX
37 milliseconds, MOVEAtoB_SSEE-  53 bytes- copy 128 BYTES+MOVZX
37 milliseconds, MOVEAtoB_XZA-  103 bytes- copy lenght DWORDS+MOVZX
39 milliseconds, MOVEAtoB_SSEH-  53 bytes- copy 128 BYTES+MOVZX
40 milliseconds, MOVEAtoB_SSEG-  53 bytes- copy 128 BYTES+MOVZX
42 milliseconds, MOVEAtoB_XZZC- 103 bytes- copy lenght DWORDS+MOVZX
42 milliseconds, MOVEAtoB_SSEG-  13 bytes- copy 128 BYTES+MOVZX
42 milliseconds, MOVEAtoB_SSEH-  13 bytes- copy 128 BYTES+MOVZX
51 milliseconds, MOVEAtoB_SSEE- 103 bytes- copy 128 BYTES+MOVZX
52 milliseconds, MOVEAtoB_SSEH- 103 bytes- copy 128 BYTES+MOVZX
56 milliseconds, MOVEAtoB_SSEG- 103 bytes- copy 128 BYTES+MOVZX
57 milliseconds, MOVEAtoB_XZB-  203 bytes- copy lenght DWORDS+MOVZX
57 milliseconds, MOVEAtoB_XZZA- 203 bytes- copy lenght DWORDS+MOVZX
57 milliseconds, MOVEAtoB_XZZF- 203 bytes- copy lenght DWORDS+MOVZX
57 milliseconds, MOVEAtoB_XZC-  203 bytes- copy lenght DWORDS+MOVZX
57 milliseconds, MOVEAtoB_XZZD- 203 bytes- copy lenght DWORDS+MOVZX
57 milliseconds, MOVEAtoB_XZD-  203 bytes- copy lenght DWORDS+MOVZX
57 milliseconds, MOVEAtoB_XZZB- 203 bytes- copy lenght DWORDS+MOVZX
58 milliseconds, MOVEAtoB_XZZE- 203 bytes- copy lenght DWORDS+MOVZX
59 milliseconds, MOVEAtoB_XZZC- 203 bytes- copy lenght DWORDS+MOVZX
59 milliseconds, MOVEAtoB_XZE-  203 bytes- copy lenght DWORDS+MOVZX
61 milliseconds, MOVEAtoB_XZA-  203 bytes- copy lenght DWORDS+MOVZX
122 milliseconds, MOVEAtoB_SSEH- 203 bytes- copy 128 BYTES+MOVZX
124 milliseconds, MOVEAtoB_XZZF- 503 bytes- copy lenght DWORDS+MOVZX
124 milliseconds, MOVEAtoB_XZE-  503 bytes- copy lenght DWORDS+MOVZX
124 milliseconds, MOVEAtoB_XZZC- 503 bytes- copy lenght DWORDS+MOVZX
124 milliseconds, MOVEAtoB_XZC-  503 bytes- copy lenght DWORDS+MOVZX
124 milliseconds, MOVEAtoB_XZZA- 503 bytes- copy lenght DWORDS+MOVZX
124 milliseconds, MOVEAtoB_XZZB- 503 bytes- copy lenght DWORDS+MOVZX
124 milliseconds, MOVEAtoB_XZZE- 503 bytes- copy lenght DWORDS+MOVZX
124 milliseconds, MOVEAtoB_SSEG- 203 bytes- copy 128 BYTES+MOVZX
125 milliseconds, MOVEAtoB_XZD-  503 bytes- copy lenght DWORDS+MOVZX
125 milliseconds, MOVEAtoB_SSEE- 203 bytes- copy 128 BYTES+MOVZX
126 milliseconds, MOVEAtoB_XZZD- 503 bytes- copy lenght DWORDS+MOVZX
126 milliseconds, MOVEAtoB_XZB-  503 bytes- copy lenght DWORDS+MOVZX
145 milliseconds, MOVEAtoB_XZA-  503 bytes- copy lenght DWORDS+MOVZX
238 milliseconds, MOVEAtoB_XZZE-1027 bytes- copy lenght DWORDS+MOVZX
238 milliseconds, MOVEAtoB_XZZA-1027 bytes- copy lenght DWORDS+MOVZX
238 milliseconds, MOVEAtoB_XZZB-1027 bytes- copy lenght DWORDS+MOVZX
238 milliseconds, MOVEAtoB_XZZC-1027 bytes- copy lenght DWORDS+MOVZX
239 milliseconds, MOVEAtoB_XZE- 1027 bytes- copy lenght DWORDS+MOVZX
240 milliseconds, MOVEAtoB_XZZF-1027 bytes- copy lenght DWORDS+MOVZX
240 milliseconds, MOVEAtoB_XZB- 1027 bytes- copy lenght DWORDS+MOVZX
240 milliseconds, MOVEAtoB_XZZD-1027 bytes- copy lenght DWORDS+MOVZX
241 milliseconds, MOVEAtoB_XZA- 1027 bytes- copy lenght DWORDS+MOVZX
242 milliseconds, MOVEAtoB_XZC- 1027 bytes- copy lenght DWORDS+MOVZX
249 milliseconds, MOVEAtoB_XZD- 1027 bytes- copy lenght DWORDS+MOVZX

308 milliseconds, MOVEAtoB_SSEG- 503 bytes- copy 128 BYTES+MOVZX
312 milliseconds, MOVEAtoB_SSEH- 503 bytes- copy 128 BYTES+MOVZX
317 milliseconds, MOVEAtoB_SSEE- 503 bytes- copy 128 BYTES+MOVZX
601 milliseconds, MOVEAtoB_SSEG-1027 bytes- copy 128 BYTES+MOVZX
629 milliseconds, MOVEAtoB_SSEH-1027 bytes- copy 128 BYTES+MOVZX
674 milliseconds, MOVEAtoB_SSEE-1027 bytes- copy 128 BYTES+MOVZX
********** END III **********
Quote
NOT ALIGNED
-----------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
-----------------------------------------------------
***** Time table *****

  5 milliseconds, MOVEAtoB_SSED-  13 bytes- copy 128 BYTES+MOVZX
  5 milliseconds, MOVEAtoB_SSEF-  13 bytes- copy 128 BYTES+MOVZX
  5 milliseconds, MOVEAtoB_SSEE-  13 bytes- copy 128 BYTES+MOVZX
  5 milliseconds, MOVEAtoB_XZZF-  13 bytes- copy lenght DWORDS+MOVZX
  6 milliseconds, MOVEAtoB_XZZC-  13 bytes- copy lenght DWORDS+MOVZX
  8 milliseconds, MOVEAtoB_XZE-   13 bytes- copy lenght DWORDS+MOVZX
14 milliseconds, MOVEAtoB_XZZE-  13 bytes- copy lenght DWORDS+MOVZX

15 milliseconds, MOVEAtoB_XZE-   53 bytes- copy lenght DWORDS+MOVZX
15 milliseconds, MOVEAtoB_XZZF-  53 bytes- copy lenght DWORDS+MOVZX
17 milliseconds, MOVEAtoB_XZZC-  53 bytes- copy lenght DWORDS+MOVZX
18 milliseconds, MOVEAtoB_SSEF-  53 bytes- copy 128 BYTES+MOVZX
20 milliseconds, MOVEAtoB_SSED-  53 bytes- copy 128 BYTES+MOVZX
25 milliseconds, MOVEAtoB_SSEE-  53 bytes- copy 128 BYTES+MOVZX
29 milliseconds, MOVEAtoB_SSEG-  53 bytes- copy 128 BYTES+MOVZX
29 milliseconds, MOVEAtoB_XZZF- 103 bytes- copy lenght DWORDS+MOVZX
29 milliseconds, MOVEAtoB_SSEH-  53 bytes- copy 128 BYTES+MOVZX

30 milliseconds, MOVEAtoB_XZZE- 103 bytes- copy lenght DWORDS+MOVZX
30 milliseconds, MOVEAtoB_XZE-  103 bytes- copy lenght DWORDS+MOVZX
31 milliseconds, MOVEAtoB_XZZC- 103 bytes- copy lenght DWORDS+MOVZX
32 milliseconds, MOVEAtoB_SSED- 103 bytes- copy 128 BYTES+MOVZX
34 milliseconds, MOVEAtoB_XZZE-  53 bytes- copy lenght DWORDS+MOVZX
35 milliseconds, MOVEAtoB_SSEE- 103 bytes- copy 128 BYTES+MOVZX
40 milliseconds, MOVEAtoB_SSEF- 103 bytes- copy 128 BYTES+MOVZX
42 milliseconds, MOVEAtoB_SSEH-  13 bytes- copy 128 BYTES+MOVZX
43 milliseconds, MOVEAtoB_SSEG-  13 bytes- copy 128 BYTES+MOVZX

48 milliseconds, MOVEAtoB_XZZE- 203 bytes- copy lenght DWORDS+MOVZX
49 milliseconds, MOVEAtoB_SSEH- 103 bytes- copy 128 BYTES+MOVZX
49 milliseconds, MOVEAtoB_XZZF- 203 bytes- copy lenght DWORDS+MOVZX
49 milliseconds, MOVEAtoB_SSEG- 103 bytes- copy 128 BYTES+MOVZX
51 milliseconds, MOVEAtoB_XZZC- 203 bytes- copy lenght DWORDS+MOVZX
53 milliseconds, MOVEAtoB_XZE-  203 bytes- copy lenght DWORDS+MOVZX
67 milliseconds, MOVEAtoB_SSED- 203 bytes- copy 128 BYTES+MOVZX
67 milliseconds, MOVEAtoB_SSEF- 203 bytes- copy 128 BYTES+MOVZX
70 milliseconds, MOVEAtoB_SSEE- 203 bytes- copy 128 BYTES+MOVZX
91 milliseconds, MOVEAtoB_SSEH- 203 bytes- copy 128 BYTES+MOVZX
92 milliseconds, MOVEAtoB_SSEG- 203 bytes- copy 128 BYTES+MOVZX

99 milliseconds, MOVEAtoB_XZZE- 503 bytes- copy lenght DWORDS+MOVZX
99 milliseconds, MOVEAtoB_XZZC- 503 bytes- copy lenght DWORDS+MOVZX
114 milliseconds, MOVEAtoB_XZE-  503 bytes- copy lenght DWORDS+MOVZX
116 milliseconds, MOVEAtoB_XZZF- 503 bytes- copy lenght DWORDS+MOVZX
127 milliseconds, MOVEAtoB_SSED- 503 bytes- copy 128 BYTES+MOVZX
127 milliseconds, MOVEAtoB_SSEE- 503 bytes- copy 128 BYTES+MOVZX
129 milliseconds, MOVEAtoB_SSEF- 503 bytes- copy 128 BYTES+MOVZX
144 milliseconds, MOVEAtoB_SSEH- 503 bytes- copy 128 BYTES+MOVZX
151 milliseconds, MOVEAtoB_SSEG- 503 bytes- copy 128 BYTES+MOVZX

185 milliseconds, MOVEAtoB_XZZF-1027 bytes- copy lenght DWORDS+MOVZX
187 milliseconds, MOVEAtoB_XZZE-1027 bytes- copy lenght DWORDS+MOVZX
193 milliseconds, MOVEAtoB_XZE- 1027 bytes- copy lenght DWORDS+MOVZX
209 milliseconds, MOVEAtoB_XZZC-1027 bytes- copy lenght DWORDS+MOVZX
241 milliseconds, MOVEAtoB_SSEE-1027 bytes- copy 128 BYTES+MOVZX
241 milliseconds, MOVEAtoB_SSEF-1027 bytes- copy 128 BYTES+MOVZX
243 milliseconds, MOVEAtoB_SSED-1027 bytes- copy 128 BYTES+MOVZX
248 milliseconds, MOVEAtoB_SSEG-1027 bytes- copy 128 BYTES+MOVZX
252 milliseconds, MOVEAtoB_SSEH-1027 bytes- copy 128 BYTES+MOVZX
********** END III **********
Quote
NOT ALIGNED
-----------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.40GHz (SSE3)
------------------------------------------------------
***** Time table *****

  16 cycles, MOVEAtoB_SSED-  13 bytes- copy 128 BITS+MOVZX
  17 cycles, MOVEAtoB_SSEF-  13 bytes- copy 128 BITS+MOVZX
  18 cycles, MOVEAtoB_SSEE-  13 bytes- copy 128 BITS+MOVZX
  18 cycles, MOVEAtoB_XZZF-  13 bytes- copy lenght DWORDS+MOVZX
  18 cycles, MOVEAtoB_XZZE-  13 bytes- copy lenght DWORDS+MOVZX
  19 cycles, MOVEAtoB_XZZC-  13 bytes- copy lenght DWORDS+MOVZX
  27 cycles, MOVEAtoB_XZE-   13 bytes- copy lenght DWORDS+MOVZX
 
  39 cycles, MOVEAtoB_XZZF-  53 bytes- copy lenght DWORDS+MOVZX
  40 cycles, MOVEAtoB_XZZE-  53 bytes- copy lenght DWORDS+MOVZX
  40 cycles, MOVEAtoB_XZE-   53 bytes- copy lenght DWORDS+MOVZX
  40 cycles, MOVEAtoB_XZZC-  53 bytes- copy lenght DWORDS+MOVZX
117 cycles, MOVEAtoB_XZZF- 103 bytes- copy lenght DWORDS+MOVZX
117 cycles, MOVEAtoB_XZZE- 103 bytes- copy lenght DWORDS+MOVZX
118 cycles, MOVEAtoB_XZZC- 103 bytes- copy lenght DWORDS+MOVZX
120 cycles, MOVEAtoB_XZE-  103 bytes- copy lenght DWORDS+MOVZX

121 cycles, MOVEAtoB_SSED-  53 bytes- copy 128 BITS+MOVZX
126 cycles, MOVEAtoB_SSEF-  53 bytes- copy 128 BITS+MOVZX
129 cycles, MOVEAtoB_SSEE-  53 bytes- copy 128 BITS+MOVZX
131 cycles, MOVEAtoB_SSEH-  53 bytes- copy 128 BITS+MOVZX
137 cycles, MOVEAtoB_SSEG-  53 bytes- copy 128 BITS+MOVZX
144 cycles, MOVEAtoB_SSEH-  13 bytes- copy 128 BITS+MOVZX
144 cycles, MOVEAtoB_SSEG-  13 bytes- copy 128 BITS+MOVZX
171 cycles, MOVEAtoB_SSED- 103 bytes- copy 128 BITS+MOVZX
173 cycles, MOVEAtoB_SSEF- 103 bytes- copy 128 BITS+MOVZX
175 cycles, MOVEAtoB_SSEE- 103 bytes- copy 128 BITS+MOVZX
178 cycles, MOVEAtoB_SSEH- 103 bytes- copy 128 BITS+MOVZX
188 cycles, MOVEAtoB_SSEG- 103 bytes- copy 128 BITS+MOVZX

193 cycles, MOVEAtoB_XZZF- 203 bytes- copy lenght DWORDS+MOVZX
194 cycles, MOVEAtoB_XZZE- 203 bytes- copy lenght DWORDS+MOVZX
196 cycles, MOVEAtoB_XZZC- 203 bytes- copy lenght DWORDS+MOVZX
210 cycles, MOVEAtoB_XZE-  203 bytes- copy lenght DWORDS+MOVZX
367 cycles, MOVEAtoB_SSED- 203 bytes- copy 128 BITS+MOVZX
410 cycles, MOVEAtoB_SSEG- 203 bytes- copy 128 BITS+MOVZX
423 cycles, MOVEAtoB_XZZC- 503 bytes- copy lenght DWORDS+MOVZX
425 cycles, MOVEAtoB_XZZF- 503 bytes- copy lenght DWORDS+MOVZX
426 cycles, MOVEAtoB_SSEE- 203 bytes- copy 128 BITS+MOVZX
427 cycles, MOVEAtoB_XZZE- 503 bytes- copy lenght DWORDS+MOVZX
437 cycles, MOVEAtoB_SSEH- 203 bytes- copy 128 BITS+MOVZX
442 cycles, MOVEAtoB_SSEF- 203 bytes- copy 128 BITS+MOVZX

493 cycles, MOVEAtoB_XZE-  503 bytes- copy lenght DWORDS+MOVZX
823 cycles, MOVEAtoB_XZE- 1027 bytes- copy lenght DWORDS+MOVZX
835 cycles, MOVEAtoB_XZZE-1027 bytes- copy lenght DWORDS+MOVZX
933 cycles, MOVEAtoB_XZZC-1027 bytes- copy lenght DWORDS+MOVZX
987 cycles, MOVEAtoB_XZZF-1027 bytes- copy lenght DWORDS+MOVZX
1024 cycles, MOVEAtoB_SSED- 503 bytes- copy 128 BITS+MOVZX
1043 cycles, MOVEAtoB_SSEG- 503 bytes- copy 128 BITS+MOVZX
1064 cycles, MOVEAtoB_SSEH- 503 bytes- copy 128 BITS+MOVZX
1077 cycles, MOVEAtoB_SSEF- 503 bytes- copy 128 BITS+MOVZX
1081 cycles, MOVEAtoB_SSEE- 503 bytes- copy 128 BITS+MOVZX
2045 cycles, MOVEAtoB_SSEG-1027 bytes- copy 128 BITS+MOVZX
2117 cycles, MOVEAtoB_SSED-1027 bytes- copy 128 BITS+MOVZX
2141 cycles, MOVEAtoB_SSEH-1027 bytes- copy 128 BITS+MOVZX
2293 cycles, MOVEAtoB_SSEE-1027 bytes- copy 128 BITS+MOVZX
2293 cycles, MOVEAtoB_SSEF-1027 bytes- copy 128 BITS+MOVZX
********** END III **********