Author Topic: Code location sensitivity of timings  (Read 36168 times)

nidud

  • Member
  • *****
  • Posts: 2311
    • https://github.com/nidud/asmc
Re: Code location sensitivity of timings
« Reply #15 on: July 21, 2014, 06:19:30 AM »
they're all over the place   :P

Before I made the changes my machine was all over the place and yours was stable using the same test   :P

Now the AMD I use produces the same result every time, with no deviation whether a delay is used or not, so that's good if you are in a hurry. The test is also constructed this way, so there are three runs with different loop counts and inputs.

Using an Intel machine produces this result:
Code: [Select]
Intel(R) Core(TM) i3 CPU         540  @ 3.07GHz (SSE4)
---------------------------------------------
88029   cycles - (  0) 0: crt_strchr
83409   cycles - ( 29) 1: x
50004   cycles - (119) 2: 'c'
49914   cycles - (107) 3: 'cccc'

84574   cycles - (  0) 0: crt_strchr
83375   cycles - ( 29) 1: x
50586   cycles - (119) 2: 'c'
50408   cycles - (107) 3: 'cccc'

109988  cycles - (  0) 0: crt_strchr
22161   cycles - ( 29) 1: x
19388   cycles - (119) 2: 'c'
16840   cycles - (107) 3: 'cccc'

--2--

93149   cycles - (  0) 0: crt_strchr
83171   cycles - ( 29) 1: x
49828   cycles - (119) 2: 'c'
49987   cycles - (107) 3: 'cccc'

84252   cycles - (  0) 0: crt_strchr
83229   cycles - ( 29) 1: x
50892   cycles - (119) 2: 'c'
50861   cycles - (107) 3: 'cccc'

110783  cycles - (  0) 0: crt_strchr
22027   cycles - ( 29) 1: x
19821   cycles - (119) 2: 'c'
16775   cycles - (107) 3: 'cccc'

--3--

100491  cycles - (  0) 0: crt_strchr
83365   cycles - ( 29) 1: x
50163   cycles - (119) 2: 'c'
50950   cycles - (107) 3: 'cccc'

84368   cycles - (  0) 0: crt_strchr
82911   cycles - ( 29) 1: x
50919   cycles - (119) 2: 'c'
52683   cycles - (107) 3: 'cccc'

110577  cycles - (  0) 0: crt_strchr
22136   cycles - ( 29) 1: x
19384   cycles - (119) 2: 'c'
16812   cycles - (107) 3: 'cccc'

If you organize the result you get this:

88029   cycles - (  0) 0: crt_strchr
93149   cycles - (  0) 0: crt_strchr
100491  cycles - (  0) 0: crt_strchr

83409   cycles - ( 29) 1: x
83171   cycles - ( 29) 1: x
83365   cycles - ( 29) 1: x

50004   cycles - (119) 2: 'c'
49828   cycles - (119) 2: 'c'
50163   cycles - (119) 2: 'c'

49914   cycles - (107) 3: 'cccc'
49987   cycles - (107) 3: 'cccc'
50950   cycles - (107) 3: 'cccc'

--unaligned--

84574   cycles - (  0) 0: crt_strchr
84252   cycles - (  0) 0: crt_strchr
84368   cycles - (  0) 0: crt_strchr

83375   cycles - ( 29) 1: x
83229   cycles - ( 29) 1: x
82911   cycles - ( 29) 1: x

50586   cycles - (119) 2: 'c'
50892   cycles - (119) 2: 'c'
50919   cycles - (119) 2: 'c'

50408   cycles - (107) 3: 'cccc'
50861   cycles - (107) 3: 'cccc'
52683   cycles - (107) 3: 'cccc'

--short test--

109988  cycles - (  0) 0: crt_strchr
110783  cycles - (  0) 0: crt_strchr
110577  cycles - (  0) 0: crt_strchr

22161   cycles - ( 29) 1: x
22027   cycles - ( 29) 1: x
22136   cycles - ( 29) 1: x

19388   cycles - (119) 2: 'c'
19821   cycles - (119) 2: 'c'
19384   cycles - (119) 2: 'c'

16840   cycles - (107) 3: 'cccc'
16775   cycles - (107) 3: 'cccc'
16812   cycles - (107) 3: 'cccc'




nidud

  • Member
  • *****
  • Posts: 2311
    • https://github.com/nidud/asmc
Re: Code location sensitivity of timings
« Reply #16 on: July 23, 2014, 01:21:19 AM »
some more string probing..

I rewrote most of the string functions and need some timings

here is the time for strcpy:

AMD Athlon(tm) II X2 245 Processor (SSE3)
-------------------------------------------
-- aligned strings --
605184  cycles -  10 (  0) 0: crt_strcpy
1490896 cycles -  10 ( 37) 1: movsb
587922  cycles -  10 (118) 2: aligned
554282  cycles -  10 ( 85) 3: unaligned
171751  cycles -  10 (188) 4: SSE aligned
172081  cycles -  10 (141) 5: SSE unaligned
-- unaligned strings --
607417  cycles -  10 (  0) 0: crt_strcpy
1489108 cycles -  10 ( 37) 1: movsb
583246  cycles -  10 (118) 2: aligned
631376  cycles -  10 ( 85) 3: unaligned
171912  cycles -  10 (188) 4: SSE aligned
233758  cycles -  10 (141) 5: SSE unaligned
-- short strings --
124021  cycles - 800 (  0) 0: crt_strcpy
337044  cycles - 800 ( 37) 1: movsb
116820  cycles - 800 (118) 2: aligned
116822  cycles - 800 ( 85) 3: unaligned
45622   cycles - 800 (188) 4: SSE aligned
46020   cycles - 800 (141) 5: SSE unaligned

nidud

  • Member
  • *****
  • Posts: 2311
    • https://github.com/nidud/asmc
Re: Code location sensitivity of timings
« Reply #17 on: July 23, 2014, 01:28:20 AM »
this is the strlen version
Code: [Select]
; strlen(string) - length of a zero-terminated string, scanned four bytes
; per iteration. The length is returned in eax and copied to ecx; edx is
; saved and restored through the USES clause.
; NOTE(review): the dword loads start at the string pointer itself, so they
; may be unaligned and may read up to three bytes past the terminator.
strlen  proc uses edx string
mov eax,string
align 4
@@: mov edx,[eax] ; next four bytes of the string
add eax,4
lea ecx,[edx-01010101h] ; zero-byte test: (x - 01010101h) and (not x)
not edx
and ecx,edx
and ecx,80808080h ; non-zero only if the dword holds a 00h byte
jz @B
bsf ecx,ecx ; lowest flag bit marks the first zero byte
shr ecx,3 ; bit index -> byte index (0..3)
sub eax,string ; bytes consumed so far, this dword included
lea eax,[eax+ecx-4] ; length = offset of this dword + byte index
mov ecx,eax ; the length is also left in ecx
ret
strlen  endp

and the SSE2 version:
Code: [Select]
; strlen(string) - SSE2 variant: compares 16 bytes per iteration against
; zero. The length is returned in eax and copied to ecx.
; NOTE(review): the unaligned 16-byte loads start at the string pointer, so
; they may read up to 15 bytes past the terminator.
strlen  proc string
mov ecx,string
xorps xmm1,xmm1 ; SSE2 - xmm1 = 16 zero bytes to compare against
align 4
@@: movdqu  xmm0,[ecx] ; SSE2 - unaligned load of the next 16 bytes
pcmpeqb xmm0,xmm1 ; SSE2 - 0FFh in every byte that equals zero
pmovmskb eax,xmm0 ; SSE2 - one mask bit per byte
add ecx,16 ; step to the next block before testing the mask
test eax,eax
jz @B
bsf eax,eax ; index of the first zero byte inside the block
sub ecx,string ; bytes consumed, this block included
lea eax,[eax+ecx-16] ; length = offset of this block + index
mov ecx,eax ; the length is also left in ecx
ret
strlen  endp

here is the time for strlen:
AMD Athlon(tm) II X2 245 Processor (SSE3)
-----------------------------------------
127029  cycles - 1000 (  0) 0: crt_strchr
102667  cycles - 1000 ( 51) 1: unaligned
123454  cycles - 1000 ( 91) 2: aligned
51428   cycles - 1000 ( 47) 3: SSE

128706  cycles - 1000 (  0) 0: crt_strchr
102644  cycles - 1000 ( 51) 1: unaligned
123452  cycles - 1000 ( 91) 2: aligned
50759   cycles - 1000 ( 47) 3: SSE

127429  cycles - 1000 (  0) 0: crt_strchr
102647  cycles - 1000 ( 51) 1: unaligned
124072  cycles - 1000 ( 91) 2: aligned
50757   cycles - 1000 ( 47) 3: SSE

nidud

  • Member
  • *****
  • Posts: 2311
    • https://github.com/nidud/asmc
Re: Code location sensitivity of timings
« Reply #18 on: July 23, 2014, 01:55:38 AM »
some fixup on the strchr function:
Code: [Select]
;-----------------------------------------------------------------------
; strchr(string, char) - find the first occurrence of the low byte of
; <char> in a zero-terminated string.
;
; In:    string = pointer to the string, char = byte to look for
; Out:   eax = pointer to the first match, or 0 when the terminator is
;        reached first. The terminator itself counts as part of the
;        string, so searching for 0 returns a pointer to it.
; Saved: edx, ebx (USES).  Clobbers: ecx, flags.
;
; The main loop reads four bytes at a time starting at <string>, so the
; loads may be unaligned and may touch up to three bytes past the
; terminator.
;-----------------------------------------------------------------------
strchr proc uses edx ebx string, char
        mov     eax,char
        mov     ah,al
        mov     ebx,eax
        shl     ebx,16
        mov     bx,ax                   ; ebx = char in all four bytes
        not     ebx                     ; kept inverted: edx is inverted too when used
        mov     eax,string
        align   4
lup:    mov     edx,[eax]
        lea     ecx,[edx-01010101H]     ; zero-byte test on the raw dword
        not     edx
        and     ecx,edx
        and     ecx,80808080h
        jnz     tail                    ; terminator somewhere in these four bytes
        add     eax,4
        xor     edx,ebx                 ; (not x) xor (not c) = x xor c: 00h where a byte matches
        lea     ecx,[edx-01010101H]     ; same zero-byte test, now on the differences
        not     edx
        and     ecx,edx
        and     ecx,80808080h
        jz      lup                     ; neither terminator nor match: next dword
        sub     eax,4                   ; back to the dword that holds the match
tail:   not     ebx                     ; bl = char again
        mov     ecx,[eax]
        ; Walk the four bytes in order. The match test comes first so that
        ; a search for 0 finds the terminator.
        cmp     cl,bl
        je      toend
        test    cl,cl
        jz      null
        inc     eax
        cmp     ch,bl
        je      toend
        test    ch,ch
        jz      null
        shr     ecx,16                  ; cl/ch = bytes 2 and 3
        inc     eax
        cmp     cl,bl
        je      toend
        test    cl,cl
        jz      null
        inc     eax
        cmp     ch,bl
        je      toend
        ; Only reached when byte 3 is the terminator: the tests above
        ; guarantee a match or a zero inside this dword.
null:   xor     eax,eax
toend:  ret
strchr endp


AMD Athlon(tm) II X2 245 Processor (SSE3)
-----------------------------------------
-- aligned strings --
330270  cycles - (  0) 0: crt_strchr
364594  cycles - ( 29) 1: x
213070  cycles - (105) 2: 'c'
212955  cycles - (112) 9: no bsf
-- unaligned strings --
332545  cycles - (  0) 0: crt_strchr
362595  cycles - ( 29) 1: x
237024  cycles - (105) 2: 'c'
237257  cycles - (112) 9: no bsf
-- short strings --
43523   cycles - (  0) 0: crt_strchr
31017   cycles - ( 29) 1: x
16518   cycles - (105) 2: 'c'
17020   cycles - (112) 9: no bsf

nidud

  • Member
  • *****
  • Posts: 2311
    • https://github.com/nidud/asmc
Re: Code location sensitivity of timings
« Reply #19 on: July 23, 2014, 02:08:28 AM »
the strstr function also failed, so here is an updated version

AMD Athlon(tm) II X2 245 Processor (SSE3)
-------------------------------------------
-- aligned strings --
533061  cycles -  10 (  0) 0: crt_strstr
918530  cycles -  10 (  0) 1: InString(1,dst,src) - 1 + dst
668698  cycles -  10 ( 46) 2: strstr
565355  cycles -  10 ( 57) 3: x
308053  cycles -  10 (150) 4: x
205373  cycles -  10 (176) 5: x
-- unaligned strings --
533096  cycles -  10 (  0) 0: crt_strstr
938277  cycles -  10 (  0) 1: InString(1,dst,src) - 1 + dst
666364  cycles -  10 ( 46) 2: strstr
565143  cycles -  10 ( 57) 3: x
321082  cycles -  10 (150) 4: x
215754  cycles -  10 (176) 5: x
-- short strings --
86064   cycles - 500 (  0) 0: crt_strstr
192023  cycles - 500 (  0) 1: InString(1,dst,src) - 1 + dst
113705  cycles - 500 ( 46) 2: strstr
89017   cycles - 500 ( 57) 3: x
73041   cycles - 500 (150) 4: x
67541   cycles - 500 (176) 5: x

nidud

  • Member
  • *****
  • Posts: 2311
    • https://github.com/nidud/asmc
Re: Code location sensitivity of timings
« Reply #20 on: July 23, 2014, 03:41:08 AM »
So, how to implement the SSE functions into the library?

I'm now using functions like SetFilePointerEx:
Quote
Requirements
Minimum supported client
Windows XP [desktop apps | Windows Store apps]
Minimum supported server
Windows Server 2003 [desktop apps | Windows Store apps]

The SSE level used is SSE2 so how common is this combination?

The implementation is currently to include both versions. I use the GetSSELevel function to set a sselevel variable on startup. Each module will then auto install on demand:
Code: [Select]
; Per-module installer: strlen is reached through the p_strlen pointer,
; which starts out at the plain routine and is switched to SSE_strlen when
; the SSE_SSE2 bit is set in sselevel.
.data
p_strlen dd strlen ; default target
.code
Install:
.if sselevel & SSE_SSE2
    mov p_strlen,SSE_strlen
.endif
ret

pragma_init Install,41 ; register Install with priority 41

end

and the header file:
Code: [Select]
; With __SSE__ defined, strlen becomes a text macro for the p_strlen
; function pointer (one dword argument); otherwise it is an ordinary
; prototype.
ifdef __SSE__
pr1 typedef proto :dword
externdef p_strlen:ptr pr1
strlen equ <p_strlen>
else
strlen proto :ptr byte
endif

Both of the functions will then have to be included in the binary in order to compensate for the missing SSE functions, so what is the norm?

jj2007

  • Member
  • *****
  • Posts: 11783
  • Assembler is fun ;-)
    • MasmBasic
Re: Code location sensitivity of timings
« Reply #21 on: July 23, 2014, 05:36:38 AM »
Minimum supported client
Windows XP

The SSE level used is SSE2 so how common is this combination?

It may hurt the feelings of some fans of old hard- and software, but writing code for >=(SSE2 & Win XP) should be OK for 99% of the users.

There is a poll on SSE support here: "I'm still waiting for SSE support :) (5 votes [2.45%])"

That was 2006, 8 years ago ;)

nidud

  • Member
  • *****
  • Posts: 2311
    • https://github.com/nidud/asmc
Re: Code location sensitivity of timings
« Reply #22 on: July 23, 2014, 06:43:29 AM »
just implement it then I guess

is it normal to test the SSE level and exit if not present?

dedndave

  • Member
  • *****
  • Posts: 8829
  • Still using Abacus 2.0
    • DednDave
Re: Code location sensitivity of timings
« Reply #23 on: July 23, 2014, 07:48:38 AM »
...or provide fallback routines
you can run a little startup init routine - detect SSE support level - and fill in addresses of PROC's
i am working on something along that line at the moment

these define TYPE's for up to 6 dword parms - you can extend it easily
Code: [Select]
; Prototype types for 0 to 6 DWORD arguments; the number in the name is the
; argument count times four.
_FUNC00  TYPEDEF PROTO
_FUNC04  TYPEDEF PROTO :DWORD
_FUNC08  TYPEDEF PROTO :DWORD,:DWORD
_FUNC12  TYPEDEF PROTO :DWORD,:DWORD,:DWORD
_FUNC16  TYPEDEF PROTO :DWORD,:DWORD,:DWORD,:DWORD
_FUNC20  TYPEDEF PROTO :DWORD,:DWORD,:DWORD,:DWORD,:DWORD
_FUNC24  TYPEDEF PROTO :DWORD,:DWORD,:DWORD,:DWORD,:DWORD,:DWORD

; Pointer types to the prototypes above, usable as INVOKE targets.
_PFUNC00 TYPEDEF Ptr _FUNC00
_PFUNC04 TYPEDEF Ptr _FUNC04
_PFUNC08 TYPEDEF Ptr _FUNC08
_PFUNC12 TYPEDEF Ptr _FUNC12
_PFUNC16 TYPEDEF Ptr _FUNC16
_PFUNC20 TYPEDEF Ptr _FUNC20
_PFUNC24 TYPEDEF Ptr _FUNC24

then, i am using a structure with function pointers in it
Code: [Select]
; Table of function pointers, filled in at start-up with the routines that
; match the detected SSE level.
_FUNC STRUCT
  lpfnFunc1  _PFUNC04 ?    ;this function has 1 dword arg
  lpfnFunc2  _PFUNC12 ?    ;this function has 3 dword args
_FUNC ENDS

and, in the .DATA? section...
Code: [Select]
_Func _FUNC <> ; the one instance of the _FUNC pointer table
so, you set _Func.lpfnFunc1 and _Func.lpfnFunc2 to point at appropriate routines for the supported SSE level
then.....
Code: [Select]
    ; call through the pointer table by direct reference ...
    INVOKE  _Func.lpfnFunc1,arg1
    INVOKE  _Func.lpfnFunc2,arg1,arg2,arg3

;or

    ; ... or through a base register that holds the table address
    push    edi
    mov     edi,offset _Func
    INVOKE  [edi]._FUNC.lpfnFunc1,arg1
    INVOKE  [edi]._FUNC.lpfnFunc2,arg1,arg2,arg3
    pop     edi

another way to go would be to put all the routines for each support level into a DLL
then, at init, load the DLL that is appropriate for the machine
the routines can then all have the same names

dedndave

  • Member
  • *****
  • Posts: 8829
  • Still using Abacus 2.0
    • DednDave
Re: Code location sensitivity of timings
« Reply #24 on: July 23, 2014, 07:53:25 AM »
most people probably have at least SSE3
however, we can look at the forum members, alone, and find a few machines
some that probably support only MMX or SSE(1)

i bought this machine in 2005 - it supports SSE3, which was a new thing at the time
so - it's almost 10 years old

Gunther

  • Member
  • *****
  • Posts: 3802
  • Forgive your enemies, but never forget their names
Re: Code location sensitivity of timings
« Reply #25 on: July 23, 2014, 09:03:40 AM »
i bought this machine in 2005 - it supports SSE3, which was a new thing at the time
so - it's almost 10 years old

SSE3 was introduced in April 2005 with the Prescott revision of the Pentium 4 processor.

Gunther
Get your facts first, and then you can distort them.

nidud

  • Member
  • *****
  • Posts: 2311
    • https://github.com/nidud/asmc
Re: Code location sensitivity of timings
« Reply #26 on: July 23, 2014, 09:26:21 AM »
Quote
you can run a little startup init routine - detect SSE support level - and fill in addresses of PROC's
I use modular libraries with startup modules:
Code: [Select]
; Program entry module: runs the start-up table, calls main and passes its
; return value to exit.
.486
.model  flat
option  casemap:none

public  _cstart_

; C0_main names the main routine; with __WCC__ defined it is main_
; (presumably the Watcom C spelling - confirm).
ifdef __WCC__
C0_main equ <main_>
extrn main_:abs
else
C0_main equ <main>
main proto c
endif

exit proto stdcall :dword
Initialize proto stdcall :dword, :dword

; _INIT is declared empty here and collects the entries other modules add
; to it; _IEND supplies the end address passed to Initialize below.
_INIT segment dword flat public 'INIT'
_INIT ENDS
_IEND segment dword flat public 'INIT'
_IEND ENDS

.code

_cstart_:
mov edx,offset _INIT
mov eax,offset _IEND
invoke  Initialize,edx,eax ; Initialize(start of table, end of table)
call C0_main
invoke  exit,eax ; main's return value is the exit argument
end _cstart_

so the startup routine executes the procs in the _INIT segment
Code: [Select]
; pragma_init pp, priority - append one table entry to the _INIT segment:
; the address of routine <pp> followed by its <priority> value.
pragma_init macro pp, priority
_INIT segment dword flat public 'INIT'
dd pp
dd priority
_INIT ends
endm

The SSE init code is then
Code: [Select]
;
; Stolen from Dave at the end of the road (dedndave)
;
; http://masm32.com/board/index.php?topic=3373.msg35658#msg35658
;
; Start-up routine: detects the SIMD level of the CPU, stores it as a bit
; mask in sselevel and terminates the process with a message when SSE2 is
; not available.
;
; sselevel bits, as built by the RCL chain below:
;   bit 0 MMX   bit 1 SSE     bit 2 SSE2    bit 3 SSE3
;   bit 4 SSSE3 bit 5 SSE4.1  bit 6 SSE4.2
; (SSE_SSE2 is defined elsewhere; it is assumed to select bit 2.)
;
include math.inc
include io.inc
include stdlib.inc

public  sselevel

.data
sselevel dd 0

error db "CPU error: Need SSE2 level",13,10
size_m equ $ - offset error

.code
.586

Install:

    ; CPUID exists only if the ID flag (bit 21 of EFLAGS) can be toggled.
    pushfd
    pop     eax
    mov     ecx,200000h
    mov     edx,eax
    xor     eax,ecx
    push    eax
    popfd
    pushfd
    pop     eax
    xor     eax,edx
    and     eax,ecx                 ;eax = 0 when the flag did not change
    .if !ZERO?
        push    ebx                 ;CPUID overwrites ebx
        xor     eax,eax
        cpuid                       ;eax = highest supported leaf
        .if eax
            .if ah == 5
                xor     eax,eax     ;treated as "no features", as in the original
            .else
                mov     eax,1
                cpuid               ;feature flags in edx and ecx
                xor     eax,eax
                ; Each BT copies one feature flag into CF and RCL shifts
                ; it into eax, so the first flag tested ends up highest.
                bt      ecx,20      ;SSE4.2
                rcl     eax,1       ;into bit 6
                bt      ecx,19      ;SSE4.1
                rcl     eax,1       ;into bit 5
                bt      ecx,9       ;SSSE3
                rcl     eax,1       ;into bit 4
                bt      ecx,0       ;SSE3
                rcl     eax,1       ;into bit 3
                bt      edx,26      ;SSE2
                rcl     eax,1       ;into bit 2
                bt      edx,25      ;SSE
                rcl     eax,1       ;into bit 1
                bt      edx,23      ;MMX (edx bit 23; ecx bit 0 is SSE3)
                rcl     eax,1       ;into bit 0
                mov     sselevel,eax
            .endif
        .endif
        pop     ebx
    .endif
    ; eax is 0 on every path that did not reach the feature query.
    .if !(eax & SSE_SSE2)
        invoke  GetStdHandle,STD_OUTPUT_HANDLE
        push    eax                 ;stack slot that receives the byte count
        mov     edx,esp
        invoke  WriteFile,eax,addr error,size_m,edx,0
        pop     eax
        invoke  ExitProcess,0
    .endif
    ret

pragma_init Install,4

end

and the modules will then install as needed
Code: [Select]
; strlen module: the plain routine is always assembled; with __SSE__ defined
; the module also carries SSE_strlen, the p_strlen pointer and an Install
; routine that selects between the two.
.code

; strlen(string) - dword-at-a-time scan; length in eax and ecx, edx saved.
strlen  proc uses edx string:ptr byte
mov eax,string
align 4
@@: mov edx,[eax]
add eax,4
lea ecx,[edx-01010101h] ; zero-byte test: (x - 01010101h) and (not x)
not edx
and ecx,edx
and ecx,80808080h ; non-zero only if the dword holds a 00h byte
jz @B
bsf ecx,ecx ; lowest flag bit marks the first zero byte
shr ecx,3 ; bit index -> byte index
sub eax,string
lea eax,[eax+ecx-4]
mov ecx,eax ; the length is also left in ecx
ret
strlen  endp

ifdef __SSE__

public  p_strlen

.data
p_strlen dd strlen ; default target until Install runs

.code
.686
.xmm


; SSE_strlen(string) - 16 bytes per iteration; length in eax and ecx.
SSE_strlen proc string:ptr byte
mov ecx,string
xorps xmm1,xmm1 ; SSE2 - xmm1 = 16 zero bytes to compare against
align 4
@@: movdqu  xmm0,[ecx] ; SSE2 - unaligned load of the next 16 bytes
pcmpeqb xmm0,xmm1 ; SSE2 - 0FFh in every byte that equals zero
pmovmskb eax,xmm0 ; SSE2 - one mask bit per byte
add ecx,16 ; step to the next block before testing the mask
test eax,eax
jz @B
bsf eax,eax ; index of the first zero byte inside the block
sub ecx,string
lea eax,[eax+ecx-16]
mov ecx,eax ; the length is also left in ecx
ret
SSE_strlen endp

; Switch the pointer to the SSE routine when sselevel reports SSE2.
Install:
.if sselevel & SSE_SSE2
    mov p_strlen,SSE_strlen
.endif
ret

pragma_init Install,41 ; register Install with priority 41

endif

END

The routine with priority 4 will then be called before the one with priority 41.

In this way only used functions will be linked in and called by the Initialize() function. The exit() function also use the Initialize() with an _EXIT segment (.map file):

_INIT             INIT         AUTO        00428c04        00000050
_IEND             INIT         AUTO        00428c54        00000000
_EXIT             EXIT         AUTO        00428c54        00000010
_EEND             EXIT         AUTO        00428c64        00000000

Quote
then, i am using a structure with function pointers in it

That will be a bit complicated. I have C files calling strlen and other functions hundreds of times, so the proc name has to be the same. Then you only need to change the header and recompile.

//int strlen(char *);
// strlen is redirected to a function pointer; call sites stay unchanged.
// NOTE(review): without `extern` this line defines the pointer in every
// C file that includes it - confirm that is intended.
int (*p_strlen)(char *);
#define strlen p_strlen


It is also possible to "code it" directly

//int strlen(char *);
// C side: strlen itself is declared as the function pointer.
int (*strlen)(char *);
...
;strlen   proto :dword
; asm side: strlen is declared as a pointer variable instead of a prototype
externdef   strlen:ptr pr1


Code: [Select]
; Sketch: the public name strlen is the pointer variable itself. It starts
; at strlen1 and Install switches it to strlen2 when SSE2 is reported.
.data
strlen  dd strlen1
public  strlen
.code
strlen1:
mov eax,[esp+4] ; argument read straight from the stack
...
ret 4 ; routine removes its one dword argument
strlen2:
mov ecx,[esp+4]
...
ret 4
Install:
.if sselevel & SSE_SSE2
    mov strlen,strlen2
.endif
ret

pragma_init Install,41 ; register Install with priority 41

SSE3 was introduced in April 2005 with the Prescott revision of the Pentium 4 processor.

The simplest way will be to just implement it with an ifdef in the function body. That way I may recompile the old source if needed. I think I will keep the SSE level test though.

nidud

  • Member
  • *****
  • Posts: 2311
    • https://github.com/nidud/asmc
Re: Code location sensitivity of timings
« Reply #27 on: July 24, 2014, 03:45:40 AM »
here is the memcpy test
larger blocks are faster but the tail bytes are a problem
using movsb on the tail is painfully slow

Code: [Select]
;-----------------------------------------------------------------------
; memcpy(dst, src, count) - copy <count> bytes with SSE2 moves.
;
; In:    dst, src = buffers (must not overlap), count = byte count
; Out:   eax = dst
; Saved: esi, edi (USES).  Clobbers: ecx, xmm0, xmm1, flags.
;        The <count> argument slot is rewritten on the alignment path.
;
; count >= 32: whole 32-byte blocks are copied from the top down, then
;              the last 32 bytes are copied once more to cover the
;              remainder (the two regions may overlap).
; count <  32: one case per size class, each copying a head piece and a
;              tail piece that may overlap.
;-----------------------------------------------------------------------
memcpy  proc uses esi edi dst, src, count
        mov     edi,dst
        mov     esi,src
        mov     ecx,count
        and     ecx,-32                 ; ecx = bytes held in whole 32-byte blocks
        jz      tail                    ; count < 32
        test    edi,11B                 ; aligned ?
        jnz     align4
        align   4
lup:    sub     ecx,32
        movdqu  xmm0,[esi+ecx]
        movdqu  xmm1,[esi+ecx+16]
        movdqu  [edi+ecx],xmm0
        movdqu  [edi+ecx+16],xmm1
        jnz     lup                     ; ZF is still the one set by SUB
last32: mov     ecx,count               ; bytes owed, relative to the current esi/edi
        movdqu  xmm0,[esi+ecx-32]
        movdqu  xmm1,[esi+ecx-16]
        movdqu  [edi+ecx-32],xmm0
        movdqu  [edi+ecx-16],xmm1
        align   2
toend:  mov     eax,dst
        ret
        align   4
align4:
        mov     eax,edi                 ; align 16
        neg     eax
        and     eax,1111B               ; eax = 1..15 bytes up to the next boundary
        movdqu  xmm0,[esi]              ; copy 16 byte
        movdqu  [edi],xmm0
        add     edi,eax                 ; set new offset
        add     esi,eax
        mov     ecx,count
        sub     ecx,eax                 ; the skipped head bytes are already copied
        mov     count,ecx               ; keep last32 inside the buffers
        and     ecx,-32
        jnz     lup
        jmp     last32                  ; 17..31 bytes left: the final copy covers them
        align   4
tail:   xor     ecx,count               ; ecx was 0, so ecx = count (0..31)
        jz      toend
        test    ecx,-2
        jz      @1
        test    ecx,-4
        jz      @2
        test    ecx,-8
        jz      @4
        test    ecx,-16
        jz      @8
        movdqu  xmm0,[esi]              ; 16..31 byte
        movdqu  [edi],xmm0              ; |16...|
        movdqu  xmm0,[esi+ecx-16]       ; |...16|
        movdqu  [edi+ecx-16],xmm0
        jmp     toend
        align   4
@8:     movq    xmm0,[esi]              ; 8..15 byte
        movq    [edi],xmm0              ; |8...|
        movq    xmm0,[esi+ecx-8]        ; |...8|
        movq    [edi+ecx-8],xmm0
        jmp     toend
        align   4
@4:     mov     eax,[esi]               ; 4..7 byte
        mov     [edi],eax
        mov     eax,[esi+ecx-4]
        mov     [edi+ecx-4],eax
        jmp     toend
        align   4
@2:     mov     ax,[esi]                ; 2..3 byte: first two ...
        mov     [edi],ax
        mov     al,[esi+ecx-1]          ; ... then the last one, read from its own offset
        mov     [edi+ecx-1],al
        jmp     toend
        align   4
@1:     mov     al,[esi]
        mov     [edi],al
        jmp     toend
memcpy  endp

here is the time
AMD Athlon(tm) II X2 245 Processor (SSE3)
------------------------------------------
-- aligned strings --
225399  cycles -  10 (  0) 0: crt_memcpy
222840  cycles -  10 (145) 1: 16
220338  cycles -  10 (209) 3: 32
222193  cycles -  10 (309) 5: 64
222769  cycles -  10 (186) 2: 16 aligned
219131  cycles -  10 (249) 4: 32 aligned
223450  cycles -  10 (349) 6: 64 aligned
223475  cycles -  10 (145) 7: 64 movsb
-- unaligned strings --
233090  cycles -  10 (  0) 0: crt_memcpy
311348  cycles -  10 (145) 1: 16
302111  cycles -  10 (209) 3: 32
319732  cycles -  10 (309) 5: 64
233073  cycles -  10 (186) 2: 16 aligned
215451  cycles -  10 (249) 4: 32 aligned
224974  cycles -  10 (349) 6: 64 aligned
319863  cycles -  10 (145) 7: 64 movsb
-- short strings 15 --
209267  cycles - 8000 (  0) 0: crt_memcpy
171989  cycles - 8000 (145) 1: 16
146799  cycles - 8000 (209) 3: 32
151210  cycles - 8000 (309) 5: 64
147463  cycles - 8000 (186) 2: 16 aligned
148036  cycles - 8000 (249) 4: 32 aligned
142108  cycles - 8000 (349) 6: 64 aligned
420999  cycles - 8000 (145) 7: 64 movsb


jj2007

  • Member
  • *****
  • Posts: 11783
  • Assembler is fun ;-)
    • MasmBasic
Re: Code location sensitivity of timings
« Reply #28 on: July 24, 2014, 04:32:46 AM »
SSE3 was introduced in April 2005 with the Prescott revision of the Pentium 4 processor.

SSE2 was introduced in November 2000 with the P4 Willamette. In general, it's absolutely sufficient (try your luck, make Instr_() faster with SSE7.8...); in particular, pcmpeqb and pmovmskb are important improvements.
« Last Edit: July 24, 2014, 10:20:50 AM by jj2007 »

nidud

  • Member
  • *****
  • Posts: 2311
    • https://github.com/nidud/asmc
Re: Code location sensitivity of timings
« Reply #29 on: July 25, 2014, 04:29:11 AM »
with regard to memcpy there seems little gain using SSE

Code: [Select]
; memcpy(dst, src, count) - rep movsb copy; returns dst in eax and removes
; its 12 bytes of arguments.
; NOTE(review): the arguments are read relative to esp after two pushes
; ([esp+12] = dst), which only holds if the assembler emits no frame
; prologue for this PROC - confirm the prologue settings in use.
memcpy  proc dst, src, count
push esi
push edi
mov edi,[esp+12] ; dst
mov esi,[esp+16] ; src
mov ecx,[esp+20] ; count
test ecx,-16
jz @F ; fewer than 16 bytes: plain byte copy
mov eax,[esi] ; copy the first four bytes ...
mov [edi],eax
mov eax,edi
neg eax
and eax,11B ; ... then skip 0..3 of them so that edi is dword aligned
add edi,eax
add esi,eax
sub ecx,eax
@@: rep movsb
mov eax,[esp+12] ; return value = dst
pop edi
pop esi
ret 12
memcpy  endp


-- aligned strings --
889418  cycles -  10 (  0) 0: crt_memcpy
891309  cycles -  10 ( 48) 1: movsb
854402  cycles -  10 (182) 2: SSE
-- unaligned strings --
923670  cycles -  10 (  0) 0: crt_memcpy
924396  cycles -  10 ( 48) 1: movsb
881774  cycles -  10 (182) 2: SSE
-- short strings --
805432  cycles - 8000 (  0) 0: crt_memcpy
1306044 cycles - 8000 ( 48) 1: movsb
520039  cycles - 8000 (182) 2: SSE


using MOVSD helps on the short strings
Code: [Select]
; Fragment: replaces the single rep movsb with a dword copy followed by a
; byte copy of the remainder.
add esi,eax
sub ecx,eax ;--
mov eax,ecx ; keep the byte count
shr ecx,2 ; number of whole dwords
rep movsd
and eax,11B ; 0..3 bytes left over
mov ecx,eax ;--
@@: rep movsb

-- short strings --
805432  cycles - 8000 (  0) 0: crt_memcpy
819735  cycles - 8000 ( 62) 1: movsb
520039  cycles - 8000 (182) 2: SSE


conclusion:
- in newer CPU's MOVSB is faster than moving blocks
- in older CPU's MOVSB gets faster with size
- SSE may be faster depending on CPU


String compare

Code: [Select]
;-----------------------------------------------------------------------
; strcmp(dst, src) - compare two zero-terminated strings.
;
; Out:   eax = 0 when equal, -1 when dst sorts before src, 1 when it
;        sorts after (bytes compared as unsigned values).
; Saved: edi, esi (USES).  Clobbers: ecx, flags.
;
; Dwords are compared while they are equal and src holds no zero byte;
; the first dword that fails either test is resolved byte by byte. The
; dword loads may be unaligned and may read up to three bytes past a
; terminator.
;-----------------------------------------------------------------------
strcmp  proc uses edi esi dst, src
        mov     edi,dst
        mov     esi,src
        lea     edi,[edi-4]             ; pre-bias: the loop advances before loading
        lea     esi,[esi-4]
        align   16                      ; align main loop
@@:     lea     edi,[edi+4]
        lea     esi,[esi+4]
        mov     eax,[esi]
        lea     ecx,[eax-01010101H]     ; zero-byte test on the src dword
        not     eax
        and     ecx,eax
        and     ecx,80808080h
        jnz     @F                      ; src ends inside this dword
        not     eax                     ; restore the src dword
        cmp     eax,[edi]
        je      @B
        align   4
@@:     mov     al,[edi]                ; al = dst byte, ah = src byte
        mov     ah,[esi]
        inc     edi
        inc     esi
        cmp     al,ah
        jne     @F                      ; first difference decides, even if one byte is 0
        test    al,al
        jnz     @B
        xor     eax,eax                 ; both strings ended together
        ret
@@:     sbb     al,al                   ; CF from CMP: al = -1 if dst < src, else 0
        sbb     al,-1                   ; -1 stays -1, 0 becomes 1
        movsx   eax,al
        ret
strcmp endp

Code: [Select]
;-----------------------------------------------------------------------
; strcmp(s1, s2) - SSE2 compare of two zero-terminated strings.
;
; Out:   eax = 0 when equal, -1 when s1 sorts before s2, 1 when it
;        sorts after (bytes compared as unsigned values).
; Saved: esi, edi (USES).  Clobbers: ecx, xmm0-xmm2, flags.
;
; Sixteen bytes are compared per iteration. The unaligned loads start at
; the string pointers, so they may read up to 15 bytes past a terminator.
;-----------------------------------------------------------------------
strcmp proc uses esi edi s1, s2
        mov     edi,s1
        mov     esi,s2
        xorps   xmm2,xmm2               ; clear xmm2 for zero test
        align   16
@@:     movdqu  xmm0,[esi]
        movdqu  xmm1,[edi]
        pcmpeqb xmm1,xmm0               ; compare
        pmovmskb eax,xmm1               ; bit set = bytes equal
        pcmpeqb xmm0,xmm2               ; test for zero
        pmovmskb ecx,xmm0               ; bit set = s2 byte is zero
        lea     edi,[edi+16]
        lea     esi,[esi+16]
        not     ax                      ; bit set = bytes differ
        or      ecx,eax
        jz      @B                      ; all 16 equal and non-zero
        bsf     ecx,ecx                 ; first difference or terminator
        mov     al,[edi+ecx-16]
        cmp     al,[esi+ecx-16]
        je      @F                      ; equal here means both are the terminator, al = 0
        sbb     al,al                   ; CF from CMP: al = -1 if s1 < s2, else 0
        sbb     al,-1                   ; -1 stays -1, 0 becomes 1
@@:     movsx   eax,al
        ret
strcmp endp


AMD Athlon(tm) II X2 245 Processor (SSE3)
------------------------------------------
-- large strings: 4096 byte --
822761  cycles - 100 (  0) 0: crt_strcmp
1646116 cycles - 100 ( 40) 1: strcmp
415725  cycles - 100 ( 91) 2: x
107534  cycles - 100 ( 81) 3: SSE
-- small strings: 64 byte --
163135  cycles - 999 (  0) 0: crt_strcmp
306028  cycles - 999 ( 40) 1: strcmp
109462  cycles - 999 ( 91) 2: x
48633   cycles - 999 ( 81) 3: SSE



Code: [Select]
; stricmp(dst, src) - compare two zero-terminated strings, treating bytes
; that differ only in bit 5 (20h) as equal. Result in eax: 0, -1 or 1.
; edi and esi are saved through the USES clause.
; NOTE(review): the 20h fold is applied to every byte, not only to letters,
; so pairs such as '@' and '`' compare equal, and a 00h byte compares equal
; to a space in both the dword and the byte loop.
; NOTE(review): the byte loop returns 0 as soon as the dst byte is zero,
; without looking at the src byte.
stricmp  proc uses edi esi dst, src
mov edi,dst
mov esi,src
lea edi,[edi-4] ; pre-bias: the loop advances before loading
lea esi,[esi-4]
align 16 ; align main loop
@@: lea edi,[edi+4]
lea esi,[esi+4]
mov eax,[esi]
lea ecx,[eax-01010101H] ; zero-byte test on the src dword
not eax
and ecx,eax
and ecx,80808080h
jnz @F ; src ends inside this dword
mov eax,[esi] ; reload: eax was inverted by the test
cmp eax,[edi]
je @B ; identical dwords
mov ecx,[edi]
or eax,20202020h ; set bit 5 in every byte of both dwords
or ecx,20202020h
cmp eax,ecx
je @B ; equal after the fold
align 4
@@: mov al,[edi] ; al = dst byte, ah = src byte
mov ah,[esi]
inc edi
inc esi
test al,al
jz @F ; dst ended: al = 0 is returned
cmp al,ah
je @B
or ax,2020h ; fold both bytes and compare again
cmp al,ah
je @B
sbb al,al ; CF from CMP: al = -1 if below, else 0
sbb al,-1 ; -1 stays -1, 0 becomes 1
@@: movsx eax,al
ret
stricmp endp

Code: [Select]
; stricmp(s1, s2) - SSE2 variant: 16 bytes per iteration, with bit 5 (20h)
; set in every byte of both blocks before they are compared. Result in eax:
; 0, -1 or 1. esi and edi are saved through the USES clause.
; NOTE(review): the fold touches every byte, not only letters, and only s2
; is tested for a terminator, so a 00h in s1 opposite a space in s2 passes
; as equal.
; NOTE(review): the final byte compare uses the unfolded bytes, so the sign
; of the result follows the raw values ('a' against 'B' yields 1), and a
; zero byte in s1 returns 0 whatever s2 holds at that position.
stricmp proc uses esi edi s1, s2
mov edi,s1
mov esi,s2
xorps xmm2,xmm2 ; clear xmm2 for zero test
mov eax,20202020h
movd xmm3,eax
pshufd  xmm3,xmm3,0 ; populate 20h for case
align 16
@@: movdqu  xmm0,[esi]
movdqu  xmm1,[edi]
movdqa  xmm4,xmm0 ; copy of s2 taken before the fold
pcmpeqb xmm4,xmm2 ; test for zero
pmovmskb ecx,xmm4 ; bit set = s2 byte is zero
orps xmm0,xmm3 ; A..Z to a..z
orps xmm1,xmm3
pcmpeqb xmm1,xmm0 ; compare
pmovmskb eax,xmm1 ; bit set = folded bytes equal
lea edi,[edi+16]
lea esi,[esi+16]
not ax ; bit set = folded bytes differ
or ecx,eax
jz @B ; all 16 equal and no terminator in s2
bsf ecx,ecx ; first difference or terminator
mov al,[edi+ecx-16]
test al,al
jz @F ; s1 byte is zero: al = 0 is returned
cmp al,[esi+ecx-16]
sbb al,al ; CF from CMP: al = -1 if below, else 0
sbb al,-1 ; -1 stays -1, 0 becomes 1
@@: movsx eax,al
ret
stricmp endp


-- large strings: 4096 byte --
1647384 cycles - 100 (  0) 0: crt__stricmp
1646490 cycles - 100 ( 72) 1: stricmp
414651  cycles - 100 (119) 2: x
158892  cycles - 100 (107) 3: SSE
-- small strings: 64 byte --
339452  cycles - 999 (  0) 0: crt__stricmp
298264  cycles - 999 ( 72) 1: stricmp
105344  cycles - 999 (119) 2: x
60089   cycles - 999 (107) 3: SSE