Author Topic: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords  (Read 49013 times)

frktons

  • Member
  • ***
  • Posts: 491
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #90 on: November 29, 2012, 11:46:50 AM »
So the complete test for greater, less than or equal should
be something like this:

Code: [Select]
        ; Three-way compare sketch: decide GT / LT / EQ from two packed compares.
        ; NOTE(review): "bit" is not a standard MASM .IF operator - presumably
        ; pseudo-code or a user macro meaning "bit 15 is set"; confirm.
        pcmpgtd   xmm0,xmm1 ; each dword of xmm0 -> all ones where xmm0 > xmm1 (signed), else 0
pmovmskb eax,xmm0 ; ax = top bit of each of the 16 result bytes
        .if bit ax, 15 ; highest dword compared greater
            jmp IsGreater
        .endif

pcmpgtd xmm2,xmm3; same values in reverse order
pmovmskb ebx,xmm2 ; bx = byte mask of the reversed compare
        .if bit bx, 15 ; highest dword compared less
            jmp IsLessThan
        .endif
        ; neither top dword decided it: compare the two masks as numbers
        .if ax == bx
            jmp AreEqual
        .elseif ax > bx
            jmp IsGreater
        .else
            jmp IsLessThan
        .endif

I'll try the code and let you know.

nidud

  • Member
  • *****
  • Posts: 2035
    • https://github.com/nidud/asmc
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #91 on: November 29, 2012, 12:00:35 PM »
Ahh, we don't know if it's equal or less  :redface:
Code: [Select]
; Three-way compare of xmm0 against xmm1, byte-wise.
; Jumps to is_equal / is_great / is_less (labels defined elsewhere).
; Clobbers xmm0, xmm2 and eax.
movdqa xmm2,xmm0 ; keep a copy of xmm0, pcmpeqb overwrites it
pcmpeqb xmm0,xmm1 ; test equal first: byte = FFh where equal
pmovmskb eax,xmm0 ; ax = one bit per byte
cmp ax,-1 ; all 16 bits set -> every byte equal
je is_equal
movdqa xmm0,xmm2 ; restore the original value
pcmpgtb xmm0,xmm1 ; byte = FFh where xmm0 byte > xmm1 byte (signed)
pmovmskb eax,xmm0
test ah,80h ; bit 15 = result for the highest byte only
; NOTE(review): only byte 15 decides "greater"; if byte 15 is equal and a
; lower byte is greater this still falls through to is_less - confirm intent.
jnz is_great
jmp is_less

Edit: movq --> movdqa  ::)

frktons

  • Member
  • ***
  • Posts: 491
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #92 on: November 29, 2012, 12:15:18 PM »
Read again my last posts, yes we can know if the
compare gives GT, LT or EQ.

nidud

  • Member
  • *****
  • Posts: 2035
    • https://github.com/nidud/asmc
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #93 on: December 03, 2012, 08:39:08 AM »
Using LEA in the test code seems not only to give random results; the results also change according to the value of IP on entry to the test. I did some testing on the atol function, which uses LEA to multiply by 10, and I noticed that changing code not related to this function had an effect on the result. This is the test code used:
Code: [Select]
; ATOL.ASM--
; http://www.masm32.com/
;
; Test case for using LEA
;
; make:
; jwasm /c /coff atol.asm
; link /subsystem:console atol.obj
;
.xlist
include \masm32\include\masm32rt.inc
.686
.xmm
include \masm32\macros\timers.asm
.list

MAIN_COUNT = 2
LOOP_COUNT = 3000

atol1 proto string:dword
atol2 proto string:dword
atol3 proto string:dword

.data

align 16
;db 0
v1 db "65636",0
v2 db "2147483647",0

.code
; Program entry: print the CPU identification, then run the three atol
; benchmarks MAIN_COUNT times and wait for a key.
; counter_begin / counter_end come from timers.asm (not shown here);
; presumably they time the enclosed code and leave a cycle count in eax,
; which test_end prints - confirm against timers.asm.
start:
push 1 ; mode 1: ShowCpu prints the brand string
call ShowCpu ; print brand string and SSE level
print "---------------------------------------------------------", 13, 10

mov ecx,MAIN_COUNT ; outer repeat counter, kept on the stack during the tests
main_loop:
push ecx

; test_start: align the code, sleep 100 ms, then open a timed region
test_start macro
align 16
invoke Sleep, 100
counter_begin LOOP_COUNT, HIGH_PRIORITY_CLASS
endm

; test_end: close the timed region and print eax, a tab and the caption
test_end macro text
counter_end
print str$(eax), 9, text, 13, 10
endm

;----------------------------------------------

test_start
mov esi,LOOP_COUNT ; esi = inner call counter
    @@:
invoke atol3,addr v1
invoke atol3,addr v2
dec esi
jnz @b
test_end "cycles for atol LODSB"

;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke atol2,addr v1
invoke atol2,addr v2
dec esi
jnz @b
test_end "cycles for atol SHL"

;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke atol1,addr v1
invoke atol1,addr v2
dec esi
jnz @b
test_end "cycles for atol LEA"

;----------------------------------------------

print "---------------------------------------------------------", 13, 10
pop ecx ; restore the outer repeat counter
dec ecx
jz @F
jmp main_loop
      @@:
inkey chr$(13, 10, "--- ok ---", 13)
exit

; ShowCpu - read the CPUID brand string and derive a simple SSE "level".
; Takes one DWORD argument on the stack (mode); "ret 4" removes it.
; pushad/popad preserve every register except eax, which receives the
; level through the saved-eax slot of the pushad frame.
; The level is the number of feature bits found set among the four tested.
ShowCpu proc ; mode:DWORD
COMMENT @ Usage:
  push 0, call ShowCpu ; simple, no printing, just returns SSE level
  push 1, call ShowCpu ; prints the brand string and returns SSE level@
  pushad
  sub esp, 80 ; create a buffer for the brand string
  mov edi, esp ; point edi to it
  xor ebp, ebp ; ebp = leaf index 0..2
  .Repeat
  lea eax, [ebp+80000002h]
db 0Fh, 0A2h ; cpuid 80000002h-80000004h
stosd ; store eax, ebx, ecx, edx = 16 characters per leaf
mov eax, ebx
stosd
mov eax, ecx
stosd
mov eax, edx
stosd
inc ebp
  .Until ebp>=3
  push 1
  pop eax ; eax = 1
  db 0Fh, 0A2h ; cpuid 1
  xor ebx, ebx ; CpuSSE
  xor esi, esi ; add zero plus the carry flag
  bt edx, 25 ; edx bit 25, SSE1
  adc ebx, esi
  bt edx, 26 ; edx bit 26, SSE2
  adc ebx, esi
  bt ecx, esi ; ecx bit 0, SSE3
  adc ebx, esi
  bt ecx, 9 ; ecx bit 9 is SSSE3, not SSE4 (SSE4.1 = bit 19, SSE4.2 = bit 20), so a printed "SSE4" means SSSE3
  adc ebx, esi
  dec dword ptr [esp+4+32+80] ; dec mode in stack (80 buffer + 32 pushad + 4 return address)
  .if Zero?
mov edi, esp ; restore pointer to brand string
  .Repeat
.Break .if byte ptr [edi]!=32 ; mode was 1, so show a string but skip leading blanks
inc edi
.Until 0
.if byte ptr [edi]<32
print chr$("pre-P4")
.else
print edi ; CpuBrand
.endif
.if ebx
print chr$(32, 40, "SSE") ; info on SSE level, 40=(
print str$(ebx), 41, 13, 10 ; 41=)
.endif
  .endif
  add esp, 80 ; discard brand buffer (after printing!)
  mov [esp+32-4], ebx ; move ebx into eax stack position - returns eax to main for further use
  ifdef MbBufferInit
call MbBufferInit
  endif
  popad
  ret 4
ShowCpu endp

align 16

; atol1 - convert a decimal string to a DWORD, multiplying by 10 with LEA.
; In:  lpSrc = address of the zero-terminated string; one leading "-" is accepted.
; Out: eax = value (negated when the string started with "-").
; Parsing stops at the first character below "0" (this includes the
; terminating zero); characters above "9" are not rejected.
; Uses eax, ecx, edx.
atol1 proc lpSrc:DWORD

    xor eax, eax ; zero EAX
    mov edx, lpSrc
    movzx ecx, BYTE PTR [edx]
    add edx, 1
    cmp ecx, "-" ; test for sign
    jne lbl0
    add eax, 1  ; set EAX if sign
    movzx ecx, BYTE PTR [edx]
    add edx, 1

  lbl0:
    push eax    ; store sign on stack
    xor eax, eax ; so eax*10 will be 0 for first digit
;-----------------------------------
; The conditional nop blocks below only shift the address of the lbl1 loop;
; the cycle counts in the comments are the author's measurements.
; normal: 198057
if 1 ; makes it fast: 186000
nop
endif
; not using align 16:
if 0 ; makes it slow: 294520
nop
nop
nop
nop
nop
nop
nop
 if 0 ; makes it fast: 195044
 nop
 nop
 nop
 nop
 nop
 nop
 endif
endif
;-----------------------------------
  lbl1:
    sub ecx, 48 ; character -> digit value; borrow means below "0"
    jc  lbl2
    lea eax, [eax+eax*4] ; mul eax by 5
    lea eax, [ecx+eax*2] ; mul eax by 2 and add digit value
    movzx ecx, BYTE PTR [edx]   ; get next digit
    add edx, 1
    jmp lbl1

  lbl2:
    pop ecx      ; retrieve sign
    test ecx, ecx
    jnz lbl3
    ret

  lbl3:
    neg eax      ; negative return value if sign set
    ret
atol1 endp

align 4

; atol2 - convert a decimal string to a DWORD, multiplying by 10 with SHL/ADD.
; In:  string = address of the zero-terminated string. Leading spaces are
;      skipped and one leading "+" or "-" is accepted.
; Out: eax = value (negated when the sign character was "-").
; Parsing stops at the first character below "0"; characters above "9"
; are not rejected. Uses eax, ecx, edx; ebx is preserved by "uses".
atol2 proc uses ebx string:dword
mov ebx,string
sub ecx,ecx ; clear ecx so that cl can be added as a full dword later
      @@:
mov cl,[ebx]
inc ebx
cmp cl,' '
je @B ; skip leading spaces
push ecx ; remember the first non-space character (possible sign)
cmp cl,'-'
je @F
cmp cl,'+'
jne atol_set
      @@:
mov cl,[ebx] ; a sign was consumed: fetch the first digit
inc ebx
    atol_set:
    sub eax,eax ; running value = 0
;-----------------------------------
; The nops only shift the address of atol_loop; the numbers are the
; author's measured cycle counts.
; normal: 252233
if 1  ; : 237000
nop
nop
endif
;-----------------------------------
    atol_loop:
    sub cl,'0' ; character -> digit value; borrow means below "0"
jc @F
mov edx,eax
shl eax,3 ; eax*8
add eax,edx
add eax,edx ; + 2*old = old*10
add eax,ecx ; add the digit
mov cl,[ebx]
inc ebx
jmp atol_loop
      @@:
pop edx ; first non-space character
cmp dl,'-'
je atol_neg
    atol_end:
ret
    atol_neg:
neg eax
jmp atol_end
atol2 endp

align 16

; atol3 - decimal string to DWORD, fetching characters with LODSB.
; In:  string = address of the zero-terminated text. Leading spaces are
;      skipped; a single leading "+" or "-" is accepted.
; Out: eax = converted value, negated when the sign character was "-".
; Conversion ends at the first character below "0" (the terminator
; included). Uses eax, ecx, edx; esi is preserved by "uses".
atol3 proc uses esi string:dword
        mov     esi,string
        xor     eax,eax                 ; upper 24 bits stay zero, lodsb only writes al
    a3_skip_blank:
        lodsb
        cmp     al,' '
        je      a3_skip_blank
        push    eax                     ; keep the first non-space character for the sign test
        cmp     al,'-'
        je      a3_after_sign
        cmp     al,'+'
        jne     a3_begin
    a3_after_sign:
        lodsb                           ; sign consumed, load the first digit
    a3_begin:
        xor     ecx,ecx                 ; ecx = running value
    a3_digit:
        sub     al,'0'                  ; borrow -> character below "0", stop
        jc      a3_finish
        mov     edx,ecx
        shl     ecx,3                   ; value*8
        add     ecx,edx
        add     ecx,edx                 ; + value*2 = value*10
        add     ecx,eax                 ; + digit
        lodsb
        jmp     a3_digit
    a3_finish:
        mov     eax,ecx
        pop     edx                     ; first non-space character
        cmp     dl,'-'
        jne     a3_done
        neg     eax
    a3_done:
        ret
atol3 endp

end start

The first result from the test:
Quote
AMD Athlon(tm) II X2 245 Processor (SSE3)
---------------------------------------------------------
312187   cycles for atol LODSB
243102   cycles for atol SHL
294036   cycles for atol LEA
---------------------------------------------------------
312118   cycles for atol LODSB
244057   cycles for atol SHL
294086   cycles for atol LEA
---------------------------------------------------------
312117   cycles for atol LODSB
243131   cycles for atol SHL
294214   cycles for atol LEA
---------------------------------------------------------

I then aligned the code entry of the functions, and tuned the actual loop code to get the best result:
Quote
---------------------------------------------------------
297479   cycles for atol LODSB
237504   cycles for atol SHL
195149   cycles for atol LEA
---------------------------------------------------------
297359   cycles for atol LODSB
237311   cycles for atol SHL
197737   cycles for atol LEA
---------------------------------------------------------

The order of the test code could then have an effect on the result.

nidud

  • Member
  • *****
  • Posts: 2035
    • https://github.com/nidud/asmc
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #94 on: December 03, 2012, 09:37:11 AM »
A test case for memcpy:
Code: [Select]
; MEMCPY.ASM--
; http://www.masm32.com/
;
; make:
; jwasm /c /coff memcpy.asm
; link /subsystem:console memcpy.obj
;
.xlist
include \masm32\include\masm32rt.inc
.686
.xmm
include \masm32\macros\timers.asm
.list

MAIN_COUNT = 2
LOOP_COUNT = 1000

memcpy proto :ptr byte, :ptr byte, :dword
memcpyxmm1 proto :ptr byte, :ptr byte, :dword
memcpyxmm2 proto :ptr byte, :ptr byte, :dword
memcpyxmm3 proto :ptr byte, :ptr byte, :dword

.data

align 16
b1 db 4096 dup(?)
b2 db 4096 dup(?)
db 1
b3 db 4096 dup(?)
b4 db 4096 dup(?)

.code
; Program entry: print the CPU identification, then time each copy routine
; MAIN_COUNT times. The "A" tests copy between b1/b2 (declared after
; "align 16"), the "U" tests between b3/b4 (shifted by the single "db 1").
; counter_begin / counter_end come from timers.asm (not shown here);
; presumably they time the enclosed code and leave a cycle count in eax,
; which test_end prints - confirm against timers.asm.
start:
push 1 ; mode 1: ShowCpu prints the brand string
call ShowCpu ; print brand string and SSE level
print "---------------------------------------------------------", 13, 10

mov ecx,MAIN_COUNT ; outer repeat counter, kept on the stack during the tests
main_loop:
push ecx

; test_start: sleep 100 ms, then open a timed region
test_start macro
invoke Sleep, 100
counter_begin LOOP_COUNT, HIGH_PRIORITY_CLASS
endm

; test_end: close the timed region and print eax, a tab and the caption
test_end macro text
counter_end
print str$(eax), 9, text, 13, 10
endm

;----------------------------------------------
if 1
test_start
mov esi,LOOP_COUNT ; esi = inner call counter
    @@:
invoke memcpy,addr b1,addr b2,4096
dec esi
jnz @b
test_end "cycles for memcpy A"
endif
;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke memcpyxmm1,addr b1,addr b2,4096
dec esi
jnz @b
test_end "cycles for memcpy movdqa xmm0 A"

;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke memcpyxmm2,addr b1,addr b2,4096
dec esi
jnz @b
test_end "cycles for memcpy movdqu xmm0 A"

;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke memcpyxmm3,addr b1,addr b2,4096
dec esi
jnz @b
test_end "cycles for memcpy movdqu xmm0..xmm7 A"

;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke memcpyxmm2,addr b3,addr b4,4096
dec esi
jnz @b
test_end "cycles for memcpy movdqu xmm0 U"

;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke memcpyxmm3,addr b3,addr b4,4096
dec esi
jnz @b
test_end "cycles for memcpy movdqu xmm0..xmm7 U"

;----------------------------------------------

print "---------------------------------------------------------", 13, 10
pop ecx ; restore the outer repeat counter
dec ecx
jz @F
jmp main_loop
      @@:
inkey chr$(13, 10, "--- ok ---", 13)
exit

align 16
; memcpy - plain byte copy with REP MOVSB.
; In:  s1 = destination, s2 = source, count = number of bytes.
; Out: eax = s1.
; Uses ecx; esi and edi are preserved by "uses".
; Assumes the direction flag is clear on entry (it is not set here).
memcpy proc uses esi edi s1:ptr byte, s2:ptr byte, count:dword
        mov     ecx,count               ; repeat count for REP
        mov     esi,s2                  ; source pointer
        mov     eax,s1                  ; destination doubles as the return value
        mov     edi,eax
        rep     movsb
        ret
memcpy endp

align 16
; memcpyxmm1 - copy with MOVDQA through xmm0, 128 bytes per iteration.
; In:  s1 = destination, s2 = source, count = number of bytes.
;      Both pointers must be 16-byte aligned (MOVDQA faults otherwise).
; Out: eax = s1.
; Uses ecx, xmm0; esi and edi are preserved by "uses".
; Whole 128-byte blocks go through xmm0; the remaining count mod 128 bytes
; are copied with REP MOVSB. A count below 128 skips the block loop
; entirely (the unguarded dec/jnz loop would otherwise wrap around and
; run 2^32 times).
memcpyxmm1 proc uses esi edi s1:ptr byte, s2:ptr byte, count:dword
        mov     edi,s1
        mov     esi,s2
        mov     ecx,count               ; ecx keeps the byte count for the tail
        mov     eax,ecx
        shr     eax,7                   ; eax = number of whole 128-byte blocks
        jz      memcpyxmm1_tail         ; nothing to do in the block loop
      @@:
        movdqa  xmm0,[esi]
        movdqa  [edi],xmm0
        movdqa  xmm0,[esi+16]
        movdqa  [edi+16],xmm0
        movdqa  xmm0,[esi+32]
        movdqa  [edi+32],xmm0
        movdqa  xmm0,[esi+48]
        movdqa  [edi+48],xmm0
        movdqa  xmm0,[esi+64]
        movdqa  [edi+64],xmm0
        movdqa  xmm0,[esi+80]
        movdqa  [edi+80],xmm0
        movdqa  xmm0,[esi+96]
        movdqa  [edi+96],xmm0
        movdqa  xmm0,[esi+112]
        movdqa  [edi+112],xmm0
        add     esi,128
        add     edi,128
        dec     eax
        jnz     @B
    memcpyxmm1_tail:
        and     ecx,7Fh                 ; bytes left after the last whole block
        rep     movsb
        mov     eax,s1                  ; return the destination
        ret
memcpyxmm1 endp

align 16
; memcpyxmm2 - copy with MOVDQU through xmm0, 128 bytes per iteration,
; then finish the remaining count mod 128 bytes with REP MOVSB.
; In:  s1 = destination, s2 = source, count = number of bytes.
; Out: eax = s1.
; Uses ecx, xmm0; esi and edi are preserved by "uses".
memcpyxmm2 proc uses esi edi s1:ptr byte, s2:ptr byte, count:dword
        mov     edi,s1
        mov     esi,s2
        mov     ecx,count               ; full byte count, needed again for the tail
        mov     eax,ecx
        shr     eax,7                   ; eax = whole 128-byte blocks
        jz      mc2_rest
    mc2_block:
        ; eight load/store pairs generated at assembly time
        mc2_off = 0
        repeat 8
        movdqu  xmm0,[esi+mc2_off]
        movdqu  [edi+mc2_off],xmm0
        mc2_off = mc2_off + 16
        endm
        add     esi,128
        add     edi,128
        dec     eax
        jnz     mc2_block
    mc2_rest:
        and     ecx,7Fh                 ; leftover bytes
        rep     movsb
        mov     eax,s1
        ret
memcpyxmm2 endp

align 16
; memcpyxmm3 - copy with MOVDQU using xmm0..xmm7: eight loads, then eight
; stores, 128 bytes per iteration.
; In:  s1 = destination, s2 = source, count = number of bytes.
; Out: eax = s1.
; Uses ecx, xmm0-xmm7; esi and edi are preserved by "uses".
; The remaining count mod 128 bytes are copied with REP MOVSB, and a
; count below 128 skips the block loop (the unguarded dec/jnz loop would
; otherwise wrap around and run 2^32 times).
memcpyxmm3 proc uses esi edi s1:ptr byte, s2:ptr byte, count:dword
        mov     edi,s1
        mov     esi,s2
        mov     ecx,count               ; ecx keeps the byte count for the tail
        mov     eax,ecx
        shr     eax,7                   ; eax = number of whole 128-byte blocks
        jz      memcpyxmm3_tail
      @@:
        movdqu  xmm0,[esi]
        movdqu  xmm1,[esi+16]
        movdqu  xmm2,[esi+32]
        movdqu  xmm3,[esi+48]
        movdqu  xmm4,[esi+64]
        movdqu  xmm5,[esi+80]
        movdqu  xmm6,[esi+96]
        movdqu  xmm7,[esi+112]
        movdqu  [edi],xmm0
        movdqu  [edi+16],xmm1
        movdqu  [edi+32],xmm2
        movdqu  [edi+48],xmm3
        movdqu  [edi+64],xmm4
        movdqu  [edi+80],xmm5
        movdqu  [edi+96],xmm6
        movdqu  [edi+112],xmm7
        add     esi,128
        add     edi,128
        dec     eax
        jnz     @B
    memcpyxmm3_tail:
        and     ecx,7Fh                 ; bytes left after the last whole block
        rep     movsb
        mov     eax,s1                  ; return the destination
        ret
memcpyxmm3 endp

; ShowCpu - read the CPUID brand string and derive a simple SSE "level".
; Takes one DWORD argument on the stack (mode); "ret 4" removes it.
; pushad/popad preserve every register except eax, which receives the
; level through the saved-eax slot of the pushad frame.
; The level is the number of feature bits found set among the four tested.
ShowCpu proc ; mode:DWORD
COMMENT @ Usage:
  push 0, call ShowCpu ; simple, no printing, just returns SSE level
  push 1, call ShowCpu ; prints the brand string and returns SSE level@
  pushad
  sub esp, 80 ; create a buffer for the brand string
  mov edi, esp ; point edi to it
  xor ebp, ebp ; ebp = leaf index 0..2
  .Repeat
  lea eax, [ebp+80000002h]
db 0Fh, 0A2h ; cpuid 80000002h-80000004h
stosd ; store eax, ebx, ecx, edx = 16 characters per leaf
mov eax, ebx
stosd
mov eax, ecx
stosd
mov eax, edx
stosd
inc ebp
  .Until ebp>=3
  push 1
  pop eax ; eax = 1
  db 0Fh, 0A2h ; cpuid 1
  xor ebx, ebx ; CpuSSE
  xor esi, esi ; add zero plus the carry flag
  bt edx, 25 ; edx bit 25, SSE1
  adc ebx, esi
  bt edx, 26 ; edx bit 26, SSE2
  adc ebx, esi
  bt ecx, esi ; ecx bit 0, SSE3
  adc ebx, esi
  bt ecx, 9 ; ecx bit 9 is SSSE3, not SSE4 (SSE4.1 = bit 19, SSE4.2 = bit 20), so a printed "SSE4" means SSSE3
  adc ebx, esi
  dec dword ptr [esp+4+32+80] ; dec mode in stack (80 buffer + 32 pushad + 4 return address)
  .if Zero?
mov edi, esp ; restore pointer to brand string
  .Repeat
.Break .if byte ptr [edi]!=32 ; mode was 1, so show a string but skip leading blanks
inc edi
.Until 0
.if byte ptr [edi]<32
print chr$("pre-P4")
.else
print edi ; CpuBrand
.endif
.if ebx
print chr$(32, 40, "SSE") ; info on SSE level, 40=(
print str$(ebx), 41, 13, 10 ; 41=)
.endif
  .endif
  add esp, 80 ; discard brand buffer (after printing!)
  mov [esp+32-4], ebx ; move ebx into eax stack position - returns eax to main for further use
  ifdef MbBufferInit
call MbBufferInit
  endif
  popad
  ret 4
ShowCpu endp

end start

result:
Quote
AMD Athlon(tm) II X2 245 Processor (SSE3)
---------------------------------------------------------
609815   cycles for memcpy A
608249   cycles for memcpy movdqa xmm0 A
579394   cycles for memcpy movdqu xmm0 A
547453   cycles for memcpy movdqu xmm0..xmm7 A
1175825   cycles for memcpy movdqu xmm0 U
1011253   cycles for memcpy movdqu xmm0..xmm7 U
---------------------------------------------------------
610739   cycles for memcpy A
605121   cycles for memcpy movdqa xmm0 A
580058   cycles for memcpy movdqu xmm0 A
541764   cycles for memcpy movdqu xmm0..xmm7 A
1173293   cycles for memcpy movdqu xmm0 U
1010530   cycles for memcpy movdqu xmm0..xmm7 U
---------------------------------------------------------

Intel(R) Core(TM) i3 CPU    540  @ 3.07GHz (SSE4)
---------------------------------------------------------
449020  cycles for memcpy A
343031  cycles for memcpy movdqa xmm0 A
274136  cycles for memcpy movdqu xmm0 A
270389  cycles for memcpy movdqu xmm0..xmm7 A
481695  cycles for memcpy movdqu xmm0 U
484069  cycles for memcpy movdqu xmm0..xmm7 U
---------------------------------------------------------
417787  cycles for memcpy A
271078  cycles for memcpy movdqa xmm0 A
322979  cycles for memcpy movdqu xmm0 A
270214  cycles for memcpy movdqu xmm0..xmm7 A
427182  cycles for memcpy movdqu xmm0 U
420321  cycles for memcpy movdqu xmm0..xmm7 U
---------------------------------------------------------

nidud

  • Member
  • *****
  • Posts: 2035
    • https://github.com/nidud/asmc
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #95 on: December 04, 2012, 05:06:14 AM »
Most programs these days use some kind of compression (sound, video, images), and hacking the memcpy() function generates problems in this case. Compressed data is often expanded using memcpy(data+size, data, count). If data is 'abcd', size is 1, and count is 4, the expected output is 'aaaaa'. Using movsd to improve speed is then the hack which generates this problem.

I think it would be better to create different versions of memcpy[w|d|q], since the user usually knows the type of data he is copying. Making one version that handles all cases is complicated, and all the test code needed makes it rather big and in some cases also slower.

In the test I made above I was speculating whether movdqu was faster than movdqa on aligned data, which seems a bit odd. I rewrote the test code, but the result is still random. The test (at least in this case) shows how little gain there is in using movsd to improve speed. Using SSE to copy aligned data has some benefits, but it is not a huge improvement.
Code: [Select]
; MEMCPY2.ASM--
; http://www.masm32.com/
;
; make:
; jwasm /coff memcpy2.asm
; link /subsystem:console memcpy2.obj
;
.xlist
include \masm32\include\masm32rt.inc
.686
.xmm
include \masm32\macros\timers.asm
.list

MAIN_COUNT = 2
LOOP_COUNT = 100
MAXMEMORY  = 40000h

memcpy proto :ptr byte, :ptr byte, :dword
memcpyd proto :ptr byte, :ptr byte, :dword
memcpyxmmA proto :ptr byte, :ptr byte, :dword
memcpyxmmU proto :ptr byte, :ptr byte, :dword

.data
a1 dd ?
m1 dd ?
u1 dd ?

.code
; Program entry, part 1: allocate the work buffer and derive the two
; test pointers.
;   a1 = pointer returned by GlobalAlloc (released with GlobalFree later)
;   m1 = first 128-byte aligned address inside the block
;   u1 = a1 + 1, an odd and therefore unaligned address
; The block is MAXMEMORY+128 bytes, so MAXMEMORY bytes fit behind either
; pointer. m1 is rounded UP: rounding down (a plain AND) could place it
; up to 127 bytes in front of the allocated block.
start:
        invoke  GlobalAlloc,GMEM_FIXED,MAXMEMORY+128
        mov     a1,eax
        test    eax,eax
        jnz     @F
        exit                            ; allocation failed
      @@:
        mov     edx,eax
        add     eax,128-1               ; round up to the next 128-byte boundary ...
        and     eax,not (128-1)         ; ... which stays inside the 128 spare bytes
        mov     m1,eax
        inc     edx
        mov     u1,edx

        push    1                       ; mode 1: ShowCpu prints the brand string
        call    ShowCpu                 ; print brand string and SSE level
        print "---------------------------------------------------------", 13, 10

; Benchmark loop: time each copy routine MAIN_COUNT times, then free the
; buffer and wait for a key. Every test copies MAXMEMORY bytes with the
; same pointer as source and destination (m1 for the "A" tests, u1 for
; the "U" test).
; counter_begin / counter_end come from timers.asm (not shown here);
; presumably they time the enclosed code and leave a cycle count in eax,
; which test_end prints - confirm against timers.asm.
mov ecx,MAIN_COUNT ; outer repeat counter, kept on the stack during the tests
main_loop:
push ecx

; test_start: sleep 100 ms, then open a timed region
test_start macro
invoke Sleep, 100
counter_begin LOOP_COUNT, HIGH_PRIORITY_CLASS
endm

; test_end: close the timed region and print eax, a tab and the caption
test_end macro text
counter_end
print str$(eax), 9, text, 13, 10
endm

;----------------------------------------------

test_start
mov esi,LOOP_COUNT ; esi = inner call counter
    @@:
invoke crt_memcpy,m1,m1,MAXMEMORY
dec esi
jnz @b
test_end "cycles for crt_memcpy A"

;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke memcpy,m1,m1,MAXMEMORY
dec esi
jnz @b
test_end "cycles for memcpy A"

;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke memcpyd,m1,m1,MAXMEMORY
dec esi
jnz @b
test_end "cycles for memcpyd A"

;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke memcpyxmmA,m1,m1,MAXMEMORY
dec esi
jnz @b
test_end "cycles for memcpy movdqa A"

;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke memcpyxmmU,m1,m1,MAXMEMORY
dec esi
jnz @b
test_end "cycles for memcpy movdqu A"

;----------------------------------------------

test_start
mov esi,LOOP_COUNT
    @@:
invoke memcpyxmmU,u1,u1,MAXMEMORY
dec esi
jnz @b
test_end "cycles for memcpy movdqu U"

;----------------------------------------------

print "---------------------------------------------------------", 13, 10
pop ecx ; restore the outer repeat counter
dec ecx
jz @F
jmp main_loop
      @@:
      invoke GlobalFree,a1 ; release the buffer obtained at start
inkey chr$(13, 10, "--- ok ---", 13)
exit

align 16

; memcpy - plain byte copy with REP MOVSB.
; In:  s1 = destination, s2 = source, count = number of bytes.
; Out: eax = s1.
; Uses ecx; esi and edi are preserved by "uses".
; Assumes the direction flag is clear on entry (it is not set here).
memcpy proc uses esi edi s1:ptr byte, s2:ptr byte, count:dword
        mov     ecx,count               ; repeat count for REP
        mov     esi,s2                  ; source pointer
        mov     eax,s1                  ; destination doubles as the return value
        mov     edi,eax
        rep     movsb
        ret
memcpy endp

align 16

; memcpyd - copy with REP MOVSD, four bytes at a time.
; In:  s1 = destination, s2 = source, count = number of bytes.
; Out: eax = s1.
; Uses ecx; esi and edi are preserved by "uses".
; The count mod 4 bytes that do not fill a dword are copied with
; REP MOVSB, so a count that is not a multiple of 4 no longer loses
; its last bytes.
; Assumes the direction flag is clear on entry (it is not set here).
memcpyd proc uses esi edi s1:ptr byte, s2:ptr byte, count:dword
        mov     edi,s1
        mov     esi,s2
        mov     eax,edi                 ; return value = destination
        mov     ecx,count
        shr     ecx,2                   ; whole dwords
        rep     movsd
        mov     ecx,count
        and     ecx,3                   ; 0..3 trailing bytes
        rep     movsb
        ret
memcpyd endp

align 16

; memcpyxmmA - copy with MOVDQA using xmm0..xmm7, 128 bytes per iteration.
; In:  s1 = destination, s2 = source, count = number of bytes.
;      Both pointers must be 16-byte aligned (MOVDQA faults otherwise).
; Out: eax = s1.
; Uses ecx, edx, xmm0-xmm7; ebx is preserved by "uses", esi and edi are
; borrowed for the tail copy and restored.
; eax runs as "127 - bytes remaining": it is below zero (as unsigned: it
; borrows) while at least one whole block is left, and the ADD in the
; loop carries when the last whole block is done.
; A count below 128 (0 included) now skips the block loop instead of
; running away, and the remaining count mod 128 bytes are copied with
; REP MOVSB.
memcpyxmmA proc uses ebx s1:ptr byte, s2:ptr byte, count:dword
        mov     edx,s1
        mov     ebx,s2
        mov     eax,127
        sub     eax,count               ; eax = 127 - count; no borrow <=> count < 128
        jnc     memcpyxmmA_tail
        align   16
      @@:
        movdqa  xmm0,[ebx]
        movdqa  xmm1,[ebx+16]
        movdqa  xmm2,[ebx+32]
        movdqa  xmm3,[ebx+48]
        movdqa  xmm4,[ebx+64]
        movdqa  xmm5,[ebx+80]
        movdqa  xmm6,[ebx+96]
        movdqa  xmm7,[ebx+112]
        movdqa  [edx],xmm0
        movdqa  [edx+16],xmm1
        movdqa  [edx+32],xmm2
        movdqa  [edx+48],xmm3
        movdqa  [edx+64],xmm4
        movdqa  [edx+80],xmm5
        movdqa  [edx+96],xmm6
        movdqa  [edx+112],xmm7
        add     ebx,128
        add     edx,128
        add     eax,128                 ; carry <=> fewer than 128 bytes remain
        jnc     @B
    memcpyxmmA_tail:
        sub     eax,127                 ; eax = -(bytes remaining), 0..127 left
        jz      memcpyxmmA_end
        neg     eax
        mov     ecx,eax
        xchg    esi,ebx                 ; esi = source, ebx = caller's esi
        xchg    edi,edx                 ; edi = destination, edx = caller's edi
        rep     movsb
        mov     esi,ebx                 ; give esi/edi back to the caller
        mov     edi,edx
    memcpyxmmA_end:
        mov     eax,s1
        ret
memcpyxmmA endp

align 16

; memcpyxmmU - copy with MOVDQU: 128-byte blocks through xmm0..xmm7, then
; 16-byte pieces through xmm0, then the last 0..15 bytes with REP MOVSB.
; In:  s1 = destination, s2 = source, count = number of bytes.
; Out: eax = s1.
; Uses ecx, edx, xmm0-xmm7; ebx is preserved by "uses", esi and edi are
; borrowed for the byte tail and restored.
; eax runs as "127 - bytes remaining" in the block stage and as
; "15 - bytes remaining" in the 16-byte stage; the ADD in each loop
; carries when that stage is finished.
; The entry test uses the borrow of "127 - count", which also sends
; count = 0 past the block loop (NEG/ADD 127 set neither CF nor ZF for
; count = 0, so the block loop used to run away).
memcpyxmmU proc uses ebx s1:ptr byte, s2:ptr byte, count:dword
        mov     edx,s1
        mov     ebx,s2
        mov     eax,127
        sub     eax,count               ; eax = 127 - count; no borrow <=> count < 128
        jnc     memcpyxmmU_16
        align   16
      @@:
        movdqu  xmm0,[ebx]
        movdqu  xmm1,[ebx+16]
        movdqu  xmm2,[ebx+32]
        movdqu  xmm3,[ebx+48]
        movdqu  xmm4,[ebx+64]
        movdqu  xmm5,[ebx+80]
        movdqu  xmm6,[ebx+96]
        movdqu  xmm7,[ebx+112]
        movdqu  [edx],xmm0
        movdqu  [edx+16],xmm1
        movdqu  [edx+32],xmm2
        movdqu  [edx+48],xmm3
        movdqu  [edx+64],xmm4
        movdqu  [edx+80],xmm5
        movdqu  [edx+96],xmm6
        movdqu  [edx+112],xmm7
        add     ebx,128
        add     edx,128
        add     eax,128                 ; carry <=> fewer than 128 bytes remain
        jnc     @B
    memcpyxmmU_16:
        sub     eax,127-15              ; eax = 15 - bytes remaining
        jns     memcpyxmmU_tail         ; 15 or fewer left: skip the 16-byte loop
      @@:
        movdqu  xmm0,[ebx]
        movdqu  [edx],xmm0
        add     ebx,16
        add     edx,16
        add     eax,16                  ; carry <=> fewer than 16 bytes remain
        jnc     @B
    memcpyxmmU_tail:
        sub     eax,15                  ; eax = -(bytes remaining), 0..15 left
        jz      memcpyxmmU_end
        neg     eax
        mov     ecx,eax
        xchg    esi,ebx                 ; esi = source, ebx = caller's esi
        xchg    edi,edx                 ; edi = destination, edx = caller's edi
        rep     movsb
        mov     esi,ebx                 ; give esi/edi back to the caller
        mov     edi,edx
    memcpyxmmU_end:
        mov     eax,s1
        ret
memcpyxmmU endp

; ShowCpu - read the CPUID brand string and derive a simple SSE "level".
; Takes one DWORD argument on the stack (mode); "ret 4" removes it.
; pushad/popad preserve every register except eax, which receives the
; level through the saved-eax slot of the pushad frame.
; The level is the number of feature bits found set among the four tested.
ShowCpu proc ; mode:DWORD
COMMENT @ Usage:
  push 0, call ShowCpu ; simple, no printing, just returns SSE level
  push 1, call ShowCpu ; prints the brand string and returns SSE level@
  pushad
  sub esp, 80 ; create a buffer for the brand string
  mov edi, esp ; point edi to it
  xor ebp, ebp ; ebp = leaf index 0..2
  .Repeat
  lea eax, [ebp+80000002h]
db 0Fh, 0A2h ; cpuid 80000002h-80000004h
stosd ; store eax, ebx, ecx, edx = 16 characters per leaf
mov eax, ebx
stosd
mov eax, ecx
stosd
mov eax, edx
stosd
inc ebp
  .Until ebp>=3
  push 1
  pop eax ; eax = 1
  db 0Fh, 0A2h ; cpuid 1
  xor ebx, ebx ; CpuSSE
  xor esi, esi ; add zero plus the carry flag
  bt edx, 25 ; edx bit 25, SSE1
  adc ebx, esi
  bt edx, 26 ; edx bit 26, SSE2
  adc ebx, esi
  bt ecx, esi ; ecx bit 0, SSE3
  adc ebx, esi
  bt ecx, 9 ; ecx bit 9 is SSSE3, not SSE4 (SSE4.1 = bit 19, SSE4.2 = bit 20), so a printed "SSE4" means SSSE3
  adc ebx, esi
  dec dword ptr [esp+4+32+80] ; dec mode in stack (80 buffer + 32 pushad + 4 return address)
  .if Zero?
mov edi, esp ; restore pointer to brand string
  .Repeat
.Break .if byte ptr [edi]!=32 ; mode was 1, so show a string but skip leading blanks
inc edi
.Until 0
.if byte ptr [edi]<32
print chr$("pre-P4")
.else
print edi ; CpuBrand
.endif
.if ebx
print chr$(32, 40, "SSE") ; info on SSE level, 40=(
print str$(ebx), 41, 13, 10 ; 41=)
.endif
  .endif
  add esp, 80 ; discard brand buffer (after printing!)
  mov [esp+32-4], ebx ; move ebx into eax stack position - returns eax to main for further use
  ifdef MbBufferInit
call MbBufferInit
  endif
  popad
  ret 4
ShowCpu endp

end start

Quote
AMD Athlon(tm) II X2 245 Processor (SSE3)
---------------------------------------------------------
5436621   cycles for crt_memcpy A
5451494   cycles for memcpy A
5430749   cycles for memcpyd A
5130181   cycles for memcpy movdqa A
5137260   cycles for memcpy movdqu A
9398746   cycles for memcpy movdqu U
---------------------------------------------------------
5424911   cycles for crt_memcpy A
5429803   cycles for memcpy A
5424371   cycles for memcpyd A
5147542   cycles for memcpy movdqa A
5139047   cycles for memcpy movdqu A
9419693   cycles for memcpy movdqu U
---------------------------------------------------------
Intel(R) Core(TM) i3 CPU    540  @ 3.07GHz (SSE4)
---------------------------------------------------------
3768758 cycles for crt_memcpy A
3601358 cycles for memcpy A
3611729 cycles for memcpyd A
3665437 cycles for memcpy movdqa A
3527944 cycles for memcpy movdqu A
4053850 cycles for memcpy movdqu U
---------------------------------------------------------
3910008 cycles for crt_memcpy A
3616456 cycles for memcpy A
3675379 cycles for memcpyd A
4250390 cycles for memcpy movdqa A
3348694 cycles for memcpy movdqu A
4051784 cycles for memcpy movdqu U
---------------------------------------------------------

frktons

  • Member
  • ***
  • Posts: 491
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #96 on: December 04, 2012, 05:11:03 AM »
my test for atol:

Code: [Select]
Intel(R) Core(TM)2 CPU          6600  @ 2.40GHz (SSE4)
--------------------------------------------------------
429117  cycles for atol LODSB
350976  cycles for atol SHL
411231  cycles for atol LEA
--------------------------------------------------------
430242  cycles for atol LODSB
509282  cycles for atol SHL
395102  cycles for atol LEA
--------------------------------------------------------

well they are a bit random, as you said.

frktons

  • Member
  • ***
  • Posts: 491
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #97 on: December 04, 2012, 05:20:42 AM »
Memcpy:

Code: [Select]
Intel(R) Core(TM)2 CPU          6600  @ 2.40GHz (SSE4)
--------------------------------------------------------
757223  cycles for memcpy A
288975  cycles for memcpy movdqa xmm0 A
1352024 cycles for memcpy movdqu xmm0 A
1367569 cycles for memcpy movdqu xmm0..xmm7 A
5668726 cycles for memcpy movdqu xmm0 U
4563076 cycles for memcpy movdqu xmm0..xmm7 U
--------------------------------------------------------
749649  cycles for memcpy A
302916  cycles for memcpy movdqa xmm0 A
1737163 cycles for memcpy movdqu xmm0 A
1841807 cycles for memcpy movdqu xmm0..xmm7 A
6136384 cycles for memcpy movdqu xmm0 U
4055501 cycles for memcpy movdqu xmm0..xmm7 U
--------------------------------------------------------

and the last routines:

Code: [Select]
Intel(R) Core(TM)2 CPU          6600  @ 2.40GHz (SSE4)
--------------------------------------------------------
5825483 cycles for crt_memcpy A
7352188 cycles for memcpy A
7269901 cycles for memcpyd A
4146083 cycles for memcpy movdqa A
8700368 cycles for memcpy movdqu A
27212513        cycles for memcpy movdqu U
--------------------------------------------------------
6180618 cycles for crt_memcpy A
9100718 cycles for memcpy A
7028934 cycles for memcpyd A
4151090 cycles for memcpy movdqa A
11923657        cycles for memcpy movdqu A
29665081        cycles for memcpy movdqu U
--------------------------------------------------------


frktons

  • Member
  • ***
  • Posts: 491
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #98 on: December 04, 2012, 05:35:31 AM »
You could find interesting the test we did a couple of years ago:



« Last Edit: December 04, 2012, 07:13:09 AM by frktons »

nidud

  • Member
  • *****
  • Posts: 2035
    • https://github.com/nidud/asmc
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #99 on: December 04, 2012, 06:04:11 AM »
Strange results..

The unaligned test is somewhat understandable, and there is some consistency in the movdqa function, but the large time gap between the movdqu results seems odd.

jj2007

  • Member
  • *****
  • Posts: 11137
  • Assembler is fun ;-)
    • MasmBasic
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #100 on: December 04, 2012, 06:25:32 AM »
memcpy?? Looks familiar :biggrin:

nidud

  • Member
  • *****
  • Posts: 2035
    • https://github.com/nidud/asmc
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #101 on: December 04, 2012, 06:48:06 AM »
memcpy?? Looks familiar :biggrin:

 :P

Quote
AMD Athlon(tm) II X2 245 Processor (SSE3)
8.972.644   cycles for RtlZeroMemory
10.275.565   cycles for FrkTons
8.971.074   cycles for rep stosd
9.041.781   cycles for movdqa
9.046.257   cycles for movaps
9.074.924   cycles for FrkTons New
8.937.746   cycles for movups
9.068.905   cycles for movupd
6.556.637   cycles for MOVNTDQ

8.967.443   cycles for RtlZeroMemory
10.272.111   cycles for FrkTons
8.964.736   cycles for rep stosd
9.044.940   cycles for movdqa
9.044.951   cycles for movaps
9.077.441   cycles for FrkTons New
8.940.185   cycles for movups
9.072.967   cycles for movupd
6.562.758   cycles for MOVNTDQ
---------------------------------------------
Intel(R) Core(TM) i3 CPU         540  @ 3.07GHz (SSE4)
6.065.640       cycles for RtlZeroMemory
8.193.809       cycles for FrkTons
5.992.196       cycles for rep stosd
8.180.437       cycles for movdqa
8.180.221       cycles for movaps
8.159.784       cycles for FrkTons New
8.147.703       cycles for movups
8.157.409       cycles for movupd
7.240.756       cycles for MOVNTDQ

6.016.674       cycles for RtlZeroMemory
8.220.004       cycles for FrkTons
5.995.494       cycles for rep stosd
8.172.316       cycles for movdqa
8.172.054       cycles for movaps
8.167.484       cycles for FrkTons New
8.286.491       cycles for movups
8.270.679       cycles for movupd
7.305.659       cycles for MOVNTDQ

We either all use the same CPU or stick to the basic then  :lol:


frktons

  • Member
  • ***
  • Posts: 491
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #102 on: December 04, 2012, 07:18:43 AM »

We either all use the same CPU or stick to the basic then  :lol:


Optimization is quite a strange beast indeed, it comes and goes
depending on many [maybe too many] factors.
Nevertheless we can try and find something that we didn't expect :lol:

jj2007

  • Member
  • *****
  • Posts: 11137
  • Assembler is fun ;-)
    • MasmBasic
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #103 on: December 04, 2012, 07:36:25 AM »
Intel(R) Core(TM) i3 CPU         540  @ 3.07GHz (SSE4)
6.065.640       cycles for RtlZeroMemory
5.992.196       cycles for rep stosd
7.240.756       cycles for MOVNTDQ

It seems Intel is still working on rep stosd..!

GabrielRavier

  • Regular Member
  • *
  • Posts: 18
Re: MASM FOR FUN - REBORN - #0 Extract low order bytes from dwords
« Reply #104 on: October 27, 2018, 12:47:53 PM »
I tried it for fun (haven't even read messages here though so this is prolly bad code) :