; FillArray - store the binary values 0..999 into the 1000 dwords at MyArray.
; Takes no arguments. EBX, ESI and EDI are preserved by USES; EAX, ECX and
; EDX are overwritten. MyArray should be 4-aligned.
; Four consecutive values are kept in EAX/EBX/EDX/ESI and written per pass.
FillArray PROC USES EBX ESI EDI
        mov     ecx,1000/4              ; passes: four dwords per pass
        mov     edi,offset MyArray      ; write pointer (or whatever you name it)
        mov     esi,3                   ; value for slot +12
        mov     edx,2                   ; value for slot +8
        mov     ebx,1                   ; value for slot +4
        xor     eax,eax                 ; value for slot +0
FillNext4:
        mov     [edi+12],esi
        mov     [edi+8],edx
        mov     [edi+4],ebx
        mov     [edi],eax
        add     edi,16                  ; advance past the four dwords just written
        add     esi,4                   ; each register steps by 4 so the
        add     edx,4                   ; four of them together cover every value
        add     ebx,4
        add     eax,4
        sub     ecx,1
        jnz     FillNext4
        ret
FillArray ENDP
; Code:
; FillArray - store the binary values 0..999 into the 1000 dwords at MyArray
; (quoted copy of the routine above in the thread).
; No arguments. EBX, ESI, EDI are preserved by USES; EAX, ECX, EDX are overwritten.
FillArray PROC USES EBX ESI EDI
mov edi,offset MyArray ;or whatever you name it - it should be 4-aligned
xor eax,eax ;EAX/EBX/EDX/ESI hold four consecutive values: 0,1,2,3
mov ebx,1
mov edx,2
mov esi,3
mov ecx,1000/4 ;four dwords are written per pass
FArry0: mov [edi],eax
mov [edi+4],ebx
mov [edi+8],edx
mov [edi+12],esi
add eax,4 ;step all four values to the next group
add ebx,4
add edx,4
add esi,4
add edi,16 ;advance the write pointer by four dwords
sub ecx,1
jnz FArry0
ret
FillArray ENDP
; FillArray - convert 0..999 to decimal text, one 4-byte slot of MyArray each.
; Every value is handed to crt__ultoa (radix 10) with the slot address as the
; destination buffer, so the slot receives whatever that routine writes
; (presumably the digits followed by a NUL, as the CRT's _ultoa does - confirm).
; No arguments. EBX, ESI, EDI are preserved by USES.
FillArray PROC USES EBX ESI EDI
        mov     ebx,1000                ; values still to convert
        xor     esi,esi                 ; current value, starts at 0
        mov     edi,offset MyArray      ; current slot (or whatever you name it)
NextNumber:
        INVOKE  crt__ultoa,esi,edi,10
        inc     esi
        add     edi,4                   ; next 4-byte slot
        dec     ebx
        jnz     NextNumber
        ret
FillArray ENDP
Before you design an algo, you first need to specify:
(i) if the ascii representations will be left-aligned or right-aligned within its 4-byte space when you print that 4000-byte string,
(ii) if right-aligned, should it have leading 0's or leading spaces for the numbers below 100,
(iii)which character do you need as the 4th character since each number needs at most 3 characters in ascii format.
It seems there are not too many ways to make this fast.
put 30h in a register = 0 >>> in memory
cmp 3ah
inc = 1 >>> //
..
inc 3Ah = 3130h = 10 >>> in memory
inc
cmp ...
; FillArray - fill MyArray with 1000 four-byte entries "  0",0 .. "999",0:
; the decimal text of 0..999, right-aligned in three characters, padded with
; leading spaces and followed by a NUL byte.
;
; No arguments. Uses only EAX, ECX and EDX. MyArray should be 4-aligned.
;
; EAX always holds the entry to be stored, in memory order:
;   AL = hundreds character, AH = tens character,
;   bits 16-23 = units character, bits 24-31 = 0 (the terminator).
; The entry is advanced as text: the units character is bumped, and a carry
; past '9' ripples into the tens and then the hundreds character. A blank
; (20h) tens/hundreds position is first turned into '0' so that the carry
; produces '1'.
;
; The loop is counted (ECX), so exactly 1000 entries are written. The earlier
; sketch compared EAX against 393030h both to detect a units wrap and to stop;
; that stored a bogus "  :" entry after "  9" and never reached its exit value.
FillArray PROC
        mov     edx,offset MyArray      ;or whatever you name it - it should be 4-aligned
        mov     eax,00302020h           ;EAX = "  0",0
        mov     ecx,1000                ;entries left to store
FArryStore:
        mov     [edx],eax
        add     edx,4
        sub     ecx,1
        jz      FArryDone               ;"999" was just stored
        add     eax,00010000h           ;next units character
        cmp     eax,003A0000h           ;still '0'..'9'? (top byte is always 0)
        jb      FArryStore
        sub     eax,000A0000h           ;units wrap back to '0', carry into tens
        cmp     ah,20h                  ;tens position still blank?
        jne     FArryTens
        mov     ah,30h                  ;blank counts as '0'
FArryTens:
        add     ah,1
        cmp     ah,3Ah
        jb      FArryStore
        mov     ah,30h                  ;tens wrap back to '0', carry into hundreds
        cmp     al,20h                  ;hundreds position still blank?
        jne     FArryHundreds
        mov     al,30h
FArryHundreds:
        add     al,1                    ;cannot pass '9': the count ends at 999
        jmp     short FArryStore
FArryDone:
        ret
FillArray ENDP
; Two alternative starting values for the first table entry (x86 stores the
; low byte of EAX first, so the string reads from the low byte upwards).
; Variant 1: four text characters, no terminator.
mov edx,offset MyArray
mov eax,30202020h ;EAX = "   0" - bytes 20h,20h,20h,30h: three spaces then '0'
; Variant 2: three text characters followed by a NUL byte.
mov edx,offset MyArray ;or whatever you name it - it should be 4-aligned
mov eax,302020h ;EAX = "  0",0 - bytes 20h,20h,30h,00h: two spaces, '0', NUL
Better if you do test it and post the results. We are in the lab here...
...and, by the way, you should use:Code: [Select]mov edx,offset MyArray
mov eax,30202020h ;EAX = " 0"
instead of:Code: [Select]mov edx,offset MyArray ;or whatever you name it - it should be 4-aligned
mov eax,302020h ;EAX = " 0",0
to meet the requisites of the test. :P
This is a good solution, but I don't think it is the fastest around. I am not interested in winning a few µs in a program that is used only once.
By the way if you want to post your test results we can compare
them with other solutions.
i got you started - you can't get it from there ??? :lol:
I'm not able to correct your code, better if you do it yourself. :eusa_snooty:Quoteno - we want to start with 2 spaces, an ASCII 0, and a null terminator
so the high byte of EAX should be binary 0
Well that could be another test, now the test is what I stated in the first post. :PQuoteI am not interested in wining some µs in a prog who is just used one time.
it was just a pseudo-code sample.
In my intentions this will be part of a standard proc/routine I'm thinking about.
If you are not interested in the matter, it doesn't matter, you are welcome the
same. :tFolks,
If it is supposed to be fast, it must be a solution with dword-packed xmm regs.
Load them with
[ 0 1 2 3]
first, then add
[ 4 4 4 4]
to get
[ 4 5 6 7]
Go ahead, Frank!
This is what I was thinking about. It is not yet clear in my mind the sequence to
make it the fastest [how many xmm registers to use, which instructions...] but that
is the idea for a fast filling. :t
If you want them in ascending sequence, it should be [ 3 2 1 0] since the xmm reg is loaded/stored in little-endian order. Then you need to take care of carries from 9 to 0, with the next character incremented. This could get troublesome.
What I did was to set a string up to "0000", pick it up and store it, set a pointer to the lowest character, then increment the character pointed to by the pointer. Check if the character exceeds '9', if not pickup and save the string. If greater then '9' add (256-10) to the WORD which has the current pointer as its lowest BYTE which will set the current character to '0' (256-10+'9'+1 = '0' plus a carry to the upper BYTE) and increment the next higher character, then change the pointer to check that character for exceeding '9', etc, etc. After finding a valid character, always drop the pointer back to the lowest character and pick up the value and save it and increment the character at the pointer.
Dave
I'd like to fill an array of 1000 dword with the string values ranging
from ' 0' to ' 999'.
To avoid 400k increment in program size and writing manually 1000
values in .data, I prefer to declare the array in .data? and fill the
array with a proc: call FillArray.
I've some ideas on how to to that, but before starting the tests I'd like
your suggestions, code, already done experiments.. to think upon.
Let me know.
Frank
tater,
1000 dwords is quite small, in terms of memory allocation
when you put something in the uninitialized data section, the size of the exe does not increase
only the image at run-time
if you allocate memory or put it in uninitialized data, it uses memory either way
the advantage to allocation is that it may be free'd up
so - if the table is to be used throughout program execution, that advantage is lost
when you put something in the uninitialized data section, the size of the exe does not increase
only the image at run-time
; Minimal test program: reserves 280000 uninitialized dwords in .data? and
; otherwise only runs the inkey and exit macros (presumably supplied by
; masm32rt.inc - confirm).
include \masm32\include\masm32rt.inc

        .data?
MyArray         dd      280000 dup(?)

        .code
start:
        inkey   "Hello World"
        exit

end start
; Test it:
; Code:
; Minimal test program: reserves 280000 uninitialized dwords in .data? and
; otherwise only runs the inkey and exit macros.
include \masm32\include\masm32rt.inc
.data?
MyArray dd 280000 dup(?) ;uninitialized, so only the run-time image should grow
.code
start: inkey "Hello World"
exit
end start
... and get to know a well-known MASM bug. Even ML 10 still shows that odd behaviour...
the strange behaviour is that it takes long to assemble - at least using ML v6.x it does
here you go, Frank
i get about 2.3 cycles per string on my P4 prescott
;that'll be $50, FrankWe'll see in few days if you earned them :lol:
------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
1934 cycles for Dedndave code
------------------------------------------------------------------------
1927 cycles for Dedndave code
------------------------------------------------------------------------
1917 cycles for Dedndave code
------------------------------------------------------------------------
1918 cycles for Dedndave code
------------------------------------------------------------------------
We'll see in few days if you earned them :lol:
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2556 cycles for Dedndave code
------------------------------------------------------------------------
2256 cycles for Dedndave code
------------------------------------------------------------------------
2258 cycles for Dedndave code
------------------------------------------------------------------------
2271 cycles for Dedndave code
------------------------------------------------------------------------
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2
------------------------------------------------------------------------
2676 cycles for Dedndave code
------------------------------------------------------------------------
2655 cycles for Dedndave code
------------------------------------------------------------------------
2306 cycles for Dedndave code
------------------------------------------------------------------------
2277 cycles for Dedndave code
------------------------------------------------------------------------
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2
------------------------------------------------------------------------
2278 cycles for Dedndave code
------------------------------------------------------------------------
2296 cycles for Dedndave code
------------------------------------------------------------------------
2263 cycles for Dedndave code
------------------------------------------------------------------------
2285 cycles for Dedndave code
------------------------------------------------------------------------
------------------------------------------------------------------------
Pentium III
Instructions: MMX, SSE1
------------------------------------------------------------------------
2457 cycles for Dedndave code
------------------------------------------------------------------------
2437 cycles for Dedndave code
------------------------------------------------------------------------
2437 cycles for Dedndave code
------------------------------------------------------------------------
2439 cycles for Dedndave code
------------------------------------------------------------------------
That will be a close race between Dave and Frank...
:biggrin:
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
loop overhead is approx. 189/100 cycles
1556 cycles for 100 * FA Dave
541 cycles for 100 * FA Jochen
1557 cycles for 100 * FA Frank
1556 cycles for 100 * FA Dave
539 cycles for 100 * FA Jochen
1557 cycles for 100 * FA Frank
1558 cycles for 100 * FA Dave
542 cycles for 100 * FA Jochen
1564 cycles for 100 * FA Frank
230 bytes for FA Dave
281 bytes for FA Jochen
350 bytes for FA Frank
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
++18 of 20 tests valid, loop overhead is approx. 246/100 cycles
2069 cycles for 100 * FA Dave
915 cycles for 100 * FA Jochen
2230 cycles for 100 * FA Frank
2029 cycles for 100 * FA Dave
913 cycles for 100 * FA Jochen
2592 cycles for 100 * FA Frank
2034 cycles for 100 * FA Dave
930 cycles for 100 * FA Jochen
2217 cycles for 100 * FA Frank
; ---------------------------------------------------------------------------
; TestB ("FA Jochen") - fill MyArray with the 4-character texts "   0" .. " 999"
; (right-aligned, space padded, no terminator) using packed dword adds.
;
; Out:  EAX = offset MyArray (pushed at entry, popped before ret).
; Uses: ESI, EDI, ECX, EDX, XMM0-XMM4.
;       NOTE(review): ESI/EDI are not saved here - confirm the caller does not
;       rely on them.
; Needs: the tables below and MyArray must be 16-byte aligned (movaps), and
;       paddd/psubd on XMM registers (SSE2).
;
; The tables are addressed as [esi], [esi+16], ... [esi+112], so they must
; stay in this order and each must be exactly 16 bytes. Src0123 therefore has
; to be a 16-byte literal; a shorter one shifts every following table.
;
; Each dword is one text entry; the units digit is the top byte. Adding
; 04000000h advances an entry by 4. Adding 0FA110000h takes 6 off the units
; byte and adds 11h to the tens byte (blank 20h -> '1'), i.e. also "+4" but
; across a tens boundary while the tens position is still blank. 0FA010000h
; does the same once the tens position already holds a digit.
; ---------------------------------------------------------------------------
align 16
TestB_s:
Src0123 db "   0   1   2   3"                                  ; xmm0: first four entries
Add4444 dd 04000000h, 04000000h, 04000000h, 04000000h ; xmm1
Add44xx dd 04000000h, 04000000h, 0FA110000h, 0FA110000h ; xmm2
Addxx44 dd 0FA110000h, 0FA110000h, 04000000h, 04000000h ; xmm3
Addxxxx dd 0FA010000h, 0FA010000h, 0FA010000h, 0FA010000h ; xmm4
; second set, loaded into xmm2..xmm4 after the first pass (tens no longer blank)
Add244xx dd 04000000h, 04000000h, 0FA010000h, 0FA010000h ; xmm2
Add2xx44 dd 0FA010000h, 0FA010000h, 04000000h, 04000000h ; xmm3
Add2xxxx dd 0FA010000h, 0FA010000h, 0FA010000h, 0FA010000h ; xmm4
; corrections applied when the tens character has run past '9':
Sub100a dd 00009EF00h, 00009EF00h, 00009EF00h, 00009EF00h ; hundreds blank -> '1'
Sub100b dd 00009FF00h, 00009FF00h, 00009FF00h, 00009FF00h ; hundreds digit + 1
NameB equ FA Jochen ; assign a descriptive name here
TestB proc
mov esi, offset Src0123
mov edi, offset MyArray
push edi ; kept for the return value
xor ecx, ecx
movaps xmm0, [esi]
movaps xmm1, [esi+16]
movaps xmm2, [esi+32]
movaps xmm3, [esi+48]
movaps xmm4, [esi+64]
lea edx, [edi+4000] ; end of the 1000-entry table
m2m ecx, -5 ; pass counter: reaches 0 after five passes (first 100 entries)
; align 4
.Repeat
; one pass = 5 stores of 4 entries = 20 entries (80 bytes)
movaps [edi], xmm0
paddd xmm0, xmm1 ; 4444
movaps [edi+16], xmm0
paddd xmm0, xmm2 ; 44xx
movaps [edi+32], xmm0
paddd xmm0, xmm3 ; xx44
movaps [edi+48], xmm0
paddd xmm0, xmm1 ; 4444
movaps [edi+64], xmm0
paddd xmm0, xmm4 ; xxxx
inc ecx
.if Zero?
; end of the first hundred: fix up the overflowed tens, create the hundreds digit
psubd xmm0, oword ptr Sub100a
.elseif ecx==-4
; after the very first pass switch to the second table set
movaps xmm2, [esi+80]
movaps xmm3, [esi+96]
movaps xmm4, [esi+112]
.elseif ecx==5
; every further five passes: carry into the hundreds digit and restart the count
psubd xmm0, oword ptr Sub100b
xor ecx, ecx
.endif
add edi, 80
.Until edi>=edx
pop eax ; EAX = offset MyArray
ret
TestB endp
TestB_endp:
it's a "rich" text file, Frank
i think you can open it with WordPad, if you don't have anything else :t
.data
; The ASCII table " 000" .. " 999" is built in `array` and then written to the
; text file, 100 entries (400 bytes) per line followed by CR/LF.
array                   dd 1000 dup (0)
controle                dd 0            ; sits right behind the table; must still be 0 afterwards
Nomfichier              db "resultat.txt",0
Hfile                   dd 0
NumberOfBytesWritten    dd 0
retourligne             db 13,10,0

.code
start:
; ---- build the table -------------------------------------------------------
; EAX holds one entry in memory order: AL = ' ', AH = hundreds,
; bits 16-23 = tens, bits 24-31 = units.
        mov     eax,30303020h           ; " 000"
        mov     edx,offset array
unit:
        REPT 9                          ; units 0..8
        mov     [edx],eax
        add     eax,1000000h            ; next units digit
        add     edx,4
        ENDM
        mov     [edx],eax               ; units 9
        add     edx,4
;-----------------------------
        sub     eax,9000000h            ; units back to '0'
        add     eax,10000h              ; next tens digit
        mov     ecx,eax
        shr     ecx,16                  ; CL = tens character
        cmp     cl,3Ah
        jne     unit                    ; tens still '0'..'9'
        sub     eax,0A0000h             ; tens back to '0'
        inc     ah                      ; next hundreds digit
        cmp     ah,3Ah
        jne     unit                    ; more hundreds to do; otherwise finished
fin:
; (debugging aid: break here to inspect `array` and check that `controle` is still 0)

; ---- write the table to the file -------------------------------------------
        invoke  CreateFile,addr Nomfichier,GENERIC_WRITE,NULL,\
                NULL,CREATE_ALWAYS,FILE_ATTRIBUTE_NORMAL,0
        .if eax == INVALID_HANDLE_VALUE
                invoke  ExitProcess,1   ; could not create the file: nothing to write to
        .endif
        mov     Hfile,eax
        mov     edx,offset array        ; start of the current line's 100 entries
        mov     ecx,0                   ; entries written so far
ecrire:
        push    edx                     ; the API calls do not preserve ECX/EDX
        push    ecx
        invoke  WriteFile,Hfile,edx,400,addr NumberOfBytesWritten,NULL
        invoke  WriteFile,Hfile,addr retourligne,2,addr NumberOfBytesWritten,NULL
        pop     ecx
        pop     edx
        add     edx,400
        add     ecx,100
        cmp     ecx,1000
        jne     ecrire
        invoke  CloseHandle,Hfile
        invoke  ExitProcess,0
;################################################################
end start
; ---------------------------------------------------------------------------
; TestA - fill MyArray with 1000 four-byte entries "000",0 .. "999",0
; (zero padded, NUL terminated).
;
; Out:  EAX = offset MyArray. All other registers are restored by pusha/popa.
;
; EBP holds the first entry of the current group of ten, in memory order:
; low byte = hundreds, next byte = tens, bits 16-23 = units, top byte = 0.
; The outer loop (ECX) steps the hundreds, the inner loop (EDX) steps the tens,
; and proc0 writes the ten entries that differ only in the units digit.
; ---------------------------------------------------------------------------
TestA proc
        pusha
        mov     edi,offset MyArray
        mov     ebp,00303030h           ; "000",0
        mov     ecx,10                  ; hundreds 0..9
l2:     mov     edx,10                  ; tens 0..9
        push    ebp                     ; remember the start of this hundred
l1:     call    proc0
        add     ebp,00000100h           ; next tens digit
        sub     edx,1
        jnz     l1
        pop     ebp
        inc     ebp                     ;add ebp,00000001h - next hundreds digit
        sub     ecx,1
        jnz     l2
        popa
        mov     eax,offset MyArray
        ret

; proc0 - write the ten entries EBP+0 .. EBP+9 (units digit) at [EDI], then
; advance EDI by 40. ECX, EDX and EBP are preserved for the loops above.
; EBP must be saved here: it is reused below as the "+5" constant, and the
; caller keeps counting tens in it after the call.
proc0:
        push    ecx
        push    edx
        push    ebp
        lea     eax,[ebp]               ; units 0
        lea     ebx,[ebp+00010000h]     ; units 1
        lea     ecx,[ebp+00020000h]     ; units 2
        lea     edx,[ebp+00030000h]     ; units 3
        lea     esi,[ebp+00040000h]     ; units 4
        mov     ebp,00050000h           ; step from units n to units n+5
        mov     [edi],eax
        mov     [edi+4],ebx
        mov     [edi+8],ecx
        mov     [edi+12],edx
        mov     [edi+16],esi
        add     eax,ebp
        add     ebx,ebp
        add     ecx,ebp
        add     edx,ebp
        add     esi,ebp
        mov     [edi+20],eax
        mov     [edi+24],ebx
        mov     [edi+28],ecx
        mov     [edi+32],edx
        mov     [edi+36],esi
        add     edi,40
        pop     ebp
        pop     edx
        pop     ecx
        ret
TestA endp
This would be nice to do in 64-bit with all those regs, but the xmm's are much better.
Here is a source code to verify mmx is faster:
116 bytes of slackware
...
This would be nice to do in 64-bit with all those regs, but the xmm's are much better.
Just goes to show, these tests are of academic interest only :biggrin:
Intel(R) Celeron(R) CPU 2.80GHz (SSE3)
loop overhead is approx. 255/100 cycles
2057 cycles for 100 * FA Dave
2581 cycles for 100 * FA Sinsi
1210 cycles for 100 * FA Jochen unaligned
1858 cycles for 100 * FA Yves
2060 cycles for 100 * FA Dave
2550 cycles for 100 * FA Sinsi
1214 cycles for 100 * FA Jochen unaligned
1855 cycles for 100 * FA Yves
2056 cycles for 100 * FA Dave
2553 cycles for 100 * FA Sinsi
1213 cycles for 100 * FA Jochen unaligned
1857 cycles for 100 * FA Yves
230 bytes for FA Dave
116 bytes for FA Sinsi
149 bytes for FA Jochen unaligned
141 bytes for FA Yves
4208520 = eax FA Dave
4208520 = eax FA Sinsi
4208520 = eax FA Jochen unaligned
4208520 = eax FA Yves
--- ok ---
...
Not bad. I think we
can arrive at 0.7 cycles per dword string, but it has yet to be
demonstrated. :lol:
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz (SSE4)
loop overhead is approx. 187/100 cycles
1870 cycles for 100 * FA Dave
648 cycles for 100 * FA Jochen
2442 cycles for 100 * FA Sinsi
791 cycles for 100 * FA Jochen unaligned
2009 cycles for 100 * FA Yves
1886 cycles for 100 * FA Dave
648 cycles for 100 * FA Jochen
2418 cycles for 100 * FA Sinsi
791 cycles for 100 * FA Jochen unaligned
2012 cycles for 100 * FA Yves
1871 cycles for 100 * FA Dave
648 cycles for 100 * FA Jochen
2430 cycles for 100 * FA Sinsi
790 cycles for 100 * FA Jochen unaligned
2010 cycles for 100 * FA Yves
230 bytes for FA Dave
281 bytes for FA Jochen
116 bytes for FA Sinsi
141 bytes for FA Jochen unaligned
141 bytes for FA Yves
4208864 = eax FA Dave
4208864 = eax FA Jochen
4208864 = eax FA Sinsi
4208864 = eax FA Jochen unaligned
4208864 = eax FA Yves
--- ok ---
-------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
-------------------------------------------------------
1914 cycles for Dedndave code
1983 cycles for Frktons I Step
-------------------------------------------------------
1914 cycles for Dedndave code
1973 cycles for Frktons I Step
-------------------------------------------------------
1944 cycles for Dedndave code
1972 cycles for Frktons I Step
-------------------------------------------------------
1913 cycles for Dedndave code
1976 cycles for Frktons I Step
-------------------------------------------------------
------------------------------------------------------------------------
Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2
------------------------------------------------------------------------
1577 cycles for Dedndave code
2461 cycles for Frktons I Step
------------------------------------------------------------------------
1048 cycles for Dedndave code
2492 cycles for Frktons I Step
------------------------------------------------------------------------
1058 cycles for Dedndave code
2497 cycles for Frktons I Step
------------------------------------------------------------------------
1048 cycles for Dedndave code
2495 cycles for Frktons I Step
------------------------------------------------------------------------
--- ok ---
-----------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
-----------------------------------------------
1914 cycles for Dedndave code - 5 GPRs
2574 cycles for Frktons I Step / 2 GPRs
1889 cycles for Frktons I Step / 4 GPRs
-----------------------------------------------
1940 cycles for Dedndave code - 5 GPRs
2561 cycles for Frktons I Step / 2 GPRs
1886 cycles for Frktons I Step / 4 GPRs
-----------------------------------------------
1913 cycles for Dedndave code - 5 GPRs
2561 cycles for Frktons I Step / 2 GPRs
1900 cycles for Frktons I Step / 4 GPRs
-----------------------------------------------
1913 cycles for Dedndave code - 5 GPRs
2561 cycles for Frktons I Step / 2 GPRs
1887 cycles for Frktons I Step / 4 GPRs
-----------------------------------------------
------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
1926 cycles for Dedndave code - 5 GPRs
1857 cycles for Frktons I Step / 2 GPRs
1896 cycles for Frktons I Step / 4 GPRs
1320 cycles for Frktons II Step / 5 MMX
830 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1919 cycles for Dedndave code - 5 GPRs
1897 cycles for Frktons I Step / 2 GPRs
1895 cycles for Frktons I Step / 4 GPRs
1321 cycles for Frktons II Step / 5 MMX
831 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1916 cycles for Dedndave code - 5 GPRs
1898 cycles for Frktons I Step / 2 GPRs
1897 cycles for Frktons I Step / 4 GPRs
1319 cycles for Frktons II Step / 5 MMX
830 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1915 cycles for Dedndave code - 5 GPRs
1892 cycles for Frktons I Step / 2 GPRs
1894 cycles for Frktons I Step / 4 GPRs
1327 cycles for Frktons II Step / 5 MMX
830 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
; MMX fill loop (fragment). Per pass it reads two dwords from the Tens table
; and writes 80 bytes to MyArray: for each dword, five qwords produced by
; duplicating it with pshufb, adding ONE_TWO once and then TWO_TWO four times.
; Tens, Mask2Double, ONE_TWO, TWO_TWO and PtrTens are defined elsewhere;
; presumably Tens holds one entry per "tens" prefix and the two constants add
; the units digits 0/1 and +2/+2 - confirm against their definitions.
; NOTE: pshufb on MMX registers is an SSSE3 instruction.
; NOTE(review): no emms is visible in this fragment - confirm it follows.
lea esi, Tens
lea edi, MyArray
movq mm5, Mask2Double ; shuffle mask
movq mm6, ONE_TWO
movq mm7, TWO_TWO
align 4
@@:
; first table entry -> [edi] .. [edi+39]
mov eax, [esi]
movd mm0, eax
pshufb mm0, mm5
paddd mm0, mm6
movq mm1, mm0
paddd mm1, mm7
movq mm2, mm1
paddd mm2, mm7
movq mm3, mm2
paddd mm3, mm7
movq mm4, mm3
paddd mm4, mm7
movq [edi], mm0
movq [edi + 8], mm1
movq [edi + 16], mm2
movq [edi + 24], mm3
movq [edi + 32], mm4
; second table entry -> [edi+40] .. [edi+79]
mov eax, [esi + 4]
movd mm0, eax
pshufb mm0, mm5
paddd mm0, mm6
movq mm1, mm0
paddd mm1, mm7
movq mm2, mm1
paddd mm2, mm7
movq mm3, mm2
paddd mm3, mm7
movq mm4, mm3
paddd mm4, mm7
movq [edi + 40], mm0
movq [edi + 48], mm1
movq [edi + 56], mm2
movq [edi + 64], mm3
movq [edi + 72], mm4
add esi, 8
add edi, 80
cmp esi, PtrTens
jb @B ; addresses are unsigned: jl would misorder pointers at or above 80000000h
Exception code: 0xc000001d
Fault offset: 0x00001619
; Two adjacent dwords that are read together as one qword by the movq below,
; as an alternative way of getting the same dword into both halves of mm0.
first_dw dd 0 ; put here the same value of second_dw
second_dw dd 0 ; put here the same value of first_dw
....
lea eax, first_dw
movq mm0, [eax] ; loads first_dw (low half) and second_dw (high half)
by using SSSE3 (or even SSE3), you exclude a lot of the older CPU's that are still in use
really - this is a program init function
it seems kinda silly to exclude a CPU unless SSSE3 is to be used throughout the rest of the program
you are going to tell me i need a new one ? - lol
i am very happy with the one i have
until microsoft or adobe or some other a-hole forces me to be unhappy
; Skip to no_ssse3_support unless CPUID leaf 1 reports SSSE3.
; CH bit 1 is ECX bit 9. Note that CPUID overwrites EAX, EBX, ECX and EDX.
        mov     eax,1
        cpuid
        test    ch,2
        jz      no_ssse3_support
you could add a little test to see if the CPU supports SSSE3
; if it does not, skip that test and say "Frktons II Step requires SSSE3 support" and go on to the next test
; Code:
mov eax,1
cpuid ;overwrites EAX, EBX, ECX and EDX
test ch,2 ;CH bit 1 = ECX bit 9 = SSSE3
jz no_ssse3_support
note: .586 or higher required to assemble "cpuid" without hard-coding
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2270 cycles for Dedndave code - 5 GPRs
1940 cycles for Frktons I Step / 2 GPRs
1950 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
1165 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2242 cycles for Dedndave code - 5 GPRs
1962 cycles for Frktons I Step / 2 GPRs
1949 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
1176 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2248 cycles for Dedndave code - 5 GPRs
1939 cycles for Frktons I Step / 2 GPRs
1956 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
1161 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2268 cycles for Dedndave code - 5 GPRs
1947 cycles for Frktons I Step / 2 GPRs
1987 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
1191 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
CpuFeatures PROC USES EBX
;call once during program initialization
;store the value returned in EAX (AL, actually) for feature verification
;
;0 = no extended features present
;1 = MMX
;2 = SSE
;3 = SSE2
;4 = SSE3
;5 = SSSE3
;6 = SSE4
;
;The levels are cumulative: the result is the highest level for which this
;feature and every one before it is reported by CPUID leaf 1.
;EBX is saved because CPUID overwrites it; ECX and EDX are overwritten.
;Assumes the CPU supports the CPUID instruction.
;
;The feature bits are tested in place. (BSWAP reverses bytes, not bits, so it
;does not move bit n to bit 31-n.)
        mov     eax,1
        cpuid
        xor     eax,eax
        test    edx,00800000h           ;EDX bit 23 = MMX
        jz      CpuF00
        inc     eax
        test    edx,02000000h           ;EDX bit 25 = SSE
        jz      CpuF00
        inc     eax
        test    edx,04000000h           ;EDX bit 26 = SSE2
        jz      CpuF00
        inc     eax
        test    ecx,00000001h           ;ECX bit 0 = SSE3
        jz      CpuF00
        inc     eax
        test    ecx,00000200h           ;ECX bit 9 = SSSE3
        jz      CpuF00
        inc     eax
        test    ecx,00080000h           ;ECX bit 19 = SSE4.1
        jz      CpuF00
        inc     eax
CpuF00: ret
CpuFeatures ENDP
; Usage sketch: query the feature level once at start-up, keep it in a byte,
; and compare against a level number wherever a feature is needed.
        .DATA?
bFeatures       db      ?

        .CODE
Start:
        call    CpuFeatures
        mov     bFeatures,al
;
;
;
        cmp     bFeatures,5             ; below level 5 there is no SSSE3
        jb      no_ssse3_support
; here's a little routine i wrote just for you, Frank :biggrin:
; Code:
CpuFeatures PROC USES EBX
;call once during program initialization
;store the value returned in EAX (AL, actually) for feature verification
;
;0 = no extended features present
;1 = MMX
;2 = SSE
;3 = SSE2
;4 = SSE3
;5 = SSSE3
;6 = SSE4
;
;The levels are cumulative: the result is the highest level for which this
;feature and every one before it is reported by CPUID leaf 1.
;EBX is saved because CPUID overwrites it; ECX and EDX are overwritten.
;The feature bits are tested in place (BSWAP reverses bytes, not bits).
mov eax,1
cpuid
xor eax,eax
test edx,00800000h ;EDX bit 23 = MMX
jz CpuF00
inc eax
test edx,02000000h ;EDX bit 25 = SSE
jz CpuF00
inc eax
test edx,04000000h ;EDX bit 26 = SSE2
jz CpuF00
inc eax
test ecx,00000001h ;ECX bit 0 = SSE3
jz CpuF00
inc eax
test ecx,00000200h ;ECX bit 9 = SSSE3
jz CpuF00
inc eax
test ecx,00080000h ;ECX bit 19 = SSE4.1
jz CpuF00
inc eax
CpuF00: ret
CpuFeatures ENDP
; Code:
.DATA?
bFeatures db ?
.CODE
Start: call CpuFeatures
mov bFeatures,al
;
;
;
; now, if you want to see if they have SSSE3....
; Code:
cmp bFeatures,5
jb no_ssse3_support
----------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
----------------------------------------------------------------------
1915 cycles for Dedndave code - 5 GPRs
1837 cycles for Frktons I Step / 2 GPRs
1893 cycles for Frktons I Step / 4 GPRs
1318 cycles for Frktons II Step / 5 MMX
1365 cycles for Frktons II Step / 5 MMX without SSSE3
836 cycles for Jochen / 5 XMM
----------------------------------------------------------------------
1913 cycles for Dedndave code - 5 GPRs
1892 cycles for Frktons I Step / 2 GPRs
1893 cycles for Frktons I Step / 4 GPRs
1317 cycles for Frktons II Step / 5 MMX
1366 cycles for Frktons II Step / 5 MMX without SSSE3
836 cycles for Jochen / 5 XMM
----------------------------------------------------------------------
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2250 cycles for Dedndave code - 5 GPRs
1943 cycles for Frktons I Step / 2 GPRs
1960 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
2148 cycles for Frktons II Step / 5 MMX without SSSE3
1161 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2242 cycles for Dedndave code - 5 GPRs
1952 cycles for Frktons I Step / 2 GPRs
1971 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
2268 cycles for Frktons II Step / 5 MMX without SSSE3
1161 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2271 cycles for Dedndave code - 5 GPRs
1950 cycles for Frktons I Step / 2 GPRs
1965 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
2149 cycles for Frktons II Step / 5 MMX without SSSE3
1166 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2240 cycles for Dedndave code - 5 GPRs
1941 cycles for Frktons I Step / 2 GPRs
1958 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
2144 cycles for Frktons II Step / 5 MMX without SSSE3
1166 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
1925 cycles for Dedndave code - 5 GPRs
1882 cycles for Frktons I Step / 2 GPRs
1941 cycles for Frktons I Step / 4 GPRs
1896 cycles for Frktons I Step / 4 GPRs - no external tab
1096 cycles for Frktons II Step / 5 MMX with SSSE3
1206 cycles for Frktons II Step / 5 MMX without SSSE3
836 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1917 cycles for Dedndave code - 5 GPRs
1882 cycles for Frktons I Step / 2 GPRs
1917 cycles for Frktons I Step / 4 GPRs
1893 cycles for Frktons I Step / 4 GPRs - no external tab
1091 cycles for Frktons II Step / 5 MMX with SSSE3
1206 cycles for Frktons II Step / 5 MMX without SSSE3
836 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2379 cycles for Dedndave code - 5 GPRs
1947 cycles for Frktons I Step / 2 GPRs
1961 cycles for Frktons I Step / 4 GPRs
1986 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
2460 cycles for Frktons II Step / 5 MMX without SSSE3
1152 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2251 cycles for Dedndave code - 5 GPRs
1966 cycles for Frktons I Step / 2 GPRs
1961 cycles for Frktons I Step / 4 GPRs
1978 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
2456 cycles for Frktons II Step / 5 MMX without SSSE3
1151 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2
------------------------------------------------------------------------
2275 cycles for Dedndave code - 5 GPRs
1979 cycles for Frktons I Step / 2 GPRs
1996 cycles for Frktons I Step / 4 GPRs
2034 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
2970 cycles for Frktons II Step / 5 MMX without SSSE3
896 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2330 cycles for Dedndave code - 5 GPRs
1970 cycles for Frktons I Step / 2 GPRs
1997 cycles for Frktons I Step / 4 GPRs
2734 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
2937 cycles for Frktons II Step / 5 MMX without SSSE3
905 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
1950 cycles for Dedndave code - 5 GPRs
1924 cycles for Frktons I Step / 2 GPRs
1871 cycles for Frktons I Step / 4 GPRs
1888 cycles for Frktons I Step / 4 GPRs - no external tab
1079 cycles for Frktons II Step / 5 MMX with SSSE3
1199 cycles for Frktons II Step / 5 MMX without SSSE3
801 cycles for Frktons III Step / XMM/MMX with SSE2
831 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1916 cycles for Dedndave code - 5 GPRs
1930 cycles for Frktons I Step / 2 GPRs
1872 cycles for Frktons I Step / 4 GPRs
1915 cycles for Frktons I Step / 4 GPRs - no external tab
1083 cycles for Frktons II Step / 5 MMX with SSSE3
1209 cycles for Frktons II Step / 5 MMX without SSSE3
796 cycles for Frktons III Step / XMM/MMX with SSE2
831 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
Frktons I Step / 2 GPRs and Frktons I Step / 4 GPRs are remarkably fast but you should have a look at the output.
lol
it has to work first, otherwise you are comparing apples with oranges in the timing tests :t
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
-------------------------------------------------------------
813 cycles for Frktons III Step / XMM/MMX with SSE2
835 cycles for Jochen / 5 XMM
Intel(R) Pentium(R) CPU G6950 @ 2.80GHz
-------------------------------------------------------------
...
405 cycles for Frktons III Step / XMM/MMX with SSE2
644 cycles for Jochen / 5 XMM
I don't agree with the suffix, I'm not at the FINAL stage so far. :lol:
P.S.: If you don't agree with the suffix "FINAL", write a faster algo :bgrin:
On my home PC the III step is a bit faster than Jochen's code,
and in my office PC it is even faster.
...
I'm currently studying a faster/smaller solution because, as Jochen said: I don't agree with the suffix, I'm not at the FINAL stage so far. :lol:
P.S.: If you don't agree with the suffix "FINAL", write a faster algo :bgrin:
So the incentive worked :greensml: :t
------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
1915 cycles for Dedndave code - 5 GPRs
1890 cycles for Frktons I Step / 2 GPRs
1964 cycles for Frktons I Step / 4 GPRs with LEA
1114 cycles for Frktons II Step / 5 MMX with SSSE3
1199 cycles for Frktons II Step / 5 MMX without SSSE3
811 cycles for Frktons III Step / XMM/MMX with SSE2
630 cycles for Frktons III Step / XMM with SSE2 - enhanced
706 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1915 cycles for Dedndave code - 5 GPRs
1896 cycles for Frktons I Step / 2 GPRs
1978 cycles for Frktons I Step / 4 GPRs with LEA
1110 cycles for Frktons II Step / 5 MMX with SSSE3
1199 cycles for Frktons II Step / 5 MMX without SSSE3
813 cycles for Frktons III Step / XMM/MMX with SSE2
628 cycles for Frktons III Step / XMM with SSE2 - enhanced
704 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2267 cycles for Dedndave code - 5 GPRs
1957 cycles for Frktons I Step / 2 GPRs
2035 cycles for Frktons I Step / 4 GPRs with LEA
Frktons II Step requires a PC with SSSE3
2459 cycles for Frktons II Step / 5 MMX without SSSE3
1126 cycles for Frktons III Step / XMM/MMX with SSE2
1197 cycles for Frktons III Step / XMM with SSE2 - enhanced
1159 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2282 cycles for Dedndave code - 5 GPRs
1967 cycles for Frktons I Step / 2 GPRs
2031 cycles for Frktons I Step / 4 GPRs with LEA
Frktons II Step requires a PC with SSSE3
2483 cycles for Frktons II Step / 5 MMX without SSSE3
1126 cycles for Frktons III Step / XMM/MMX with SSE2
1185 cycles for Frktons III Step / XMM with SSE2 - enhanced
1158 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
------------------------------------------------------------------------
Intel(R) Core(TM) i3 CPU M 370 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2
------------------------------------------------------------------------
1335 cycles for Dedndave code - 5 GPRs
1855 cycles for Frktons I Step / 2 GPRs
1009 cycles for Frktons I Step / 4 GPRs with LEA
587 cycles for Frktons II Step / 5 MMX with SSSE3
643 cycles for Frktons II Step / 5 MMX without SSSE3
389 cycles for Frktons III Step / XMM/MMX with SSE2
344 cycles for Frktons III Step / XMM with SSE2 - enhanced
460 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1278 cycles for Dedndave code - 5 GPRs
1025 cycles for Frktons I Step / 2 GPRs
1015 cycles for Frktons I Step / 4 GPRs with LEA
585 cycles for Frktons II Step / 5 MMX with SSSE3
633 cycles for Frktons II Step / 5 MMX without SSSE3
400 cycles for Frktons III Step / XMM/MMX with SSE2
345 cycles for Frktons III Step / XMM with SSE2 - enhanced
437 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
ahhh nice algo
; Loop body sketch: two pattern registers are OR-ed into each output dword,
; and two dwords are stored per pass. The rotate step and the loop branch are
; left as placeholders.
        mov     edx,ebp                 ;every 10th bit from EBP
        mov     eax,ecx                 ;every 10th bit from ECX
        or      eax,esi                 ;every 14th bit from ESI
        mov     [edi],eax
        or      edx,ebx                 ;every 14th bit from EBX
        mov     [edi+4],edx
;do rotates here
        add     edi,8
;loop to top
It's just for fun. I even tried parallelisation on the Cuda side of life (in C), but I seem to make all of those terrible mistakes of the unblessed beginners - it gets much slower than the same partitioned code on plain CPU. One pitfall is parallel writes to the same word on different bits from different threads - it overwrites other set bits (setting the 20th bit needs loading the word, setting the bit, saving the word - so the recently stored bit 15 gets cleared again)
I think SSE3 or SSSE3 should make it easy, and even IntelHD on the other computer (Core i5) should have enough cpu hardware units to set a few bits in parallel.
One pitfall is parallel writes to the same word on different bits from different threads - it overwrites other set bits (setting the 20th bit needs loading the word, setting the bit, saving the word - so the recently stored bit 15 gets cleared again)
; Setting bits in a memory dword: the explicit three-instruction
; load / modify / store sequence ...
MOV EAX,[Memory] ; Load
OR EAX,BitPattern ; Modify
MOV [Memory],EAX ; Store
; can become
; ... a single instruction with a memory destination.
; NOTE(review): for writes from several threads a LOCK prefix would presumably
; still be needed on the single-instruction form - verify against the CPU manual.
OR DWORD PTR [Memory],BitPattern
* to mark every 10th bit starting with bits 7 and 10 in a GB of memory. (7,10,17,20,27,30,...)
* also to mark every 14th bit starting with bits 15 and 24 (15,24, 29, 38, ...
@jj2007: here is the code; I tried gcc with the option -masm=intel.
#include <stdio.h>
#include <conio.h>
/* Value used for pi by CalcArea (five decimal places). */
#define CALC_AREA_PI 3.14159

/*
 * Area of a circle.
 *
 * r: radius
 * Returns r * r * CALC_AREA_PI, evaluated left to right.
 */
double CalcArea(const double r)
{
    return r * r * CALC_AREA_PI;
}
/*
 * Test driver: prints CalcArea(1) with "%f", waits for a key press, then
 * executes an x86 breakpoint instruction so that an attached debugger stops
 * right after the call sequence.
 *
 * Returns 0 explicitly. Without the return statement, a build that follows
 * C89 rules hands back whatever happens to be in the return register as the
 * process exit status.
 */
int main (void)
{
    printf("%f\n", CalcArea(1));
    getch();                /* from <conio.h>: wait for a key press */
    __asm__("int 3\n");     /* software breakpoint for the debugger */
    return 0;
}
rem Build helper: first emit an Intel-syntax assembly listing, then build the executable.
rem The quote characters are part of the variable value, so %file%.c expands to "test".c
set file="test"
rem -S stops after compilation and writes the assembly text to the -o file
gcc -pedantic -S -masm=intel %file%.c -o %file%.asm
pause
rem same source and options without -S: produce the .exe
gcc -pedantic -masm=intel %file%.c -o %file%.exe
pause
# Assembly text gcc produced for test.c (GAS directives, Intel operand order).
.file "test.c"
.intel_syntax noprefix
.text
.globl _CalcArea
.def _CalcArea; .scl 2; .type 32; .endef
# _CalcArea: reads one 8-byte argument from [ebp+8]..[ebp+15], multiplies it
# by itself and by the constant at LC0, and leaves the product in st(0).
_CalcArea:
LFB6:
.cfi_startproc
push ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
mov ebp, esp
.cfi_def_cfa_register 5
# reserve 8 bytes for a local copy of the argument
sub esp, 8
# copy the argument, one dword at a time, into the local at [ebp-8]
mov eax, DWORD PTR [ebp+8]
mov DWORD PTR [ebp-8], eax
mov eax, DWORD PTR [ebp+12]
mov DWORD PTR [ebp-4], eax
# st(0) = r, then st(0) = r*r
fld QWORD PTR [ebp-8]
fmul QWORD PTR [ebp-8]
# push the constant, then multiply and pop: st(0) = r*r*LC0
fld QWORD PTR LC0
fmulp st(1), st
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
# the result stays on the x87 stack for the caller
ret
.cfi_endproc
LFE6:
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
# printf format string: "%f", a newline (octal 12) and the terminating NUL
LC3:
.ascii "%f\12\0"
.text
.globl _main
.def _main; .scl 2; .type 32; .endef
# _main: calls _CalcArea with the double 1.0, prints the result through
# _printf, calls _getch, then executes the inline int 3.
_main:
LFB7:
.cfi_startproc
push ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
mov ebp, esp
.cfi_def_cfa_register 5
# round esp down to a 16-byte boundary, then reserve 16 bytes for outgoing arguments
and esp, -16
sub esp, 16
call ___main
# build the double 1.0 at [esp]: low dword 0, high dword 1072693248 (0x3FF00000)
mov eax, 0
mov edx, 1072693248
mov DWORD PTR [esp], eax
mov DWORD PTR [esp+4], edx
call _CalcArea
# pop the returned st(0) into [esp+4]..[esp+11] as the double argument of printf
fstp QWORD PTR [esp+4]
# first printf argument: address of the format string
mov DWORD PTR [esp], OFFSET FLAT:LC3
call _printf
call _getch
/APP
# 13 "test.c" 1
int 3
# 0 "" 2
/NO_APP
# NOTE(review): no instruction here writes eax between the _getch call and
# ret, so the value returned by main appears to be whatever _getch left in
# eax - the C source has no return statement.
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE7:
.section .rdata,"dr"
.align 8
# LC0: the 8-byte constant loaded by _CalcArea. Low dword first; together the
# two dwords are 0x400921F9F01B866E, the double closest to 3.14159.
LC0:
.long -266631570
.long 1074340345
# symbol-table entries for the two external functions called from _main
.def _printf; .scl 2; .type 32; .endef
.def _getch; .scl 2; .type 32; .endef
@JJ2007: I was slightly misunderstood - I did not want you to compile with gcc, but I tried to make my gcc produce a "good" assembly file (.s in the flasch0.zip). What toolstack do you use when you have a piece of C code to optimize? Or do you really write the whole software directly in assembly and with your own libs?
That looks slow, honestly. I am not so proficient in 64-bit assembly, but in general we try to avoid shifts because they are slow, apart from other design problems in that loop.
Re toolstack: Masm32 is a library of very fast routines, compared to standard CRT; MasmBasic is another library with often much faster algos, and for specific tasks you will find many more in the "Laboratory" section. And yes we do write every algo from scratch, and we all find it particularly amusing to beat a CRT algo by a factor 3 and higher :icon_mrgreen:
* to mark every 10th bit starting with bits 7 and 10 in a GB of memory. (7,10,17,20,27,30,...)
* also to mark every 14th bit starting with bits 15 and 24 (15,24, 29, 38, ...)
* to mark every 10th bit starting with bits 7 and 0 in a GB of memory. (7,0,17,10,27,20,...)
* also to mark every 14th bit starting with bits 1 and 10 (1,10,15,24, 29, 38, ...)
* at the end, go back and correct the first few bytes :)
10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001
00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000
01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000
00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010
00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100
The pattern repeats after the 5th OWORD.
I guess you are a member of Euler?
; every10th_at7: 96-byte OR mask built from the repeating 5-byte unit
;   00000001b,00000000b,01000000b,00010000b,00000100b
; Reading each binary literal left to right and counting positions from 0
; across consecutive bytes, the set positions are 7, 17, 27, 37, 47, ...
; Layout: 16 prolog bytes, then two identical 40-byte copies of the loop
; data, which begins one byte into the 5-byte unit.
; The align 16 lets aligned 16-byte loads read the table at offsets that are
; multiples of 16.
align 16
every10th_at7:
;----------------- prolog:
; offsets 0..15: three full units plus the first byte of the fourth
db 00000001b,00000000b,01000000b,00010000b,00000100b,00000001b,00000000b,01000000b
db 00010000b,00000100b,00000001b,00000000b,01000000b,00010000b,00000100b,00000001b
;----------------- main loop starts:
; offsets 16..55: 40 bytes = eight units, continuing the sequence from the prolog
db 00000000b,01000000b,00010000b,00000100b,00000001b,00000000b,01000000b,00010000b
db 00000100b,00000001b,00000000b,01000000b,00010000b,00000100b,00000001b,00000000b
db 01000000b,00010000b,00000100b,00000001b,00000000b,01000000b,00010000b,00000100b
db 00000001b,00000000b,01000000b,00010000b,00000100b,00000001b,00000000b,01000000b
db 00010000b,00000100b,00000001b,00000000b,01000000b,00010000b,00000100b,00000001b
; ---------------- unroll for a 16byte multiples writes:
; offsets 56..95: the same 40 bytes again, so offsets 16..95 form one
; 80-byte block (5 x 16 bytes) for code that writes 16 bytes at a time
db 00000000b,01000000b,00010000b,00000100b,00000001b,00000000b,01000000b,00010000b
db 00000100b,00000001b,00000000b,01000000b,00010000b,00000100b,00000001b,00000000b
db 01000000b,00010000b,00000100b,00000001b,00000000b,01000000b,00010000b,00000100b
db 00000001b,00000000b,01000000b,00010000b,00000100b,00000001b,00000000b,01000000b
db 00010000b,00000100b,00000001b,00000000b,01000000b,00010000b,00000100b,00000001b
; 64-bit version: OR the every10th_at7 mask into the buffer at _buff1.
; A 16-byte prolog is followed by 10 passes of 40 bytes (416 bytes in total).
; Overwrites rax, rdx, rdi, rbx, rbp, rsi, ecx and the flags.
; NOTE(review): rbx and rbp (and rsi/rdi on Windows) are callee-saved in the
; usual 64-bit calling conventions - save them if this ends up inside a function.
mov rsi, _buff1
; rsi points to a buffer
; prolog: table bytes 0..15 go onto buffer bytes 0..15
mov rax, [every10th_at7]
mov rdx, [every10th_at7+8]
or [rsi], rax
or [rsi+8], rdx
add rsi, 16
; keep the 40 loop bytes (table offsets 16..55) in five registers
mov rax, [every10th_at7+16]
mov rdx, [every10th_at7+24]
mov rdi, [every10th_at7+32]
mov rbx, [every10th_at7+40]
mov rbp, [every10th_at7+48]
; number of 40byte blocks
mov ecx, 10
@@:
; 40 is a multiple of the table's 5-byte unit, so every pass reuses the same five masks
or [rsi], rax
or [rsi+8], rdx
or [rsi+16], rdi
or [rsi+24], rbx
or [rsi+32], rbp
add rsi, 40
dec ecx
jnz @b
; 32-bit SSE2 version: OR the every10th_at7 mask into the buffer at _buff1.
; A 16-byte prolog is followed by 5 passes of 80 bytes (416 bytes in total).
; In each pass the first 64 bytes go through XMM registers and the last 16
; bytes through four dword ORs.
; Overwrites eax, edx, ecx, edi, ebx, esi, xmm0-xmm7 and the flags.
mov esi, _buff1
; esi points to 16 byte aligned buffer
; prolog: table bytes 0..15 go onto buffer bytes 0..15
mov eax, [every10th_at7]
mov edx, [every10th_at7+4]
mov ecx, [every10th_at7+8]
mov edi, [every10th_at7+12]
or [esi], eax
or [esi+4], edx
or [esi+8], ecx
or [esi+12], edi
add esi, 16
; loop masks: table offsets 16..79 in xmm0-xmm3, offsets 80..95 in four GPRs
; (movdqa needs 16-byte aligned addresses; the table is declared after align 16)
movdqa xmm0, [every10th_at7+16]
movdqa xmm1, [every10th_at7+32]
movdqa xmm2, [every10th_at7+48]
movdqa xmm3, [every10th_at7+64]
mov eax, [every10th_at7+80]
mov edx, [every10th_at7+84]
mov ebx, [every10th_at7+88]
mov edi, [every10th_at7+92]
; number of 80 byte blocks
mov ecx, 5
@@:
; work on copies so xmm0-xmm3 keep the masks for the next pass
movdqa xmm4, xmm0
movdqa xmm5, xmm1
movdqa xmm6, xmm2
movdqa xmm7, xmm3
; merge the current buffer contents into the copies
por xmm4, [esi]
por xmm5, [esi+16]
por xmm6, [esi+32]
por xmm7, [esi+48]
; last 16 bytes of the block: OR straight into memory
or [esi+64], eax
or [esi+68], edx
or [esi+72], ebx
or [esi+76], edi
; write the merged 64 bytes back
movdqa [esi], xmm4
movdqa [esi+16], xmm5
movdqa [esi+32], xmm6
movdqa [esi+48], xmm7
; 80 = 5 x 16, so esi stays 16-byte aligned and the mask phase is unchanged
add esi, 80
dec ecx
jnz @b
mov rax, [every10th_at7]
mov rdx, [every10th_at7+8]
mov rdi, [every10th_at7+16]
mov rbx, [every10th_at7+24]
mov rbp, [every10th_at7+32]