I'd like to fill an array of 1000 dword with the string values ranging
from ' 0' to ' 999'.
To avoid 400k increment in program size and writing manually 1000
values in .data, I prefer to declare the array in .data? and fill the
array with a proc: call FillArray.
I've some ideas on how to do that, but before starting the tests I'd like
your suggestions, code, already done experiments.. to think upon.
Let me know.
Frank
;-----------------------------------------------------------------------
; FillArray - fills MyArray (1000 dwords) with the binary values 0..999.
; In:    none (MyArray is a module-level array; should be 4-aligned)
; Out:   MyArray[i] = i for i = 0..999
; Clobb: eax, ecx, flags (ebx/esi/edi preserved via USES)
; NOTE(review): stores binary integers, NOT the ASCII strings
;   ' 0'..' 999' requested in the opening post.
;-----------------------------------------------------------------------
FillArray PROC USES EBX ESI EDI
        mov     edi, offset MyArray     ; destination pointer
        xor     eax, eax                ; current value, starts at 0
        mov     ecx, 1000               ; one element per iteration
FillNext:
        mov     [edi], eax              ; MyArray[i] = i
        add     edi, 4                  ; advance to next dword slot
        add     eax, 1                  ; next value
        dec     ecx
        jnz     FillNext
        ret
FillArray ENDP
Quote from: dedndave on December 09, 2012, 03:04:46 AM
FillArray PROC USES EBX ESI EDI
mov edi,offset MyArray ;or whatever you name it - it should be 4-aligned
xor eax,eax
mov ebx,1
mov edx,2
mov esi,3
mov ecx,1000/4
FArry0: mov [edi],eax
mov [edi+4],ebx
mov [edi+8],edx
mov [edi+12],esi
add eax,4
add ebx,4
add edx,4
add esi,4
add edi,16
sub ecx,1
jnz FArry0
ret
FillArray ENDP
Hi Dave, I'm not sure this routine matches my need.
You sure this routine fills the array with strings 4
bytes long containing the ascii representation of
numbers 0-999 with leading spaces?
ohhhhhhhhhhhhhhh
no - i am sure that it does not - lol
it fills the array with the binary values
give me a minute.....
EDIT: it is run once at the beginning of the program
so, i would not think speed is super-critical, anyways
;-----------------------------------------------------------------------
; FillArray - converts 0..999 to decimal ASCII via the C runtime,
;             writing one string into each 4-byte slot of MyArray.
; Clobbers the CRT-volatile registers; EBX/ESI/EDI preserved via USES.
; NOTE(review): crt__ultoa emits left-justified, NUL-terminated text,
;   so values below 100 are NOT right-aligned with leading spaces as
;   the ' 0'..' 999' spec in the opening post requires - the thread
;   itself acknowledges this version does not match the requested
;   format. TODO confirm intended output format before reuse.
;-----------------------------------------------------------------------
FillArray PROC USES EBX ESI EDI
mov edi,offset MyArray ;or whatever you name it
xor esi,esi ;esi = current value (0..999)
mov ebx,1000 ;ebx = remaining element count
FArry0: INVOKE crt__ultoa,esi,edi,10 ;decimal text of esi written at [edi]
add edi,4 ;advance to next 4-byte slot
inc esi ;next value
dec ebx
jnz FArry0
ret
FillArray ENDP
Before you design an algo, you first need to specify:
(i) if the ascii representations will be left-aligned or right-aligned within its 4-byte space when you print that 4000-byte string,
(ii) if right-aligned, should it have leading 0's or leading spaces for the numbers below 100,
(iii)which character do you need as the 4th character since each number needs at most 3 characters in ascii format.
Quote from: raymond on December 09, 2012, 03:31:05 AM
Before you design an algo, you first need to specify:
(i) if the ascii representations will be left-aligned or right-aligned within its 4-byte space when you print that 4000-byte string,
(ii) if right-aligned, should it have leading 0's or leading spaces for the numbers below 100,
(iii)which character do you need as the 4th character since each number needs at most 3 characters in ascii format.
If you read the first post, all these questions will be answered.
All the answers lies in " 0" and " 999". :P
Dave: the use of
crt__ultoa makes me doubt it will be
the fastest around.
In a few days we'll see. :t
Seems there are not too many solutions to be fast.
put 30h in a register = 0 >>> in memory
cmp 3ah
inc = 1 >>> //
..
inc 3Ah = 3130h = 10 >>> in memory
inc
cmp ...
Quote from: ToutEnMasm on December 09, 2012, 03:52:45 AM
Seems there is not to many soluces to be fast.
put 30h in a register = 0 >>> in memory
cmp 3ah
inc = 1 >>> //
..
inc 3Ah = 3130h = 10 >>> in memory
inc
cmp ...
This is a good solution, but I don't think it is the fastest around.
By the way if you want to post your test results we can compare
them with other solutions.
i didn't test it - and it may have a bug or 2 to work out
but, you get the idea
;-----------------------------------------------------------------------
; FillArray - builds the " 0".." 999" table by keeping the ASCII image
;             of the current string in EAX and incrementing it in place.
; Initial EAX = 00302020h, stored little-endian as bytes 20,20,30,00,
; i.e. "  0" plus a NUL terminator.  Byte 2 (bits 16-23) is the units
; digit; AH appears to hold the tens position and AL the hundreds.
; NOTE(review): posted explicitly untested by its author ("it may have
;   a bug or 2 to work out").  The carry paths (FArry0/FArry1/FArry2)
;   and the termination compare against 393030h have NOT been verified
;   here either - treat this block as a sketch, not working code.
;-----------------------------------------------------------------------
FillArray PROC
mov edx,offset MyArray ;or whatever you name it - it should be 4-aligned
mov eax,302020h ;EAX = " 0",0
jmp short FArry4
FArry0: and eax,030FFFFh ;units wrapped: mask back - TODO confirm mask intent
cmp ah,39h ;tens position at '9'?
jz FArry1
cmp ah,20h ;tens position still a space?
jz FArry2
add ah,1 ;ordinary tens increment
jmp short FArry4
FArry1: sub eax,8FFh ;tens '9'->'0' with carry into hundreds - TODO verify
cmp al,30h
ja FArry4
mov al,31h ;hundreds enters play as '1'
jmp short FArry4
FArry2: mov ah,31h ;first tens digit: space -> '1'
jmp short FArry4
FArry3: add eax,10000h ;fast path: bump units digit (byte 2)
FArry4: mov [edx],eax ;store current 4-byte string
add edx,4
cmp eax,393030h ;loop control / carry detection - TODO verify bound
jb FArry3
ja FArry0
ret
FillArray ENDP
Better if you do test it and post the results. We are in the lab here,
and, by the way, you should use:
mov edx,offset MyArray
mov eax,30202020h ;EAX = " 0"
instead of:
mov edx,offset MyArray ;or whatever you name it - it should be 4-aligned
mov eax,302020h ;EAX = " 0",0
to meet the requisites of the test. :P
Quote from: frktons on December 09, 2012, 04:26:58 AM
Better if you do test it and post the results. We are in the lab here...
better for who ?
i got you started - you can't get it from there ??? :lol:
Quote from: frktons on December 09, 2012, 04:26:58 AM
...and, by the way, you should use:
mov edx,offset MyArray
mov eax,30202020h ;EAX = " 0"
instead of:
mov edx,offset MyArray ;or whatever you name it - it should be 4-aligned
mov eax,302020h ;EAX = " 0",0
to meet the requisites of the test. :P
no - we want to start with 2 spaces, an ASCII 0, and a null terminator
so the high byte of EAX should be binary 0
i will give you another idea, though
it might be slightly faster to start at the end of the array with "999",0 and work down
Quote
This is a good solution, but I don't think it is the fastest around.
By the way if you want to post your test results we can compare
them with other solutions.
I am not interested in winning some µs in a prog which is just used one time.
it was just a pseudo-code sample.
Folks,
If it is supposed to be fast, it must be a solution with dword-packed xmm regs.
Load them with
[ 0 1 2 3]
first, then add
[ 4 4 4 4]
to get
[ 4 5 6 7]
Go ahead, Frank!
Quote from: ToutEnMasm on December 09, 2012, 04:40:56 AM
Quote from: dedndave on December 09, 2012, 04:36:46 AM
i got you started - you can't get it from there ??? :lol:
I'm not able to correct your code, better if you do it yourself. :eusa_snooty:
Quote
no - we want to start with 2 spaces, an ASCII 0, and a null terminator
so the high byte of EAX should be binary 0
Well that could be another test, now the test is what I stated in the first post. :P
Quote
I am not interested in wining some µs in a prog who is just used one time.
it was just a pseudo-code sample.
In my intentions this will be part of a standard proc/routine I'm thinking about.
If you are not interested in the matter, it doesn't matter, you are welcome the
same. :t
Quote from: jj2007 on December 09, 2012, 04:44:52 AM
Folks,
If it is supposed to be fast, it must be a solution with dword-packed xmm regs.
Load them with
[ 0 1 2 3]
first, than add
[ 4 4 4 4]
to get
[ 4 5 6 7]
Go ahead, Frank!
This is what I was thinking about. It is not yet clear in my mind the sequence to
make it the fastest [how many xmm registers to use, which instructions...] but that
is the idea for a fast filling. :t
If you want them in ascending sequence, it should be [ 3 2 1 0] since the xmm reg is loaded/stored in little endian order. Then you need to take care of carries from 9 to 0 with the next character incremented. This could get troublesome.
What I did was to set a string up to "0000", pick it up and store it, set a pointer to the lowest character, then increment the character pointed to by the pointer. Check if the character exceeds '9', if not pickup and save the string. If greater then '9' add (256-10) to the WORD which has the current pointer as its lowest BYTE which will set the current character to '0' (256-10+'9'+1 = '0' plus a carry to the upper BYTE) and increment the next higher character, then change the pointer to check that character for exceeding '9', etc, etc. After finding a valid character, always drop the pointer back to the lowest character and pick up the value and save it and increment the character at the pointer.
Dave
Quote from: KeepingRealBusy on December 09, 2012, 05:05:11 AM
If you want them in ascending sequence, it should be [ 3 2 1 0] since the xmm reg is loaded/stored in little endian order. Then you need to take of carries frpm 9 to 0 with the next character incremented. This could get troublesome.
What I did was to set a string up to "0000", pick it up and store it, set a pointer to the lowest character, then increment the character pointed to by the pointer. Check if the character exceeds '9', if not pickup and save the string. If greater then '9' add (256-10) to the WORD which has the current pointer as its lowest BYTE which will set the current character to '0' (256-10+'9'+1 = '0' plus a carry to the upper BYTE) and increment the next higher character, then change the pointer to check that character for exceeding '9', etc, etc. After finding a valid character, always drop the pointer back to the lowest character and pick up the value and save it and increment the character at the pointer.
Dave
The order of bytes you suggested is correct, Dave.
It should not be that difficult, at least as I foresee it, I haven't yet
implemented it, but I only imagined it. Jochen's suggestion is
probably the fastest path, but we are still in the preliminary steps,
everything should be tested before seeing the results.
Your idea seems to use mem vars, and this could slow down the
process.
If you test it, let me know your results. :t
You could use regs and different offsets in the regs for the testing and correcting. I was using an 8 byte value for my test.
Dave.
A couple of years ago I wrote a prototype in PBCC and after I translated it
in asm for a program that displays in a console screen all the ASCII chars
from 1 to 255, each one of them preceded by the ASCII number of the char.
I used 3 BYTE variables to represent the hundreds, tens and units, and filled
the console buffer with them.
Now I'd like to have a much faster routine, so instead of variables I'm going
to use GPRS and XMM registers. I need some time to write and test the code.
In the meanwhile if somebody tries any solution, is welcome to post the
solution and the performance test results. ;)
Quote from: frktons on December 09, 2012, 02:49:23 AM
I'd like to fill an array of 1000 dword with the string values ranging
from ' 0' to ' 999'.
To avoid 400k increment in program size and writing manually 1000
values in .data, I prefer to declare the array in .data? and fill the
array with a proc: call FillArray.
I've some ideas on how to to that, but before starting the tests I'd like
your suggestions, code, already done experiments.. to think upon.
Let me know.
Frank
Rather than attempting to contain a large array inside your program it would be better to allocate memory from a heap at run time. It's the same thing... but it doesn't bloat the daylights out of your program image.
tater,
1000 dwords is quite small, in terms of memory allocation
when you put something in the uninitialized data section, the size of the exe does not increase
only the image at run-time
if you allocate memory or put it in uninitialized data, it uses memory either way
the advantage to allocation is that it may be free'd up
so - if the table is to be used throughout program execution, that advantage is lost
Quote from: dedndave on December 09, 2012, 06:54:28 AM
tater,
1000 dwords is quite small, in terms of memory allocation
when you put something in the uninitialized data section, the size of the exe does not increase
only the image at run-time
if you allocate memory or put it in uninitialized data, it uses memory either way
the advantage to allocation is that it may be free'd up
so - if the table is to be used throughout program execution, that advantage is lost
Agreed, the idea is to use the array throughout program execution.
Quote from: dedndave on December 09, 2012, 06:54:28 AM
when you put something in the uninitialized data section, the size of the exe does not increase
only the image at run-time
Test it:
; Minimal MASM32 demo: reserves 280000 uninitialized dwords in .data?
; (BSS-style section - does not grow the EXE on disk) and waits for a
; keystroke.  Posted to demonstrate the "well-known MASM bug": very
; slow assembly of large .data? arrays, at least with ML 6.x -
; TODO confirm which ML versions are affected.
include \masm32\include\masm32rt.inc
.data?
MyArray dd 280000 dup(?) ;~1.1 MB reserved, zero at load, zero file cost
.code
start: inkey "Hello World" ;masm32rt macro: print text, wait for a key
exit
end start
... and get to know a well-known MASM bug. Even ML 10 still shows that odd behaviour...
yah - i know that bug, Jochen - lol
but, we are only allocating 1000 dwords :biggrin:
Quote from: jj2007 on December 09, 2012, 07:14:20 AM
Test it:
include \masm32\include\masm32rt.inc
.data?
MyArray dd 280000 dup(?)
.code
start: inkey "Hello World"
exit
end start
... and get to know a well-known MASM bug. Even ML 10 still shows that odd behaviour...
I didn't get any strange behaviour, and the exe size is 3072 bytes.
when the pgm runs it displays
Hello World and waits for the
keystroke. :icon_rolleyes:
N.B. I use ML10.
the strange behaviour is that it takes long to assemble - at least using ML v6.x it does
here you go, Frank
i get about 2.3 cycles per string on my P4 prescott
Quote from: dedndave on December 09, 2012, 08:41:05 AM
the strange behaviour is that it takes long to assemble - at least using ML v6.x it does
here you go, Frank
i get about 2.3 cycles per string on my P4 prescott
Good starting point, Dave. Let's see the next algos to decide the fastest around.
On my pc I have 1.3 cycles per string. :t
Quote
;that'll be $50, Frank
We'll see in few days if you earned them :lol:
For the time being, Dave's routine got these results on my pc:
Quote
------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
1934 cycles for Dedndave code
------------------------------------------------------------------------
1927 cycles for Dedndave code
------------------------------------------------------------------------
1917 cycles for Dedndave code
------------------------------------------------------------------------
1918 cycles for Dedndave code
------------------------------------------------------------------------
That is about 1.9 cycles per dword string. Not bad. I think we
can arrive at 0.7 cycles per dword string, but it has yet to be
demonstrated. :lol:
Quote from: frktons on December 09, 2012, 09:01:31 AMWe'll see in few days if you earned them :lol:
i accept pay-pal :biggrin:
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2556 cycles for Dedndave code
------------------------------------------------------------------------
2256 cycles for Dedndave code
------------------------------------------------------------------------
2258 cycles for Dedndave code
------------------------------------------------------------------------
2271 cycles for Dedndave code
------------------------------------------------------------------------
For Dave's code I get 2292 2299 2303 2302 2290 on a P4 Northwood and 2674 2676 2674 2677 2690 on a P3. Frank's EXE will not run on my P3 Windows 2000 system:
"Test_FillArray.exe is not a valid Win32 application."
But on my P4 Northwood Windows XP system I typically get:
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2
------------------------------------------------------------------------
2676 cycles for Dedndave code
------------------------------------------------------------------------
2655 cycles for Dedndave code
------------------------------------------------------------------------
2306 cycles for Dedndave code
------------------------------------------------------------------------
2277 cycles for Dedndave code
------------------------------------------------------------------------
But running it multiple times the first one or two results are significantly higher than the others, suggesting that the code is not delaying long enough after it loads before it starts counting cycles. After adding a:
invoke Sleep, 5000
Below the start label and assembling and linking with ML 6.15.8803 and Link 5.12.8078, typical results are:
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2
------------------------------------------------------------------------
2278 cycles for Dedndave code
------------------------------------------------------------------------
2296 cycles for Dedndave code
------------------------------------------------------------------------
2263 cycles for Dedndave code
------------------------------------------------------------------------
2285 cycles for Dedndave code
------------------------------------------------------------------------
And the EXE will then run on my P3 system:
------------------------------------------------------------------------
Pentium III
Instructions: MMX, SSE1
------------------------------------------------------------------------
2457 cycles for Dedndave code
------------------------------------------------------------------------
2437 cycles for Dedndave code
------------------------------------------------------------------------
2437 cycles for Dedndave code
------------------------------------------------------------------------
2439 cycles for Dedndave code
------------------------------------------------------------------------
That will be a close race between Dave and Frank...
:biggrin:
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
loop overhead is approx. 189/100 cycles
1556 cycles for 100 * FA Dave
541 cycles for 100 * FA Jochen
1557 cycles for 100 * FA Frank
1556 cycles for 100 * FA Dave
539 cycles for 100 * FA Jochen
1557 cycles for 100 * FA Frank
1558 cycles for 100 * FA Dave
542 cycles for 100 * FA Jochen
1564 cycles for 100 * FA Frank
230 bytes for FA Dave
281 bytes for FA Jochen
350 bytes for FA Frank
Quote from: jj2007 on December 09, 2012, 01:21:46 PM
That will be a close race between Dave and Frank...
:biggrin:
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
loop overhead is approx. 189/100 cycles
1556 cycles for 100 * FA Dave
541 cycles for 100 * FA Jochen
1557 cycles for 100 * FA Frank
1556 cycles for 100 * FA Dave
539 cycles for 100 * FA Jochen
1557 cycles for 100 * FA Frank
1558 cycles for 100 * FA Dave
542 cycles for 100 * FA Jochen
1564 cycles for 100 * FA Frank
230 bytes for FA Dave
281 bytes for FA Jochen
350 bytes for FA Frank
Actually I didn't write any code. I only posted Dave's code in
another testbed. Well with your routine you demonstrated that
the good job of Dave could be optimized with better registers
and parallel computing.
i don't think Frank has any working code, per se
looks like you got the speed thing :t
did you verify the resulting table ? :P
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
++18 of 20 tests valid, loop overhead is approx. 246/100 cycles
2069 cycles for 100 * FA Dave
915 cycles for 100 * FA Jochen
2230 cycles for 100 * FA Frank
2029 cycles for 100 * FA Dave
913 cycles for 100 * FA Jochen
2592 cycles for 100 * FA Frank
2034 cycles for 100 * FA Dave
930 cycles for 100 * FA Jochen
2217 cycles for 100 * FA Frank
Jochen,
the source you posted is for MasmBasic editor, I suppose,
couldn't you post a normal ascii text file, in order to see what it
actually does?
Frank
it's a "rich" text file, Frank
i think you can open it with WordPad, if you don't have anything else :t
this is Jochen's data and code
;-----------------------------------------------------------------------
; TestB ("FA Jochen") - SSE fill of MyArray with the ' 0'..' 999' ASCII
; table, four 4-byte strings per 16-byte movaps store.
; XMM0 carries four consecutive string images; the Add*/Sub* constant
; rows below bump the digit bytes by packed-dword adds, with the
; 0FAxxxxxxh rows apparently being composite corrections for the 9->0
; carries every 10th/100th value - not independently verified here.
; Uses masm32/MasmBasic macros (m2m, .Repeat/.Until, .if Zero?).
; Returns eax = offset MyArray; clobbers ecx, edx, esi, edi, xmm0-xmm4.
; NOTE(review): Src0123 is loaded with movaps as 16 bytes although the
;   db shows only 8 characters - the forum transcription likely
;   collapsed whitespace; TODO confirm against the original attachment.
;-----------------------------------------------------------------------
align 16
TestB_s:
Src0123 db " 0 1 2 3" ;seed: first four 4-char entries (spacing suspect, see note)
Add4444 dd 04000000h, 04000000h, 04000000h, 04000000h ; xmm1
Add44xx dd 04000000h, 04000000h, 0FA110000h, 0FA110000h ; xmm2
Addxx44 dd 0FA110000h, 0FA110000h, 04000000h, 04000000h ; xmm3
Addxxxx dd 0FA010000h, 0FA010000h, 0FA010000h, 0FA010000h ; xmm4
Add244xx dd 04000000h, 04000000h, 0FA010000h, 0FA010000h ; xmm2
Add2xx44 dd 0FA010000h, 0FA010000h, 04000000h, 04000000h ; xmm3
Add2xxxx dd 0FA010000h, 0FA010000h, 0FA010000h, 0FA010000h ; xmm4
Sub100a dd 00009EF00h, 00009EF00h, 00009EF00h, 00009EF00h
Sub100b dd 00009FF00h, 00009FF00h, 00009FF00h, 00009FF00h
NameB equ FA Jochen ; assign a descriptive name here
TestB proc
mov esi, offset Src0123 ;esi = base of seed + add-constant table
mov edi, offset MyArray ;edi = output cursor
push edi ;saved so eax can return the array address
xor ecx, ecx
movaps xmm0, [esi] ;xmm0 = current four strings
movaps xmm1, [esi+16]
movaps xmm2, [esi+32]
movaps xmm3, [esi+48]
movaps xmm4, [esi+64]
lea edx, [edi+4000] ;edx = end of the 1000-dword array
m2m ecx, -5 ;phase counter for the carry schedule
; align 4
.Repeat
movaps [edi], xmm0
paddd xmm0, xmm1 ; 4444
movaps [edi+16], xmm0
paddd xmm0, xmm2 ; 44xx
movaps [edi+32], xmm0
paddd xmm0, xmm3 ; xx44
movaps [edi+48], xmm0
paddd xmm0, xmm1 ; 4444
movaps [edi+64], xmm0
paddd xmm0, xmm4 ; xxxx
inc ecx
.if Zero?
psubd xmm0, oword ptr Sub100a ;hundred-boundary correction (first form)
.elseif ecx==-4
movaps xmm2, [esi+80] ;switch to the Add2* constant set
movaps xmm3, [esi+96]
movaps xmm4, [esi+112]
.elseif ecx==5
psubd xmm0, oword ptr Sub100b ;hundred-boundary correction (second form)
xor ecx, ecx ;restart the phase cycle
.endif
add edi, 80 ;20 strings written per pass
.Until edi>=edx
pop eax ;return offset MyArray
ret
TestB endp
TestB_endp:
Quote from: dedndave on December 09, 2012, 02:04:35 PM
it's a "rich" text file, Frank
i think you can open it with WordPad, if you don't have anything else :t
Frank does have something else: \Masm32\RichMasm\RichMasm.exe
Drag *.asc over RichMasm.exe, then click on the bookmarks "Test A" and "Test B" on the right :icon_mrgreen:
By the way: performance drops dramatically if you replace movaps with movups. But there is a well-known workaround, see attachment.
For MisAlign=0 and MisAlign=8, performance is almost like movaps (ca. 8 cycles slower, see below). For all other values, it is still almost 500 cycles faster than Dave's non-SSE code.
MisAlign=0:
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
loop overhead is approx. 189/100 cycles
1558 cycles for 100 * FA Dave
540 cycles for 100 * FA Jochen
547 cycles for 100 * FA Jochen unaligned
1557 cycles for 100 * FA Dave
539 cycles for 100 * FA Jochen
548 cycles for 100 * FA Jochen unalignedMisAlign=3:
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
loop overhead is approx. 189/100 cycles
1845 cycles for 100 * FA Dave
1365 cycles for 100 * FA Jochen unaligned
1853 cycles for 100 * FA Dave
1362 cycles for 100 * FA Jochen unaligned
I got times around 850 for my code (no sse) but half the size.
Here's your latest jj
AMD Phenom(tm) II X6 1100T Processor (SSE3)
loop overhead is approx. 231/100 cycles
856 cycles for 100 * FA Dave
449 cycles for 100 * FA Jochen
436 cycles for 100 * FA Jochen unaligned
803 cycles for 100 * FA Dave
445 cycles for 100 * FA Jochen
435 cycles for 100 * FA Jochen unaligned
809 cycles for 100 * FA Dave
454 cycles for 100 * FA Jochen
433 cycles for 100 * FA Jochen unaligned
230 bytes for FA Dave
281 bytes for FA Jochen
141 bytes for FA Jochen unaligned
Here is a source code to verify mmx is faster:
Quote
;-----------------------------------------------------------------------
; Standalone test program (ToutEnMasm): fills 'array' (1000 dwords)
; with 4-char images built in EAX (initial 30303020h = bytes
; 20h,'0','0','0'), then dumps the table to resultat.txt, 100 entries
; (400 bytes) per line.
; Digit layout in EAX: byte 3 = units (add 1000000h), byte 2 = tens
; (add 10000h, rollover checked via CL after shr 16), AH = hundreds.
; NOTE(review): introduced in the thread as "verify mmx is faster",
;   but no MMX instruction appears anywhere - GPRs only (also pointed
;   out later in the thread).
;-----------------------------------------------------------------------
.data
array dd 1000 dup (0)
controle dd 0 ;guard dword after the array (debug aid)
Nomfichier db "resultat.txt",0
Hfile dd 0
NumberOfBytesWritten dd 0
retourligne db 13,10,0 ;CR/LF written after each 400-byte line
.code
start:
mov eax,30303020h ; 000
mov edx,offset array
unit: ;unrolled inner loop: units digits 0..9
mov [edx],eax ;000
add eax,1000000h
add edx,4
mov [edx],eax ;1
add eax,1000000h
add edx,4
mov [edx],eax ;2
add eax,1000000h
add edx,4
mov [edx],eax ;3
add eax,1000000h
add edx,4
mov [edx],eax ;4
add eax,1000000h
add edx,4
mov [edx],eax ;5
add eax,1000000h
add edx,4
mov [edx],eax ;6
add eax,1000000h
add edx,4
mov [edx],eax ;7
add eax,1000000h
add edx,4
mov [edx],eax ;8
add eax,1000000h
add edx,4
mov [edx],eax ;9
;-----------------------------
sub eax,9000000h ;reset units digit back to '0'
add eax,10000h ;bump tens digit
add edx,4
mov ecx,eax
shr ecx,16 ;cl = tens character
.if cl == 3Ah ;1- ;tens overflowed past '9'?
sub eax,0A0000h ;tens back to '0'
inc ah ;303030h + 10000h ;carry into hundreds
.if ah == 3Ah ;hundreds past '9' => 1000 entries done
;fini
jmp fin
.else
jmp unit
.endif
.else
jmp unit
.endif
fin:
lea edx,controle ;debug limit must be NULL
lea edx,array ;debug view what is in memory
invoke CreateFile,addr Nomfichier,GENERIC_WRITE,NULL,\
NULL,CREATE_ALWAYS,FILE_ATTRIBUTE_NORMAL,0
mov Hfile,eax ;NOTE(review): no INVALID_HANDLE_VALUE check
mov edx,offset array
mov ecx,0
ecrire: ;write 10 lines of 100 entries each
push edx
push ecx
invoke WriteFile,Hfile,edx,400,addr NumberOfBytesWritten,NULL
invoke WriteFile,Hfile,addr retourligne,2,addr NumberOfBytesWritten,NULL
pop ecx
pop edx
add edx,400
add ecx,100
.if ecx != 1000
jmp ecrire
.endif
invoke CloseHandle,Hfile
invoke ExitProcess,0
;################################################################
end start
ascii table is put in the texte file 000 to 999
116 bytes of slackware
;-----------------------------------------------------------------------
; TestA ("FA Sinsi") - fills MyArray (1000 dwords) with the 4-byte
; ASCII images "000".."999" (hundreds,tens,units,NUL in memory order).
; EBP carries the current string image: byte 0 = hundreds char
; (inc ebp in the l2 loop), byte 1 = tens char (add ebp,100h in the
; l1 loop), byte 2 = units char (lea +n*10000h inside proc0).
; Returns eax = offset MyArray; all other registers preserved (pusha).
;
; FIX(review): the internal helper proc0 deliberately loads EBP with
; the 00050000h stride constant and, as posted, never restored it.
; After the first call the caller's "add ebp,00000100h" tens loop then
; operated on 00050000h instead of the digit pattern, corrupting every
; entry from the second call onward.  proc0 now saves/restores EBP.
;-----------------------------------------------------------------------
TestA proc
pusha
mov edi,offset MyArray ;edi = output cursor, advanced by proc0
mov ebp,00303030h ;"000" seed pattern
mov ecx,10 ;hundreds loop
l2: mov edx,10 ;tens loop
push ebp ;keep hundreds base across the tens loop
l1: call proc0 ;emit units 0..9 for current hundreds/tens
add ebp,00000100h ;next tens character
sub edx,1
jnz l1
pop ebp
inc ebp ;add ebp,00000001h ;next hundreds character
sub ecx,1
jnz l2
popa
mov eax,offset MyArray
ret
; proc0 - stores 10 entries (units '0'..'9') at [edi], advances edi by 40.
; In: ebp = base pattern, edi = cursor. Preserves ecx, edx, ebp.
proc0:
push ecx
push edx
push ebp ;FIX: preserve the digit pattern for the caller
lea eax,[ebp] ;units '0'..'4' built in five registers
lea ebx,[ebp+00010000h]
lea ecx,[ebp+00020000h]
lea edx,[ebp+00030000h]
lea esi,[ebp+00040000h]
mov ebp,00050000h ;stride to reach units '5'..'9'
mov [edi],eax
mov [edi+4],ebx
mov [edi+8],ecx
mov [edi+12],edx
mov [edi+16],esi
add eax,ebp
add ebx,ebp
add ecx,ebp
add edx,ebp
add esi,ebp
mov [edi+20],eax
mov [edi+24],ebx
mov [edi+28],ecx
mov [edi+32],edx
mov [edi+36],esi
add edi,40
pop ebp ;FIX: restore caller's pattern
pop edx
pop ecx
ret
TestA endp
This would be nice to do in 64-bit with all those regs, but the xmm's are much better.
Quote from: ToutEnMasm on December 09, 2012, 07:03:32 PM
Here is a source code to verify mmx is faster:
Where's da mmx, Yves?
Quote from: sinsi on December 09, 2012, 07:15:06 PM
116 bytes of slackware
...
This would be nice to do in 64-bit with all those regs, but the xmm's are much better.
SSE2 is difficult to beat indeed...
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
loop overhead is approx. 189/100 cycles
1557 cycles for 100 * FA Dave
540 cycles for 100 * FA Jochen
1838 cycles for 100 * FA Sinsi
547 cycles for 100 * FA Jochen unaligned
1540 cycles for 100 * FA Yves
Just goes to show, these tests are of academic interest only :biggrin:
AMD Phenom(tm) II X6 1100T Processor (SSE3)
loop overhead is approx. 228/100 cycles
786 cycles for 100 * FA Dave
441 cycles for 100 * FA Jochen
849 cycles for 100 * FA Sinsi
419 cycles for 100 * FA Jochen unaligned
1021 cycles for 100 * FA Yves
786 cycles for 100 * FA Dave
437 cycles for 100 * FA Jochen
849 cycles for 100 * FA Sinsi
431 cycles for 100 * FA Jochen unaligned
1029 cycles for 100 * FA Yves
779 cycles for 100 * FA Dave
435 cycles for 100 * FA Jochen
846 cycles for 100 * FA Sinsi
417 cycles for 100 * FA Jochen unaligned
1023 cycles for 100 * FA Yves
230 bytes for FA Dave
281 bytes for FA Jochen
116 bytes for FA Sinsi
141 bytes for FA Jochen unaligned
141 bytes for FA Yves
4208864 = eax FA Dave
4208864 = eax FA Jochen
4208864 = eax FA Sinsi
4208864 = eax FA Jochen unaligned
4208864 = eax FA Yves
it should generate leading-zero suppressed strings, right-justified in a field of 3 spaces
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
loop overhead is approx. 240/100 cycles
2040 cycles for 100 * FA Dave
919 cycles for 100 * FA Jochen
2609 cycles for 100 * FA Sinsi
1226 cycles for 100 * FA Jochen unaligned
1854 cycles for 100 * FA Yves
2040 cycles for 100 * FA Dave
943 cycles for 100 * FA Jochen
2606 cycles for 100 * FA Sinsi
1197 cycles for 100 * FA Jochen unaligned
1816 cycles for 100 * FA Yves
2037 cycles for 100 * FA Dave
920 cycles for 100 * FA Jochen
2602 cycles for 100 * FA Sinsi
1196 cycles for 100 * FA Jochen unaligned
1892 cycles for 100 * FA Yves
Quote from: sinsi on December 09, 2012, 08:05:45 PM
Just goes to show, these tests are of academic interest only :biggrin:
Absolutely :greensml:
New version:
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
loop overhead is approx. 189/100 cycles
1556 cycles for 100 * FA Dave
1811 cycles for 100 * FA Sinsi
441 cycles for 100 * FA Jochen unaligned
1541 cycles for 100 * FA YvesP.S.: If you don't agree with the suffix "FINAL", write a faster algo :bgrin:
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
loop overhead is approx. 250/100 cycles
2038 cycles for 100 * FA Dave
2590 cycles for 100 * FA Sinsi
1188 cycles for 100 * FA Jochen unaligned
1813 cycles for 100 * FA Yves
2079 cycles for 100 * FA Dave
2593 cycles for 100 * FA Sinsi
1181 cycles for 100 * FA Jochen unaligned
1795 cycles for 100 * FA Yves
2061 cycles for 100 * FA Dave
2587 cycles for 100 * FA Sinsi
1216 cycles for 100 * FA Jochen unaligned
1791 cycles for 100 * FA Yves
Quote
Intel(R) Celeron(R) CPU 2.80GHz (SSE3)
loop overhead is approx. 255/100 cycles
2057 cycles for 100 * FA Dave
2581 cycles for 100 * FA Sinsi
1210 cycles for 100 * FA Jochen unaligned
1858 cycles for 100 * FA Yves
2060 cycles for 100 * FA Dave
2550 cycles for 100 * FA Sinsi
1214 cycles for 100 * FA Jochen unaligned
1855 cycles for 100 * FA Yves
2056 cycles for 100 * FA Dave
2553 cycles for 100 * FA Sinsi
1213 cycles for 100 * FA Jochen unaligned
1857 cycles for 100 * FA Yves
230 bytes for FA Dave
116 bytes for FA Sinsi
149 bytes for FA Jochen unaligned
141 bytes for FA Yves
4208520 = eax FA Dave
4208520 = eax FA Sinsi
4208520 = eax FA Jochen unaligned
4208520 = eax FA Yves
--- ok ---
According to these tests we reached the bottom line, almost.
SSE code is always faster than any other, and Dave won $50
for being the first to post his code :t
As I foresaw:
Quote from: frktons on December 09, 2012, 10:07:32 AM
...
Not bad. I think we
can arrive at 0.7 cycles per dword string, but it has yet to be
demonstrated. :lol:
My test with Jochen's testpad:
Quote
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz (SSE4)
loop overhead is approx. 187/100 cycles
1870 cycles for 100 * FA Dave
648 cycles for 100 * FA Jochen
2442 cycles for 100 * FA Sinsi
791 cycles for 100 * FA Jochen unaligned
2009 cycles for 100 * FA Yves
1886 cycles for 100 * FA Dave
648 cycles for 100 * FA Jochen
2418 cycles for 100 * FA Sinsi
791 cycles for 100 * FA Jochen unaligned
2012 cycles for 100 * FA Yves
1871 cycles for 100 * FA Dave
648 cycles for 100 * FA Jochen
2430 cycles for 100 * FA Sinsi
790 cycles for 100 * FA Jochen unaligned
2010 cycles for 100 * FA Yves
230 bytes for FA Dave
281 bytes for FA Jochen
116 bytes for FA Sinsi
141 bytes for FA Jochen unaligned
141 bytes for FA Yves
4208864 = eax FA Dave
4208864 = eax FA Jochen
4208864 = eax FA Sinsi
4208864 = eax FA Jochen unaligned
4208864 = eax FA Yves
--- ok ---
And the prize for Dave:
I've divided my code in 3 steps, from the slowest to the fastest.
Here it is the slowest, with GPRs registers:
Quote
-------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
-------------------------------------------------------
1914 cycles for Dedndave code
1983 cycles for Frktons I Step
-------------------------------------------------------
1914 cycles for Dedndave code
1973 cycles for Frktons I Step
-------------------------------------------------------
1944 cycles for Dedndave code
1972 cycles for Frktons I Step
-------------------------------------------------------
1913 cycles for Dedndave code
1976 cycles for Frktons I Step
-------------------------------------------------------
I measured it against Dave's code that uses the same
kind of registers.
Here are my results:
------------------------------------------------------------------------
Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2
------------------------------------------------------------------------
1577 cycles for Dedndave code
2461 cycles for Frktons I Step
------------------------------------------------------------------------
1048 cycles for Dedndave code
2492 cycles for Frktons I Step
------------------------------------------------------------------------
1058 cycles for Dedndave code
2497 cycles for Frktons I Step
------------------------------------------------------------------------
1048 cycles for Dedndave code
2495 cycles for Frktons I Step
------------------------------------------------------------------------
--- ok ---
Gunther
An incredible difference between my Core Duo 2 and i7.
Adding and subtracting is very liked by the i7. :icon_eek:
Probably Jochen's solution is hard to beat for the time being. :eusa_clap:
It looks like with GPRs it is quite difficult to go below
1.900 CPU cycles, at least on my machine:
Quote
-----------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
-----------------------------------------------
1914 cycles for Dedndave code - 5 GPRs
2574 cycles for Frktons I Step / 2 GPRs
1889 cycles for Frktons I Step / 4 GPRs
-----------------------------------------------
1940 cycles for Dedndave code - 5 GPRs
2561 cycles for Frktons I Step / 2 GPRs
1886 cycles for Frktons I Step / 4 GPRs
-----------------------------------------------
1913 cycles for Dedndave code - 5 GPRs
2561 cycles for Frktons I Step / 2 GPRs
1900 cycles for Frktons I Step / 4 GPRs
-----------------------------------------------
1913 cycles for Dedndave code - 5 GPRs
2561 cycles for Frktons I Step / 2 GPRs
1887 cycles for Frktons I Step / 4 GPRs
-----------------------------------------------
Only using MMX registers, the performance start to rock.
Quote
------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
1926 cycles for Dedndave code - 5 GPRs
1857 cycles for Frktons I Step / 2 GPRs
1896 cycles for Frktons I Step / 4 GPRs
1320 cycles for Frktons II Step / 5 MMX
830 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1919 cycles for Dedndave code - 5 GPRs
1897 cycles for Frktons I Step / 2 GPRs
1895 cycles for Frktons I Step / 4 GPRs
1321 cycles for Frktons II Step / 5 MMX
831 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1916 cycles for Dedndave code - 5 GPRs
1898 cycles for Frktons I Step / 2 GPRs
1897 cycles for Frktons I Step / 4 GPRs
1319 cycles for Frktons II Step / 5 MMX
830 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1915 cycles for Dedndave code - 5 GPRs
1892 cycles for Frktons I Step / 2 GPRs
1894 cycles for Frktons I Step / 4 GPRs
1327 cycles for Frktons II Step / 5 MMX
830 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
And there are still many things to optimize.
Exception code: 0xc000001d
Fault offset: 0x00001619
STATUS_ILLEGAL_INSTRUCTION = 0xc000001d
he is probably using SSSE3 instructions in "Frktons II Step" :P
;-----------------------------------------------------------------------
; MMX/SSSE3 fill fragment ("Frktons II Step"): per pass, expands two
; seed dwords from Tens into 2 x 5 qwords (10 array entries) in MyArray.
; pshufb (mm5 = Mask2Double) duplicates the seed dword into both halves
; of mm0; mm6 (ONE_TWO) offsets the pair, mm7 (TWO_TWO) is the +2/+2
; stride chained through mm1..mm4 - assumed from the names; the
; constants Tens/Mask2Double/ONE_TWO/TWO_TWO/PtrTens are defined
; elsewhere and not visible here - TODO confirm their values.
; NOTE(review): pshufb on mm regs is SSSE3 - raises illegal-instruction
;   (0xC000001D) on pre-SSSE3 CPUs, as reported in the thread.
; NOTE(review): MMX registers alias the x87 stack; no emms appears in
;   this fragment - confirm the caller executes emms before any FPU use.
;-----------------------------------------------------------------------
lea esi, Tens ;esi = seed table cursor
lea edi, MyArray ;edi = output cursor
movq mm5, Mask2Double ;pshufb control: duplicate low dword
movq mm6, ONE_TWO ;initial pair offset
movq mm7, TWO_TWO ;per-step stride
align 4
@@:
mov eax, [esi] ;first seed dword of this pass
movd mm0, eax
pshufb mm0, mm5 ;mm0 = seed duplicated into both dwords (SSSE3)
paddd mm0, mm6
movq mm1, mm0 ;chain: each qword = previous + stride
paddd mm1, mm7
movq mm2, mm1
paddd mm2, mm7
movq mm3, mm2
paddd mm3, mm7
movq mm4, mm3
paddd mm4, mm7
movq [edi], mm0 ;store 5 qwords = 10 dword entries
movq [edi + 8], mm1
movq [edi + 16], mm2
movq [edi + 24], mm3
movq [edi + 32], mm4
mov eax, [esi + 4] ;second seed dword, same expansion
movd mm0, eax
pshufb mm0, mm5
paddd mm0, mm6
movq mm1, mm0
paddd mm1, mm7
movq mm2, mm1
paddd mm2, mm7
movq mm3, mm2
paddd mm3, mm7
movq mm4, mm3
paddd mm4, mm7
movq [edi + 40], mm0
movq [edi + 48], mm1
movq [edi + 56], mm2
movq [edi + 64], mm3
movq [edi + 72], mm4
add esi, 8 ;two seed dwords consumed
add edi, 80 ;20 entries written
cmp esi, PtrTens ;end of seed table
jl @B
Quote from: sinsi on December 10, 2012, 10:01:46 PM
Exception code: 0xc000001d
Fault offset: 0x00001619
Do you have
PSHUFB on your PC? It is SSSE3, Dave is correct.
You can use a similar instruction to duplicate low dword and store it
to high dword of mmx register, something like:
first_dw dd 0 ; put here the same value of second_dw
second_dw dd 0 ; put here the same value of first_dw
....
lea eax, first_dw
movq mm0, [eax]
N.B. Not tested
by using SSSE3 (or even SSE3), you exclude a lot of the older CPU's that are still in use
really - this is a program init function
it seems kinda silly to exclude a CPU unless SSSE3 is to be used throughout the rest of the program
Quote from: dedndave on December 10, 2012, 10:33:52 PM
by using SSSE3 (or even SSE3), you exclude a lot of the older CPU's that are still in use
really - this is a program init function
it seems kinda silly to exclude a CPU unless SSSE3 is to be used throughout the rest of the program
The owners of old CPUs should read my previous post. :lol:
you are going to tell me i need a new one ? - lol
i am very happy with the one i have
until microsoft or adobe or some other a-hole forces me to be unhappy
Quote from: dedndave on December 10, 2012, 10:38:38 PM
you are going to tell me i need a new one ? - lol
i am very happy with the one i have
until microsoft or adobe or some other a-hole forces me to be unhappy
Not really. I'm happy too with mine, a dual core P-IV, and a Core Duo.
I was talking about asm coders. They know how to change 1 single
instruction to suit their needs.
By the way, I'm only testing some instructions, nothing is going to production
as always with me. [MASM for FUN only].
you could add a little test to see if the CPU supports SSSE3
if it does not, skip that test and say "Frktons II Step requires SSSE3 support" and go on to the next test
mov eax,1
cpuid
test ch,2
jz no_ssse3_support
note: .586 or higher required to assemble "cpuid" without hard-coding
Quote from: dedndave on December 10, 2012, 10:50:09 PM
you could add a little test to see if the CPU supports SSSE3
if it does not, skip that test and say "Frktons II Step requires SSSE3 support" and go on to the next test
mov eax,1
cpuid
test ch,2
jz no_ssse3_support
note: .586 or higher required to assemble "cpuid" without hard-coding
Good idea my dear Master, I'm going to test it. Inter alia, one of my pc doesn't
have SSSE3 if I correctly remember.
OK, I updated the proggy, would you test it?
:t
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2270 cycles for Dedndave code - 5 GPRs
1940 cycles for Frktons I Step / 2 GPRs
1950 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
1165 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2242 cycles for Dedndave code - 5 GPRs
1962 cycles for Frktons I Step / 2 GPRs
1949 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
1176 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2248 cycles for Dedndave code - 5 GPRs
1939 cycles for Frktons I Step / 2 GPRs
1956 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
1161 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2268 cycles for Dedndave code - 5 GPRs
1947 cycles for Frktons I Step / 2 GPRs
1987 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
1191 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
here's a little routine i wrote just for you, Frank :biggrin:
CpuFeatures PROC
;---------------------------------------------------------------
;call once during program initialization
;store the value returned in EAX (AL, actually) for feature verification
;
;0 = no extended features present
;1 = MMX
;2 = SSE
;3 = SSE2
;4 = SSE3
;5 = SSSE3
;6 = SSE4.1
;
;CPUID.(EAX=1) feature bits (Intel SDM):
;  EDX bit 23 = MMX, bit 25 = SSE, bit 26 = SSE2
;  ECX bit  0 = SSE3, bit 9 = SSSE3, bit 19 = SSE4.1
;BSWAP reverses BYTES (not bits), so after "bswap edx":
;  MMX -> DH bit 7, SSE -> DL bit 1, SSE2 -> DL bit 2
;Preserves EBX (CPUID clobbers it; Win32 ABI treats it as callee-saved)
;Clobbers ECX, EDX, flags
;---------------------------------------------------------------
push ebx ;CPUID overwrites EBX - save it
mov eax,1
cpuid
bswap edx ;bring the high-byte EDX features into DL/DH
xor eax,eax
test dh,80h ;MMX (EDX bit 23 -> DH bit 7)
jz CpuF00
inc eax
test dl,2 ;SSE1 (EDX bit 25 -> DL bit 1)
jz CpuF00
inc eax
test dl,4 ;SSE2 (EDX bit 26 -> DL bit 2)
jz CpuF00
inc eax
test cl,1 ;SSE3 (ECX bit 0)
jz CpuF00
inc eax
test ch,2 ;SSSE3 (ECX bit 9, not bit 1 - cl,2 tested the wrong flag)
jz CpuF00
inc eax
test ecx,80000h ;SSE4.1 (ECX bit 19)
jz CpuF00
inc eax
CpuF00: pop ebx
ret
CpuFeatures ENDP
.DATA?
bFeatures db ?
.CODE
Start: call CpuFeatures
mov bFeatures,al
;
;
;
now, if you want to see if they have SSSE3....
cmp bFeatures,5
jb no_ssse3_support
Quote from: dedndave on December 10, 2012, 11:40:52 PM
here's a little routine i wrote just for you, Frank :biggrin:
CpuFeatures PROC
;---------------------------------------------------------------
;call once during program initialization
;store the value returned in EAX (AL, actually) for feature verification
;
;0 = no extended features present
;1 = MMX
;2 = SSE
;3 = SSE2
;4 = SSE3
;5 = SSSE3
;6 = SSE4.1
;
;CPUID.(EAX=1) feature bits (Intel SDM):
;  EDX bit 23 = MMX, bit 25 = SSE, bit 26 = SSE2
;  ECX bit  0 = SSE3, bit 9 = SSSE3, bit 19 = SSE4.1
;BSWAP reverses BYTES (not bits), so after "bswap edx":
;  MMX -> DH bit 7, SSE -> DL bit 1, SSE2 -> DL bit 2
;Preserves EBX (CPUID clobbers it; Win32 ABI treats it as callee-saved)
;Clobbers ECX, EDX, flags
;---------------------------------------------------------------
push ebx ;CPUID overwrites EBX - save it
mov eax,1
cpuid
bswap edx ;bring the high-byte EDX features into DL/DH
xor eax,eax
test dh,80h ;MMX (EDX bit 23 -> DH bit 7)
jz CpuF00
inc eax
test dl,2 ;SSE1 (EDX bit 25 -> DL bit 1)
jz CpuF00
inc eax
test dl,4 ;SSE2 (EDX bit 26 -> DL bit 2)
jz CpuF00
inc eax
test cl,1 ;SSE3 (ECX bit 0)
jz CpuF00
inc eax
test ch,2 ;SSSE3 (ECX bit 9, not bit 1 - cl,2 tested the wrong flag)
jz CpuF00
inc eax
test ecx,80000h ;SSE4.1 (ECX bit 19)
jz CpuF00
inc eax
CpuF00: pop ebx
ret
CpuFeatures ENDP
.DATA?
bFeatures db ?
.CODE
Start: call CpuFeatures
mov bFeatures,al
;
;
;
now, if you want to see if they have SSSE3....
cmp bFeatures,5
jb no_ssse3_support
Thanks Dave, and here it is something I wrote for you :biggrin::
Quote
----------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
----------------------------------------------------------------------
1915 cycles for Dedndave code - 5 GPRs
1837 cycles for Frktons I Step / 2 GPRs
1893 cycles for Frktons I Step / 4 GPRs
1318 cycles for Frktons II Step / 5 MMX
1365 cycles for Frktons II Step / 5 MMX without SSSE3
836 cycles for Jochen / 5 XMM
----------------------------------------------------------------------
1913 cycles for Dedndave code - 5 GPRs
1892 cycles for Frktons I Step / 2 GPRs
1893 cycles for Frktons I Step / 4 GPRs
1317 cycles for Frktons II Step / 5 MMX
1366 cycles for Frktons II Step / 5 MMX without SSSE3
836 cycles for Jochen / 5 XMM
----------------------------------------------------------------------
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2250 cycles for Dedndave code - 5 GPRs
1943 cycles for Frktons I Step / 2 GPRs
1960 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
2148 cycles for Frktons II Step / 5 MMX without SSSE3
1161 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2242 cycles for Dedndave code - 5 GPRs
1952 cycles for Frktons I Step / 2 GPRs
1971 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
2268 cycles for Frktons II Step / 5 MMX without SSSE3
1161 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2271 cycles for Dedndave code - 5 GPRs
1950 cycles for Frktons I Step / 2 GPRs
1965 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
2149 cycles for Frktons II Step / 5 MMX without SSSE3
1166 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2240 cycles for Dedndave code - 5 GPRs
1941 cycles for Frktons I Step / 2 GPRs
1958 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
2144 cycles for Frktons II Step / 5 MMX without SSSE3
1166 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
Some improvements before the final release:
Quote
------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
1925 cycles for Dedndave code - 5 GPRs
1882 cycles for Frktons I Step / 2 GPRs
1941 cycles for Frktons I Step / 4 GPRs
1896 cycles for Frktons I Step / 4 GPRs - no external tab
1096 cycles for Frktons II Step / 5 MMX with SSSE3
1206 cycles for Frktons II Step / 5 MMX without SSSE3
836 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1917 cycles for Dedndave code - 5 GPRs
1882 cycles for Frktons I Step / 2 GPRs
1917 cycles for Frktons I Step / 4 GPRs
1893 cycles for Frktons I Step / 4 GPRs - no external tab
1091 cycles for Frktons II Step / 5 MMX with SSSE3
1206 cycles for Frktons II Step / 5 MMX without SSSE3
836 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
Almost close to the target. :t
prescott w/htt
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2379 cycles for Dedndave code - 5 GPRs
1947 cycles for Frktons I Step / 2 GPRs
1961 cycles for Frktons I Step / 4 GPRs
1986 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
2460 cycles for Frktons II Step / 5 MMX without SSSE3
1152 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2251 cycles for Dedndave code - 5 GPRs
1966 cycles for Frktons I Step / 2 GPRs
1961 cycles for Frktons I Step / 4 GPRs
1978 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
2456 cycles for Frktons II Step / 5 MMX without SSSE3
1151 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
P4 Northwood:
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2
------------------------------------------------------------------------
2275 cycles for Dedndave code - 5 GPRs
1979 cycles for Frktons I Step / 2 GPRs
1996 cycles for Frktons I Step / 4 GPRs
2034 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
2970 cycles for Frktons II Step / 5 MMX without SSSE3
896 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2330 cycles for Dedndave code - 5 GPRs
1970 cycles for Frktons I Step / 2 GPRs
1997 cycles for Frktons I Step / 4 GPRs
2734 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
2937 cycles for Frktons II Step / 5 MMX without SSSE3
905 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
It looks like the MMX tech is a bit slow on pre-Core Duo CPUs.
We'll see the XMM one in action next. :P
I've almost finished the study. Here it is the fastest code
so far I could imagine, but still room to optimize.
Quote
------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
1950 cycles for Dedndave code - 5 GPRs
1924 cycles for Frktons I Step / 2 GPRs
1871 cycles for Frktons I Step / 4 GPRs
1888 cycles for Frktons I Step / 4 GPRs - no external tab
1079 cycles for Frktons II Step / 5 MMX with SSSE3
1199 cycles for Frktons II Step / 5 MMX without SSSE3
801 cycles for Frktons III Step / XMM/MMX with SSE2
831 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1916 cycles for Dedndave code - 5 GPRs
1930 cycles for Frktons I Step / 2 GPRs
1872 cycles for Frktons I Step / 4 GPRs
1915 cycles for Frktons I Step / 4 GPRs - no external tab
1083 cycles for Frktons II Step / 5 MMX with SSSE3
1209 cycles for Frktons II Step / 5 MMX without SSSE3
796 cycles for Frktons III Step / XMM/MMX with SSE2
831 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
prescott w/htt
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2266 cycles for Dedndave code - 5 GPRs
1976 cycles for Frktons I Step / 2 GPRs
1994 cycles for Frktons I Step / 4 GPRs
2011 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
2459 cycles for Frktons II Step / 5 MMX without SSSE3
1141 cycles for Frktons III Step / XMM/MMX with SSE2
1183 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2257 cycles for Dedndave code - 5 GPRs
1983 cycles for Frktons I Step / 2 GPRs
1973 cycles for Frktons I Step / 4 GPRs
1996 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
2462 cycles for Frktons II Step / 5 MMX without SSSE3
1143 cycles for Frktons III Step / XMM/MMX with SSE2
1184 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
------------------------------------------------------------
AMD Phenom(tm) II X6 1100T Processor
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------
948 cycles for Dedndave code - 5 GPRs
813 cycles for Frktons I Step / 2 GPRs
838 cycles for Frktons I Step / 4 GPRs
995 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
1089 cycles for Frktons II Step / 5 MMX without SSSE3
747 cycles for Frktons III Step / XMM/MMX with SSE2
666 cycles for Jochen / 5 XMM
------------------------------------------------------------
950 cycles for Dedndave code - 5 GPRs
823 cycles for Frktons I Step / 2 GPRs
845 cycles for Frktons I Step / 4 GPRs
996 cycles for Frktons I Step / 4 GPRs - no external tab
Frktons II Step requires a PC with SSSE3
1084 cycles for Frktons II Step / 5 MMX without SSSE3
749 cycles for Frktons III Step / XMM/MMX with SSE2
665 cycles for Jochen / 5 XMM
------------------------------------------------------------
--- ok ---
Frktons I Step / 2 GPRs and Frktons I Step / 4 GPRs are remarkably fast but you should have a look at the output.
Quote from: jj2007 on December 12, 2012, 06:53:03 PM
Frktons I Step / 2 GPRs and Frktons I Step / 4 GPRs are remarkably fast but you should have a look at the output.
I didn't check and it is possible that some instructions are not correct,
I was first testing for speed, and now I'm going to check for size optimization
and correctness of code. A few days more, I'm quite slow indeed.
lol
it has to work first, otherwise you are comparing apples with oranges in the timing tests :t
Quote from: dedndave on December 12, 2012, 11:25:42 PM
lol
it has to work first, otherwise you are comparing apples with oranges in the timing tests :t
Here we are, tested and posted. The performances don't change,
there were some typing errors and adding one instead of two somewhere.
These errors didn't impact on performance, but on results they did. :P
I included a PROC to display the content of the filled array, on demand. :t
prescott w/htt
------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2246 cycles for Dedndave code - 5 GPRs
1964 cycles for Frktons I Step / 2 GPRs
1999 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
2451 cycles for Frktons II Step / 5 MMX without SSSE3
1122 cycles for Frktons III Step / XMM/MMX with SSE2
1167 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2240 cycles for Dedndave code - 5 GPRs
1939 cycles for Frktons I Step / 2 GPRs
2015 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
2588 cycles for Frktons II Step / 5 MMX without SSSE3
1115 cycles for Frktons III Step / XMM/MMX with SSE2
1167 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
Two out of three goals are accomplished, now the last, but not least,
optimization: code size and some small improvement, if needed. ;)
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz
Instructions: MMX, SSE1, SSE2, SSE3
--------------------------------------------------------
1740 cycles for Dedndave code - 5 GPRs
1348 cycles for Frktons I Step / 2 GPRs
1262 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
943 cycles for Frktons II Step / 5 MMX without SSSE3
941 cycles for Frktons III Step / XMM/MMX with SSE2
728 cycles for Jochen / 5 XMM
--------------------------------------------------------
1742 cycles for Dedndave code - 5 GPRs
1350 cycles for Frktons I Step / 2 GPRs
1262 cycles for Frktons I Step / 4 GPRs
Frktons II Step requires a PC with SSSE3
944 cycles for Frktons II Step / 5 MMX without SSSE3
937 cycles for Frktons III Step / XMM/MMX with SSE2
728 cycles for Jochen / 5 XMM
On my home PC the III step is a bit faster than Jochen's code,
and in my office PC it is even faster.
Quote
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
-------------------------------------------------------------
813 cycles for Frktons III Step / XMM/MMX with SSE2
835 cycles for Jochen / 5 XMM
Quote
Intel(R) Pentium(R) CPU G6950 @ 2.80GHz
-------------------------------------------------------------
...
405 cycles for Frktons III Step / XMM/MMX with SSE2
644 cycles for Jochen / 5 XMM
I'm actually studying a faster/smaller solution because, as Jochen said:
Quote from: jj2007 on December 09, 2012, 08:25:09 PM
P.S.: If you don't agree with the suffix "FINAL", write a faster algo :bgrin:
I don't agree with the suffix, I'm not at the FINAL stage so far. :lol:
Quote from: frktons on December 14, 2012, 09:58:46 AM
On my home PC the III step is a bit faster than Jochen's code,
and in my office PC it is even faster.
...
I'm actually studying a faster/smaller solution because, as Jochen said:
Quote from: jj2007 on December 09, 2012, 08:25:09 PM
P.S.: If you don't agree with the suffix "FINAL", write a faster algo :bgrin:
I don't agree with the suffix, I'm not at the FINAL stage so far. :lol:
So the incentive worked :greensml: :t
This could be my final test, if you don't have any suggestion
to enhance the performance of the last code:
Quote
------------------------------------------------------------------------
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3
------------------------------------------------------------------------
1915 cycles for Dedndave code - 5 GPRs
1890 cycles for Frktons I Step / 2 GPRs
1964 cycles for Frktons I Step / 4 GPRs with LEA
1114 cycles for Frktons II Step / 5 MMX with SSSE3
1199 cycles for Frktons II Step / 5 MMX without SSSE3
811 cycles for Frktons III Step / XMM/MMX with SSE2
630 cycles for Frktons III Step / XMM with SSE2 - enhanced
706 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1915 cycles for Dedndave code - 5 GPRs
1896 cycles for Frktons I Step / 2 GPRs
1978 cycles for Frktons I Step / 4 GPRs with LEA
1110 cycles for Frktons II Step / 5 MMX with SSSE3
1199 cycles for Frktons II Step / 5 MMX without SSSE3
813 cycles for Frktons III Step / XMM/MMX with SSE2
628 cycles for Frktons III Step / XMM with SSE2 - enhanced
704 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
prescott w/htt
Quote------------------------------------------------------------------------
Intel(R) Pentium(R) 4 CPU 3.00GHz
Instructions: MMX, SSE1, SSE2, SSE3
------------------------------------------------------------------------
2267 cycles for Dedndave code - 5 GPRs
1957 cycles for Frktons I Step / 2 GPRs
2035 cycles for Frktons I Step / 4 GPRs with LEA
Frktons II Step requires a PC with SSSE3
2459 cycles for Frktons II Step / 5 MMX without SSSE3
1126 cycles for Frktons III Step / XMM/MMX with SSE2
1197 cycles for Frktons III Step / XMM with SSE2 - enhanced
1159 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
2282 cycles for Dedndave code - 5 GPRs
1967 cycles for Frktons I Step / 2 GPRs
2031 cycles for Frktons I Step / 4 GPRs with LEA
Frktons II Step requires a PC with SSSE3
2483 cycles for Frktons II Step / 5 MMX without SSSE3
1126 cycles for Frktons III Step / XMM/MMX with SSE2
1185 cycles for Frktons III Step / XMM with SSE2 - enhanced
1158 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
Quote------------------------------------------------------------------------
Intel(R) Core(TM) i3 CPU M 370 @ 2.40GHz
Instructions: MMX, SSE1, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2
------------------------------------------------------------------------
1335 cycles for Dedndave code - 5 GPRs
1855 cycles for Frktons I Step / 2 GPRs
1009 cycles for Frktons I Step / 4 GPRs with LEA
587 cycles for Frktons II Step / 5 MMX with SSSE3
643 cycles for Frktons II Step / 5 MMX without SSSE3
389 cycles for Frktons III Step / XMM/MMX with SSE2
344 cycles for Frktons III Step / XMM with SSE2 - enhanced
460 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
1278 cycles for Dedndave code - 5 GPRs
1025 cycles for Frktons I Step / 2 GPRs
1015 cycles for Frktons I Step / 4 GPRs with LEA
585 cycles for Frktons II Step / 5 MMX with SSSE3
633 cycles for Frktons II Step / 5 MMX without SSSE3
400 cycles for Frktons III Step / XMM/MMX with SSE2
345 cycles for Frktons III Step / XMM with SSE2 - enhanced
437 cycles for Jochen / 5 XMM
------------------------------------------------------------------------
--- ok ---
As usual, different CPUs = different performances :P
Considering I'm targeting Core Duo upwards, I can be satisfied
that I reached less than 0.7 cycles for each dword string. :biggrin:
ahhh nice algo
(http://3.bp.blogspot.com/-KBh4qCwokmo/UFIpFNQicqI/AAAAAAAAODM/ye726PVWUfE/s1600/laptop-thief.jpg)
Quote from: Farabi on December 23, 2012, 09:24:58 AM
ahhh nice algo
Thanks my friend. The application of unrolling and SSE2 code
produce this fast algo. :t
Hi, sorry for bumping in - and being late.
I stumbled over that wonderful thread because I was unhappy with the timing of a simple prime number generator. No math 8) or pun intended, the slow part can be simplified for the show as:
* to mark every 10th bit starting with bits 7 and 10 in a GB of memory. (7,10,17,20,27,30,...)
* also to mark every 14th bit starting with bits 15 and 24 (15,24, 29, 38, ...)
That classical C code ( bytelist[j/64] |= 1UL<<(j%64) ) seems to take quite some time; I get around 0.04 s on an Intel(R) Core(TM)2 Duo CPU T5750 @ 2.00GHz. Maybe that is optimized, and I am hunting a pink unicorn.
I do not know asm yet, I do not yet use masm. All I have is that feeling that for setting every 10th bit should be a smarter way than those presented by gcc or clang.
It's just for fun. I even tried parallelisation on the Cuda side of life (in C), but I seem to make all of those terrible mistakes of the unblessed beginners - it gets much slower than the same partitioned code on plain CPU. One pitfall is parallel writes to the same word on different bits from different threads - it overwrites other set bits (setting the 20th bit needs loading the word, setting the bit, saving the word - so the recently stored bit 15 gets cleared again)
I think SSE3 or SSSE3 should make it easy, and even IntelHD on the other computer (Core i5) should have enough cpu hardware units to set a few bits in parallel.
i would try to get away from multiple threads
there may be a faster way in SSE - i am not familiar with SSE shifts and rotates
but, you might think of something along this line
mov eax,ecx ;every 10th bit from ECX
mov edx,ebp ;every 10th bit from EBP
or eax,esi ;every 14th bit from ESI
or edx,ebx ;every 14th bit from EBX
mov [edi],eax
mov [edi+4],edx
;do rotates here
add edi,8
;loop to top
If you can determine the period of the patterns (related to 128 bit blocks), it should be really simple to OR combine the pre-calculated patterns with the target bytes (e.g. using the SSE2 instr. POR).
oh yah - a look-up table could have the patterned data
Quote from: flaschenpost on June 22, 2013, 10:58:12 AM
It's just for fun. I even tried parallelisation on the Cuda side of life (in C), but I seem to make all of those terrible mistakes of the unblessed beginners - it gets much slower than the same partitioned code on plain CPU. One pitfall is parallel writes to the same word on different bits from different threads - it overwrites other set bits (setting the 20th bit needs loading the word, setting the bit, saving the word - so the recently stored bit 15 gets cleared again)
I think SSE3 or SSSE3 should make it easy, and even IntelHD on the other computer (Core i5) should have enough cpu hardware units to set a few bits in parallel.
Probably that's because CUDA (and SSE) are designed for really simple number-crunching commands, not for stuff like bit manipulation. I.e., they work well with float/probably integer datatypes, which can be processed well in parallel - image processing etc. - but will probably be slower than the CPU in things like bit manipulation. Maybe the solution for the speed-up on the CPU is to parallelise bits-in-bytes manipulation - i.e., prepare the bytes of an array independently, step by step, and these steps should be intermixed, like "work with bit 1 of byte 1, work with bit 1 of byte 2, work with bit 1 of byte 3" then "work with bit 2 of byte 1, work with bit 2 of byte 2, work with bit 2 of byte 3" and so on, then put the combined bytes into the array.
(That was just an assumption about CUDA; I have not really worked with it at all.)
Quote from: flaschenpost on June 22, 2013, 10:58:12 AM
One pitfall is parallel writes to the same word on different bits from different threads - it overwrites other set bits (setting the 20th bit needs loading the word, setting the bit, saving the word - so the recently stored bit 15 gets cleared again)
Hi,
Well in assembly you need not load, modify, and store. You can
modify the memory directly. And that would avoid resetting any
set bits.
MOV EAX,[Memory] ; Load
OR EAX,BitPattern ; Modify
MOV [Memory],EAX ; Store
; can become
OR DWORD PTR [Memory],BitPattern
Regards,
Steve N.
Quote from: flaschenpost on June 22, 2013, 10:58:12 AM
* to mark every 10th bit starting with bits 7 and 10 in a GB of memory. (7,10,17,20,27,30,...)
* also to mark every 14th bit starting with bits 15 and 24 (15,24, 29, 38, ...
As qWord already mentioned, find the right periods and go ahead. For "every 10th bit", 80 bytes is the period - 5*16, or 2*32+1*16. Every 14th bit is trickier, of course - 8*14=?
i'm not sure i understand the patterns - lol
7, 17, 29 are primes
10, 15, 20, 24, 27, 30, 38 are not
Thanks for all the suggestions; I also thought about using the lookup tables. They sound great for 2*5 and 2*7, but I am suspicious about the bytewise operation (is there some 8-bit logic still around?), so it might get to 2*5*64 bit and 2*7*64 bit for the first two numbers, and then up it goes (2*11*64, 2*13*64). At least in my C proof of concept they are much slower than the (gcc-optimized) naive solutions.
@dedndave: well, the trivial non-primes I just won't store, so I don't care about even numbers or multiples of 3. The list simply gets
5 7 11 13 17 19 23 25 29 31 35 37 ...
0 1 2 3 4 5 6 7 8 9 10 11 ...
The marking of all numbers divisible by 5 is marking every 10th bit starting from 7 (which is 25) and from 10 (which is 35). You see, 25 + 5 makes 30 and is even, so no need to check that. 35 is marked with bit 10, 55 with bit 17.
But maybe I am really near the possible optimum. Coming from all the high-level-languages I "thought" there would be some magic working with bit-Stream-like sources and combining them in big chunks with "OR".
I definitely will play with the SSE-Registers.
Not sure if SSE will bring the big improvement, but...
You have two overlaid loops to handle, step 10 and step 14 bits. For speed, you would need to use (many) registers to set the bits and minimise mem-to-mem operations. It may look roughly like this in the end:
Entry:
xor eax, eax ; same as mov eax, 0
xor ebx, ebx
xor ecx, ecx
...
or reg32, whatever ; set bits for step 10
...
or reg32, whatever ; set bits for step 14
...
mov [edi], eax ; write regs to mem
mov [edi+4], ebx
mov [edi+8], ecx
mov [edi+12], edx
Now it would be great if you could post the fast gcc solution, with an __asm int 3; on entry to the innermost loop, so that we could check at assembly level what makes it fast. As a rule of thumb, hand-crafted assembly can be at least a factor 2 faster.
@jj2007: here the code, I tried gcc with option -masm=intel.
Unfortunately __asm int 3; does not work with my linux-gcc. I inserted it manually at line 82, function doWork, before loop .L11 - but forgot after recompile ::)
SSE may not help much, because memory accesses are limited by hardware
but - it may help a little, because you can write 128 bits at a whack
the idea would be to make a pattern table for one set and a pattern table for the other set
then, use SSE code to OR the appropriate pattern values together, then store them
even so.....
i think you said your current code takes 40 mS to write a GB
that's not too bad, really :P
Quote from: flaschenpost on June 23, 2013, 10:13:28 PM
@jj2007: here the code, I tried gcc with option -masm=intel.
I don't have gcc. Can you post the executable, please?
There is a tn.s inside the zip. I hoped that would be compatible, but jwasm does not accept it.
Does anybody know how to compile C - Code into assembly for masm? The relevant part looks like
# doWork — gcc output, AT&T syntax, x86-64 SysV ABI.
# Inner loop of the sieve bit-marker: C equivalent is
#   bytelist[j/64] |= 1UL << (j%64)
# for two interleaved bit indices advancing by a common step.
# Register roles (inferred from the instructions below — confirm vs. C source):
#   rdi = qword array base, rsi/rdx = two current bit indices,
#   r8 = step (doubled on entry), rcx>>1 -> rax = upper bound.
doWork:
.LFB64:
.cfi_startproc
movq %rcx, %rax # rax = rcx
shrq %rax # rax >>= 1  (limit for the bit indices)
cmpq %rax, %rdx
ja .L10 # rdx already past the limit -> skip main loop
addq %r8, %r8 # step *= 2
movl $1, %r10d # r10 = 1, the bit that gets shifted into place
.p2align 4,,10
.p2align 3
.L11: # each pass sets bit rsi and bit rdx, then advances both
movq %rsi, %r9
movl %esi, %ecx
movq %r10, %r11
shrq $6, %r9 # r9 = rsi / 64  (qword index)
salq %cl, %r11 # r11 = 1 << (rsi % 64): shift uses only cl's low 6 bits
movl %edx, %ecx
orq %r11, (%rdi,%r9,8) # array[rsi/64] |= mask
movq %rdx, %r9
movq %r10, %r11
shrq $6, %r9 # r9 = rdx / 64
addq %r8, %rdx # rdx += step (old rdx's shift count already in cl)
salq %cl, %r11 # r11 = 1 << (old rdx % 64)
addq %r8, %rsi # rsi += step
orq %r11, (%rdi,%r9,8) # array[old rdx/64] |= mask
cmpq %rax, %rdx
jbe .L11 # loop while rdx <= limit (unsigned)
.L10:
cmpq %rax, %rsi # tail: one leftover bit at rsi?
jae .L9
movq %rsi, %rdx
movl $1, %eax
movl %esi, %ecx
shrq $6, %rdx
salq %cl, %rax
orq %rax, (%rdi,%rdx,8) # array[rsi/64] |= 1 << (rsi % 64)
.L9:
rep # "rep ret": gcc's branch-predictor workaround for AMD CPUs
ret
.cfi_endproc
I try to find a free compiler for win and come back then.
Using a gcc version 4.7.2 from a recent MinGW installation, Intel-syntax assembly is no problem, for inline assembly or for the assembly output.
#include <stdio.h>
#include <conio.h>   /* non-standard (MinGW/MSVC) header providing getch() */

/* Demo program used to show gcc's Intel-syntax assembly output.
   Returns the area of a circle of radius r (pi truncated to 3.14159). */
double CalcArea(const double r)
{
return r*r*3.14159;
}
int main (void)
{
printf("%f\n", CalcArea(1));   /* the int literal 1 converts to double */
getch();                       /* wait for a keypress before exiting */
__asm__("int 3\n");            /* breakpoint trap: marks this spot for a debugger */
}
set file="test"
gcc -pedantic -S -masm=intel %file%.c -o %file%.asm
pause
gcc -pedantic -masm=intel %file%.c -o %file%.exe
pause
.file "test.c"
.intel_syntax noprefix
.text
.globl _CalcArea
.def _CalcArea; .scl 2; .type 32; .endef
_CalcArea:
LFB6:
.cfi_startproc
push ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
mov ebp, esp
.cfi_def_cfa_register 5
sub esp, 8
mov eax, DWORD PTR [ebp+8]
mov DWORD PTR [ebp-8], eax
mov eax, DWORD PTR [ebp+12]
mov DWORD PTR [ebp-4], eax
fld QWORD PTR [ebp-8]
fmul QWORD PTR [ebp-8]
fld QWORD PTR LC0
fmulp st(1), st
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE6:
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC3:
.ascii "%f\12\0"
.text
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
LFB7:
.cfi_startproc
push ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
mov ebp, esp
.cfi_def_cfa_register 5
and esp, -16
sub esp, 16
call ___main
mov eax, 0
mov edx, 1072693248
mov DWORD PTR [esp], eax
mov DWORD PTR [esp+4], edx
call _CalcArea
fstp QWORD PTR [esp+4]
mov DWORD PTR [esp], OFFSET FLAT:LC3
call _printf
call _getch
/APP
# 13 "test.c" 1
int 3
# 0 "" 2
/NO_APP
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE7:
.section .rdata,"dr"
.align 8
LC0:
.long -266631570
.long 1074340345
.def _printf; .scl 2; .type 32; .endef
.def _getch; .scl 2; .type 32; .endef
Edit:
Somehow I missed your attachment. I added code to time the doWork procedure (Windows code that calls GetTickCount, sorry no time to do better) . The attachment contains the assembly output and EXEs for no optimization, -O1, -O2, -O3, and -Ofast. On both of my systems -O1 was the fastest.
I have to admit that I am not able to produce masm / jwasm - compatible assembly from my C-Code, I did not know how many different text formats for the same simple opcodes one could invent :( . I thought "Intel-format" would do the job. I use Intel C Compiler icc to optimize and I get factor 2 with converting to a lookup-table _and_ converting from a while-loop to a for-loop, which surprised me.
Since I can not find an assembler for the output of Intel-C-Compiler icc I can not produce dos-executables.
I begin to understand why a forum like this needs to be specific about OS and Tools.
I attached the current state of c and asm files. tn is "naive", tl is "looped".
The main problem stays:
1. I can either produce simple loops, good for optimization and vectorization, but then I have to go through the memory more often
or
2. I can try to be memory-local (Cachelines etc), but then the loops get complicated.
From my understanding of processors it should be fastest to:
- fetch a cache line from memory as big as possible in a read
- write into that memory segment with many of those bit-settings (5 and 7 resp. every 10th and every 14th bit were just the examples of course, the steps get bigger up to 1e16)
- flush that memory segment back to main mem, i.e. tell the processor that line would not be needed again in near future.
Or, alternatively:
- fill the SSE-Registers with the pattern
- write directly from SSE with "or" into main mem if that is possible.
So some questions: Is it possible to rotate the 256 bits of an AVX register or the 128 bits of an SSSE3 register? Is it even possible to fetch all the SSE/AVX registers from one memory region in one step, and to write them back together? Can I bit-shift or bit-rotate through those registers in a small number of cycles?
If I profile my code with valgrind, then the main time is needed for memory access (surprise, surprise!). But if I try to reduce the loops over memory by processing packets of memory together, then I get slower than the naive solution.
The question of the solution with a small number of cycles seems to be almost as complex as the nowadays processors themselves.
@JJ2007: I was slightly misunderstood - I did not want you to compile with gcc, but I tried to make my gcc produce a "good" assembly file (.s in the flasch0.zip). What toolstack do you use when you have a piece of C code to optimize? Or do you really write the whole software directly in assembly and with your own libs?
I have even blown the dust off my old Windows- Partition (XP) and tried a VC++ Express, but got not too far into it (some trouble with a precompiled header that does not work and that I can not really get rid of).
The funny thing is: version tn.c is slower than tl.c in that pure form, but when I try to pack it (even only for p=5 and p=7) into the bigger loop, it makes the whole process slower instead of faster.
I am still frustrated that involving the 16 "machines" of my Nvidia Cuda did not speedup the marking of a rhythm of bits. Probably I have some big mistakes with those memory levels in Cuda.
I think using 4 threads for 4 parts of memory should use the 2 cores á 2 hyperthreads and speedup things, but maybe memory read/write is the bottleneck and threading makes it all worse.
Many thanks for the suggestions and discussions so far, and I hope that the question will bring some nice thoughts from the playful experts here!
If any questions arise, I will try to answer them...
is this 128bit sequence correct? every 10th bit starting at bit 7?
0'0000001'0000000001'0000000001'0000000001'00 00000001'0000000001'0000000001'0000000001'0000000001'0000000001'0000000001'0000000001'0000000001
byte0=1
byte1=0
byte2=64
byte3=16
byte4=4
byte5=1
Quote from: flaschenpost on June 25, 2013, 06:20:29 AM
@JJ2007: I was slightly misunderstood - I did not want you to compile with gcc, but I tried to make my gcc produce a "good" assembly file (.s in the flasch0.zip). What toolstack do you use when you have a piece of C code to optimize? Or do you really write the whole software directly in assembly and with your own libs?
My mistake - I should have looked more carefully at the .s file...
This seems to be the innermost loop:
.L11:
mov r9, rsi
mov ecx, esi
mov r11, r10
shr r9, 6
sal r11, cl
mov ecx, edx
or QWORD PTR [rdi+r9*8], r11
mov r9, rdx
mov r11, r10
shr r9, 6
add rdx, r8
sal r11, cl
add rsi, r8
or QWORD PTR [rdi+r9*8], r11
cmp rdx, rax
jbe .L11That looks slow, honestly. I am not so proficient in 64-bit assembly, but in general we try to avoid shifts because they are slow, apart from other design problems in that loop.
Re toolstack: Masm32 is a library of very fast routines, compared to standard CRT; MasmBasic is another library with often much faster algos, and for specific tasks you will find many more in the "Laboratory" section. And yes we do write every algo from scratch, and we all find it particularly amusing to beat a CRT algo by a factor 3 and higher :icon_mrgreen:
Quote from: jj2007 on June 25, 2013, 06:59:23 AM
That looks slow, honestly. I am not so proficient in 64-bit assembly, but in general we try to avoid shifts because they are slow, apart from other design problems in that loop.
Well, for setting a certain bit in a byte I can't see an alternative. Following http://gmplib.org/~tege/x86-timing.pdf seemed not too bad for shifts, but they do not measure quad word operations.
Do you see a better translation of:
while(k <= limit/2){
C[j / 64] |= (1L << (j % 64));
C[k / 64] |= (1L << (k % 64));
j +=2*p;
k+=2*p;
}
if(j < limit/2){
C[j / SW] |= ((ST)1 << (j % SW));
}
Since the values in the lookup table are only right shifted values of a starting entry I even thought about replacing the lookup table (higher value for p gets big tables) with shift operations. But you sound like a lookup in a local long[120] is faster than shifting the same value.
Quote from: jj2007 on June 25, 2013, 06:59:23 AM
Re toolstack: Masm32 is a library of very fast routines, compared to standard CRT; MasmBasic is another library with often much faster algos, and for specific tasks you will find many more in the "Laboratory" section. And yes we do write every algo from scratch, and we all find it particularly amusing to beat a CRT algo by a factor 3 and higher :icon_mrgreen:
Wow, sounds like a nice game. :)
I think for an optimal prime number sieve there would be needed techniques like in Blas/Atlas: they measure some characteristics of the processor and system and then create an optimizer profile for just that system, so maximizing cache usage, parallelization etc. What impressed me was that in the early 2000s there were really expensive BLAS libs out there from vendors like Sun and IBM, hand-optimized for their systems, but ATLAS made it as a general package right into the same range of speed.
But nobody really needs a fast prime sieve, so it is not worth that amount of work. ;-)
The strange thing about your "shift is slow" is that replacing a lot of shifts just with the predefined pattern makes "only" factor 2.
The (short) preparation in the fastest compiled version right now is
# Begin ASM
# prepare patterns
# End ASM
# LOE rbx rbp r12 r13 r14 r15
..B2.16: # Preds ..B2.2
movq %r14, %rax #39.21
shlq $6, %rax #39.21
addq $64, %rax #39.21
cmpq %rax, %rbp #39.21
jae ..B2.6 # Prob 10% #39.21
# LOE rax rbx rbp r12 r13 r14 r15
..B2.4: # Preds ..B2.16 ..B2.4
movq %r12, %rsi #40.15
movq %rbp, %r8 #43.15
shrq $6, %rsi #40.15
movl %r12d, %ecx #40.37
shrq $6, %r8 #43.15
movl $1, %edx #40.37
shlq %cl, %rdx #40.37
movl %ebp, %ecx #43.37
movl $1, %edi #43.37
lea (%rbp,%r14,2), %rbp #45.9
orq %rdx, (%r15,%rsi,8) #40.9
lea (%r12,%r14,2), %r12 #42.9
orq %rdx, (%rsp,%rsi,8) #41.9
shlq %cl, %rdi #43.37
orq %rdi, (%r15,%r8,8) #43.9
orq %rdi, (%rsp,%r8,8) #44.9
cmpq %rax, %rbp #39.21
jb ..B2.4 # Prob 95% #39.21
# LOE rax rbx rbp r12 r13 r14 r15
..B2.6: # Preds ..B2.4 ..B2.16
cmpq %rax, %r12 #47.18
jae ..B2.8 # Prob 50% #47.18
# LOE rbx r12 r13 r14 r15
..B2.7: # Preds ..B2.6
movq %r12, %rdx #49.15
movl %r12d, %ecx #49.37
shrq $6, %rdx #49.15
movl $1, %eax #49.37
shlq %cl, %rax #49.37
orq %rax, (%r15,%rdx,8) #49.9
orq %rax, (%rsp,%rdx,8) #50.9
# LOE rbx r13 r14 r15
..B2.8: # Preds ..B2.6 ..B2.7
movq (%rsp,%r14,8), %rax #53.17
movq %rax, (%rsp) #53.5
shrq $7, %r13 #55.22
# LOE rbx r13 r14 r15
..B2.18: # Preds ..B2.8
and the real loop is simply a looped memcopy:
# Begin ASM
# main loop
# End ASM
# LOE rbx r13 r14 r15
..B2.17: # Preds ..B2.18
movq %r14, %rcx #57.9
cmpq %r13, %r14 #57.22
jae ..B2.12 # Prob 10% #57.22
# LOE rcx rbx r13 r14 r15
..B2.10: # Preds ..B2.17 ..B2.10
movq %rcx, %rax #59.30
xorl %edx, %edx #59.30
divq %r14 #59.30
movq (%rsp,%rdx,8), %rbp #59.19
orq %rbp, (%r15,%rcx,8) #59.9
addq $2, %rcx #60.9
cmpq %r13, %rcx #57.22
jb ..B2.10 # Prob 95% #57.22
# LOE rcx rbx r13 r14 r15
..B2.12: # Preds ..B2.10 ..B2.17
# Begin ASM
# end of main loop
# End ASM
For the higher primes (>64) this makes no sense, because there would be write accesses not on every DWORD in the naive solution.
Well, my recent version comes down to a memcpy-like operation to logical combine a memory range out in RAM with a small memory from stack or something like that. So instead of
memcpy(C + offset, local, count)
I would need a
mem_or(C + offset, local, count)
to replace the long for-loop.
Is here something like that? Did you even speedup memcpy or memfill?
AVX is out of the race, it can't bit-shift. Otherwise it would have been _really_ great.
Even AVX2 seems not to be able to bit-rotate the whole 256 bits - what a pity for me... If that had been built in, I could store the unshifted bit patterns for 15 primes in 256-bit-length YMM0 .. YMM14, rotate each register to the right place and "OR" it with and into YMM15. That would mean 256 bits in a few clock cycles for the first 15 primes, plus the shift and the modulo logic around for 15 tinyints.
But extracting the register, shifting dword by dword with parity, restoring it and _then_ use the 256-bit-OR seems really silly.
SSE should be the way to go, i would think
one problem is the way you start out
Quote* to mark every 10th bit starting with bits 7 and 10 in a GB of memory. (7,10,17,20,27,30,...)
* also to mark every 14th bit starting with bits 15 and 24 (15,24, 29, 38, ...)
let's make it fast and say
Quote* to mark every 10th bit starting with bits 7 and 0 in a GB of memory. (7,0,17,10,27,20,...)
* also to mark every 14th bit starting with bits 1 and 10 (1,10,15,24, 29, 38, ...)
* at the end, go back and correct the first few bytes :)
this is the OWORD bit pattern for the first line of description, shown in little-endian form
10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001
00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000
01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000
00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010
00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100 10000001 00100000 01001000 00010010 00000100
the pattern repeats after the 5th OWORD
the pattern for the second line of description repeats after the 7th OWORD
make 2 tables, one with 5 OWORD's, one with 7 OWORD's
POR them together and write
adjust the 3 pointers (one for table 1, one for table 2, one for the destination)
rinse and repeat
when you're done, go back and straighten out the first few bytes
it would seem to me that the entire array is, then, a repeat of the first 35 OWORD's :biggrin:
so - it could be even faster
just create the first 280 bytes, and copy it until you are happy - lol
a little faster, still....
create the first 280 bytes
copy it several times to make a larger block
copy the larger block several times until the gig is full
@dedndave: yeah, that was what I tried in C but it made my code not faster, but only more complicated (maybe because I have no processor and asm experience). :( Thanks anyway for strengthening my mood in that direction!
The critical part seems to be the "OR" of two memory blocks, since 5 and 7 were only the starting point, and when I stir in the 11, 13 and 17 into the soup the blocks get really large (and since they are prime numbers, alignment can be achieved only by another 64 in the list), so it has to get a fast solution to "OR" a local memory block with the gig out there.
I read a quite strange thread about 64-bit and xmemcopy, so I will try to adopt ideas from there to OR blocks. I have made a big improvement (in C) by unrolling the loop and doing 128 long-OR in a row.
So still needed:
- optimized code to memfill blocks of data (cache? stack?)
- optimized code to set several bits through a big memory region (cache? registers? stack?
I try to learn enough assembler to write any working code!
You basically need one read-only memory block that or's the two 10 and 14 bit sequences (I have started the exercise below). You then combine them into a single block with 7*5*8 =280 bytes.
272 of them can be accessed in 16-byte steps, then you need an 8-byte write. Or you choose a 560-byte read area to access all of them in 16-byte steps.
Speed depends then on how much of that read memory you can squeeze into registers - xmm or ymm. With 8 xmm regs, you can cover 7*16=112 bytes, the rest must be read from memory with remaining xmm. With ymm regs, coverage gets better but still you can't arrive at 560 bytes.
After that, it's simply or'ing the Gigabyte with a battery of registers. There is a MOVNTDQ (Move Double Quadword Non-Temporal) instruction that is worth studying, but afaik no similar instruction is available for or'ing memory.
HTH
.data
;x 10987654321098765432109876543210___32109876543210987654321098765432___54321098765432109876543210987654
Bits10 dd 01001000000100100000010010000000b, 00010010000001001000000100100000b, 00000100100000010010000001001000b
;x 76543210987654321098765432109876___98765432109876543210987654321098
dd 10000001001000000100100000010010b, 00100000010010000001001000000100b
;x 10987654321098765432109876543210___32109876543210987654321098765432___54321098765432109876543210987654
Bits14 dd 00100001000000001000000000000000b, 00000000000000000000100001000000b, 00000000000000000000000000000000b
;x 76543210987654321098765432109876___98765432109876543210987654321098
dd 00000000000000000000000000000000b, 00000000000000000000000000000000b
;x 76543210987654321098765432109876___98765432109876543210987654321098
dd 00000000000000000000000000000000b, 00000000000000000000000000000000b
;x 76543210987654321098765432109876___98765432109876543210987654321098
dd 00000000000000000000000000000000b, 00000000000000000000000000000000b
i guess you are a member of Euler ?
what i had in mind was to generate the data once and store it in a file
then - read the file into memory - lol
no need to generate the table every time you want to use it
Quote from: dedndave on June 27, 2013, 10:47:43 AM
i guess you are a member of Euler ?
Well, yes that also is true, but this one was a StackOverflow question (about parallelizing, I guess it was a student's homework) triggering my old enthusiasm.
I just started my old program for a 1<<32 Maximum and sat there waiting for marking of number "3", and waited and sat, and waited...
But from that point on the search for a highly efficient solution got its own game.
I think I am still in the range of calculating a GB not much slower than reading it, apart from the disc space saved. ;-)
But in the end it should be a streaming solution that writes as many GB as I want.
Once and For All. ;)
considering I didn't mess up the table:
align 16
every10th_at7:
;----------------- prolog:
db 00000001b,00000000b,01000000b,00010000b,00000100b,00000001b,00000000b,01000000b
db 00010000b,00000100b,00000001b,00000000b,01000000b,00010000b,00000100b,00000001b
;----------------- main loop starts:
db 00000000b,01000000b,00010000b,00000100b,00000001b,00000000b,01000000b,00010000b
db 00000100b,00000001b,00000000b,01000000b,00010000b,00000100b,00000001b,00000000b
db 01000000b,00010000b,00000100b,00000001b,00000000b,01000000b,00010000b,00000100b
db 00000001b,00000000b,01000000b,00010000b,00000100b,00000001b,00000000b,01000000b
db 00010000b,00000100b,00000001b,00000000b,01000000b,00010000b,00000100b,00000001b
; ---------------- unroll for a 16byte multiples writes:
db 00000000b,01000000b,00010000b,00000100b,00000001b,00000000b,01000000b,00010000b
db 00000100b,00000001b,00000000b,01000000b,00010000b,00000100b,00000001b,00000000b
db 01000000b,00010000b,00000100b,00000001b,00000000b,01000000b,00010000b,00000100b
db 00000001b,00000000b,01000000b,00010000b,00000100b,00000001b,00000000b,01000000b
db 00010000b,00000100b,00000001b,00000000b,01000000b,00010000b,00000100b,00000001b
mov rsi, _buff1
; rsi points to a buffer
mov rax, [every10th_at7]
mov rdx, [every10th_at7+8]
or [rsi], rax
or [rsi+8], rdx
add rsi, 16
mov rax, [every10th_at7+16]
mov rdx, [every10th_at7+24]
mov rdi, [every10th_at7+32]
mov rbx, [every10th_at7+40]
mov rbp, [every10th_at7+48]
; number of 40byte blocks
mov ecx, 10
@@:
or [rsi], rax
or [rsi+8], rdx
or [rsi+16], rdi
or [rsi+24], rbx
or [rsi+32], rbp
add rsi, 40
dec ecx
jnz @b
solution 2, illustrates a limitation of "x86 32-bit mode" and "POR"
mov esi, _buff1
; esi points to 16 byte aligned buffer
mov eax, [every10th_at7]
mov edx, [every10th_at7+4]
mov ecx, [every10th_at7+8]
mov edi, [every10th_at7+12]
or [esi], eax
or [esi+4], edx
or [esi+8], ecx
or [esi+12], edi
add esi, 16
movdqa xmm0, [every10th_at7+16]
movdqa xmm1, [every10th_at7+32]
movdqa xmm2, [every10th_at7+48]
movdqa xmm3, [every10th_at7+64]
mov eax, [every10th_at7+80]
mov edx, [every10th_at7+84]
mov ebx, [every10th_at7+88]
mov edi, [every10th_at7+92]
; number of 80 byte blocks
mov ecx, 5
@@:
movdqa xmm4, xmm0
movdqa xmm5, xmm1
movdqa xmm6, xmm2
movdqa xmm7, xmm3
por xmm4, [esi]
por xmm5, [esi+16]
por xmm6, [esi+32]
por xmm7, [esi+48]
or [esi+64], eax
or [esi+68], edx
or [esi+72], ebx
or [esi+76], edi
movdqa [esi], xmm4
movdqa [esi+16], xmm5
movdqa [esi+32], xmm6
movdqa [esi+48], xmm7
add esi, 80
dec ecx
jnz @b
EDIT, this code had incorrect offsets
mov rax, [every10th_at7]
mov rdx, [every10th_at7+8]
mov rdi, [every10th_at7+16]
mov rbx, [every10th_at7+24]
mov rbp, [every10th_at7+32]