Is there any function for floating point that determining the maximum value and minimum value from a set of array?
include \masm32\MasmBasic\MasmBasic.inc ; download (http://masm32.com/board/index.php?topic=94.0)
.data
MyArray REAL8 12.34, -99.0, 123.4e4, 3.0e-33, 123456.789, -123.4e4, 99.99
Init
mov esi, offset MyArray
ArrayMinMax REAL8 PTR esi:lengthof MyArray
Print Str$("The Min=\t%f\n", f:xmm0)
Inkey Str$("The Max=\t%f\n", f:xmm1)
Exit
end start
The Min= -1234000.0
The Max= 1234000.0
Documentation is here (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1170).
Quote from: jj2007 on June 13, 2012, 03:30:56 PM
include \masm32\MasmBasic\MasmBasic.inc ; download (http://masm32.com/board/index.php?topic=94.0)
.data
MyArray REAL8 12.34, -99.0, 123.4e4, 3.0e-33, 123456.789, -123.4e4, 99.99
Init
mov esi, offset MyArray
ArrayMinMax REAL8 PTR esi:lengthof MyArray
Print Str$("The Min=\t%f\n", f:xmm0)
Inkey Str$("The Max=\t%f\n", f:xmm1)
Exit
end start
The Min= -1234000.0
The Max= 1234000.0
Documentation is here (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1170).
Whoa thanks, very cool :t That is amazing. I tried to build something like this but I think the result is wrong.
a FPU solution for non-MB users:
lea edx,MyArray
mov ecx,1
fld REAL8 ptr [edx]
fld st
.while ecx < LENGTHOF MyArray - 1
fld REAL8 ptr [edx+ecx*8]
fxch st(1)
fcomi st,st(1)
fcmovnbe st,st(1)
fxch st(2)
fcomi st,st(1)
fcmovb st,st(1)
fstp st(1)
fxch
lea ecx,[ecx+1]
.endw
fstp r8Min
fstp r8Max
Quote from: qWord on June 13, 2012, 09:08:12 PM
a FPU solution for non-MB users:
Elegant solution, and competitive, too :t
AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
Getting min & max for 10000000 REAL8 values:
57177 µs for ArrayMinMax
62270 µs for FPU
56843 µs for ArrayMinMax
62275 µs for FPU
57216 µs for ArrayMinMax
62252 µs for FPU
Results:
ArrayMinMax= -888.887867/999.998657
r8MinMax= -888.887828/999.998801
Intel(R) Core(TM) i5 CPU M 520 @ 2.40GHz (SSE4)
Getting min & max for 10000000 REAL8 values:
31341 µs for ArrayMinMax
26515 µs for FPU
32582 µs for ArrayMinMax
26116 µs for FPU
30457 µs for ArrayMinMax
25545 µs for FPU
Results:
ArrayMinMax= 0.0/0.0
r8MinMax= -888.887828/999.998801
:icon_confused:
maybe the complicated loop contruction is the problem?:
CPU Disasm
Address Hex dump Command Comments
00402C49 |. BD FFFFFF7F MOV EBP,7FFFFFFF
00402C4E |> 3B06 /CMP EAX,DWORD PTR DS:[ESI]
00402C50 |. 73 02 |JNB SHORT 00402C54
00402C52 |. 8B06 |MOV EAX,DWORD PTR DS:[ESI]
00402C54 |> 3B2E |CMP EBP,DWORD PTR DS:[ESI]
00402C56 |. 7E 02 |JLE SHORT 00402C5A
00402C58 |. 8B2E |MOV EBP,DWORD PTR DS:[ESI]
00402C5A |> 3B16 |CMP EDX,DWORD PTR DS:[ESI]
00402C5C |. 7D 02 |JGE SHORT 00402C60
00402C5E |. 8B16 |MOV EDX,DWORD PTR DS:[ESI]
00402C60 |> 83FF 04 |CMP EDI,4
00402C63 |. 76 10 |JBE SHORT 00402C75
00402C65 |. F20F5F0E |MAXSD XMM1,QWORD PTR DS:[ESI]
00402C69 |. F20F5D06 |MINSD XMM0,QWORD PTR DS:[ESI]
00402C6D |. 3B5E 04 |CMP EBX,DWORD PTR DS:[ESI+4]
00402C70 |. 73 03 |JNB SHORT 00402C75
00402C72 |. 8B5E 04 |MOV EBX,DWORD PTR DS:[ESI+4]
00402C75 |> 03F7 |ADD ESI,EDI
00402C77 |. 49 |DEC ECX
00402C78 |.^ 75 D4 \JNE SHORT 00402C4E
00402C7A |. 8BCD MOV ECX,EBP
You want it faster? Here it is:
AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
Getting min & max for 10000000 REAL8 values:
56953 µs for ArrayMinMax
62258 µs for FPU
33072 µs for SSE2
56735 µs for ArrayMinMax
62351 µs for FPU
32944 µs for SSE2
56787 µs for ArrayMinMax
62246 µs for FPU
32830 µs for SSE2
Results:
ArrayMinMax= -888.887875/999.998846
r8MinMax= -888.887542/999.998829
SSE2Min= -888.887520/999.998850
But I am worried about your results for ArrayMinMax - why 0/0?? :(
Intel(R) Core(TM) i5 CPU M 520 @ 2.40GHz (SSE4)
Getting min & max for 10000000 REAL8 values:
32887 µs for ArrayMinMax
25024 µs for FPU
17790 µs for SSE2
37237 µs for ArrayMinMax
25062 µs for FPU
15093 µs for SSE2
28999 µs for ArrayMinMax
24408 µs for FPU
14722 µs for SSE2
Results:
ArrayMinMax= 0.0/0.0
r8MinMax= -888.887542/999.998829
SSE2Min= 0.0/0.0
Indeed, the result is strange...
Does the order play a role? Attachment has a + b versions with different order.
The A problem is the start value: this must either a value of the list or the maximum and minimum values of the type (double): +-1.7976931348623157E+308.
Sure? I thought I had that right....
This is how I set the xmm initial values:
push 07FEFFFFFh ; set MinMax to xmm regs
push -1
movlps xmm0, REAL8 ptr [esp] ; MaxVal, -1.7e308
mov byte ptr [esp+7], -1 ; dirty hack - loading from mem is 8 bytes longer ;-)
movlps xmm1, REAL8 ptr [esp] ; MinVal, +1.7e308
What are your values for xmm0/xmm1 at the first and second int 3?
int 3
mov esi, Chr$("xmm0=-888, xmm1=+999??")
ArrayMinMax MyR8()
int 3
Another possibility is that xmm regs get trashed somewhere in the Print Str$() process. Which Windows version are you running?
Third option: Rand() is not working on your system ::)
For the second, the correct result is placed in the registers - it gets overwritten SetConsoleCP() (Win7,x64)!
However, my above statement is correct, even it is not the problem here :biggrin:
Think about a list that has only positive values: 1,2,3,4....
When you start with 0, you will get zero as the minimum, instead of 1
Quote from: qWord on June 13, 2012, 10:39:21 PM
For the second, the correct result is placed in the registers - it gets overwritten SetConsoleCP() (Win7,x64)!
However, my above statement is correct, even it is not the problem here :biggrin:
Think about a list that has only positive values: 1,2,3,4....
When you start with 0, you will get zero as the minimum, instead of 1
That's correct but it is not the reason, see above in bold.
So I have to save xmm regs before calling SetConsoleCP, grrrr!
:t Whoa you guys are an experts now, just in a short time. I never able to make it myself. Now my graphic system is completed. I can calculate everything automatically from adjusting the center, pointing where the model from view, and the most important thing is, now the shadow system is complete.
Another FPU solution, I think probably slow, with min and max as separate procedures, and since this was for graphics I guessed REAL4 instead of REAL8.
;==============================================================================
include \masm32\include\masm32rt.inc
.686
;==============================================================================
;-------------------------------------
; These from VC Toolkit 2003 float.h:
;-------------------------------------
FLT_MAX equ 3.402823466e+38
DBL_MAX equ 1.7976931348623158e+308
;==============================================================================
.data
array real4 -8.8, -3.9, 111.5, 0.5, 3.6, 1.2, 4.9, 9.9, -98.2, 0.0
r8 real8 ?
.code
;==============================================================================
fltmax proc p:dword, n:dword
local _max:real4
mov ecx, n
mov edx, p
fld4 -FLT_MAX
fstp _max
L0:
fld real4 ptr [edx+ecx*4]
fld _max
fcomip st, st(1)
ja L1
fst _max
L1:
fstp st
sub ecx, 1
jns L0
fld _max
ret
fltmax endp
;==============================================================================
fltmin proc p:dword, n:dword
local _min:real4
mov ecx, n
mov edx, p
fld4 FLT_MAX
fstp _min
L0:
fld real4 ptr [edx+ecx*4]
fld _min
fcomip st, st(1)
jb L1
fst _min
L1:
fstp st
sub ecx, 1
jns L0
fld _min
ret
fltmin endp
;==============================================================================
start:
;==============================================================================
invoke fltmin, addr array, lengthof array
fstp r8
printf("%.1f\n",r8)
invoke fltmax, addr array, lengthof array
fstp r8
printf("%.1f\n",r8)
inkey
exit
;==============================================================================
end start
Quote from: qWord on June 13, 2012, 10:20:11 PM
The A problem is the start value: this must either a value of the list or the maximum and minimum values of the type (double): +-1.7976931348623157E+308.
Always initialize the min and max values from the first entry in the list, that way you will never get a min or max that is not in the list.
Dave.
Quote from: KeepingRealBusy on June 14, 2012, 01:38:07 AM
Quote from: qWord on June 13, 2012, 10:20:11 PM
The A problem is the start value: this must either a value of the list or the maximum and minimum values of the type (double): +-1.7976931348623157E+308.
Always initialize the min and max values from the first entry in the list, that way you will never get a min or max that is not in the list.
Dave.
they can get but i use to do min=max=first entry and then compare
the second, etc.
MichaelW,
I would like to understand this fltmin procedure
Could you help me ?
1. If we have 10 numbers in the array
lengthof array is 10 no ?
2. How to access the last value ?
fltmin proc p:dword, n:dword
local _min:real4
mov ecx, n
mov edx, p
fld4 FLT_MAX
fstp _min
L0:
fld real4 ptr [edx+ecx*4]
fld _min
fcomip st, st(1)
jb L1
fst _min
L1:
fstp st
sub ecx, 1
jns L0
fld _min
ret
[/coed]
Looks to me that you have to decrement ecx before you start the loop whenever you use a reg as both an index and a count, otherwise you are accessing an entry outside of the array.
At least that is what I have been doing for 47 years of programming. In addition, the array will be accessed in the reverse order (last to first), which is valid in this particular algo.
Dave.
Yes, I intended that ECX be decremented before the start of the loop. As it is the code accesses the first (index 9) through the last (index 0) values, but unfortunately it also accesses the least-significant dword of r8 (index 10).
Quote from: qWord on June 13, 2012, 10:20:11 PM
The A problem is the start value: this must either a value of the list or the maximum and minimum values of the type (double): +-1.7976931348623157E+308.
The start value was ok, but taking a value from the list is actually more efficient, thanks for the idea :icon14:
The problem is really that Win7-64 trashes lots of xmm regs. For now, I have identified SetConsoleCP (used in Print) and QueryPerformanceFrequency (used in NanoTimer) as culprits - the latter trashes xmm0...xmm5. Win7-32 does not do such nasty things. Will keep you posted :biggrin:
MichaelW,
Yes the problem was to call fltmin with lengthof array :t
I dont like that
fld4 FLT_MAX and
jb L1 ; why to set it again if it is = _min ?
fst _min ; one _min is better than the other ?
Dave,
Yes the problem was to call fltmin with lengthof array
Yes we know that the array is accessed in the reverse order
and we can do _min=last. No problem. Reverse order is usually
what i do, i have not a count !
Quote
47 years of programming
I guess you are nearly 70, no ? ;)
RuiLoureiro,
73 this year. You are dealing with an old dog that doesn't learn new tricks too easily.
Dave.
Dave,
I hope you live the others 73 ! ;)
I hope to get that number one day in the near future
Best regards
Rui Loureiro
Well the problem is to help Farabi
MichaelW,
Take a look at this
Based on the example we can write this:
It should work for n=1 to ... N
I dont tested it
"printf" doesnt work wirh me
For REAL8, replace real4 by real8
and *4 by *8 i think
We can also replace jmp start by sub ecx, 1 but...
Quote
;n=lengthof array
MyMin proc p:dword, n:dword
local _min:real4
mov ecx, n
mov edx, p
sub ecx, 1
;
fld real4 ptr [edx+ecx*4]
fstp _min
jmp start
L0:
fld real4 ptr [edx+ecx*4]
fld _min
fcomip st, st(1)
jbe L1
fst _min
L1:
fstp st
start:
sub ecx, 1
jns L0
fld _min
ret
MyMin endp
Quote
;n=lengthof array
MyMax proc p:dword, n:dword
local _max:real4
mov ecx, n
mov edx, p
sub ecx, 1
;
fld real4 ptr [edx+ecx*4]
fstp _max
jmp start
L0:
fld real4 ptr [edx+ecx*4]
fld _max
fcomip st, st(1)
jae L1
fst _max
L1:
fstp st
start:
sub ecx, 1
jns L0
fld _max
ret
MyMax endp
In my tests there was no significant speed advantage of JAE/JBE over JA/JB (even though intuitively it seems to me that there should be), and no speed advantage of setting min/max to the first element over setting it to +/- FLT_MAX.
;==============================================================================
include \masm32\include\masm32rt.inc
.686
;==============================================================================
;-------------------------------------
; These from VC Toolkit 2003 float.h:
;-------------------------------------
FLT_MAX equ 3.402823466e+38
DBL_MAX equ 1.7976931348623158e+308
;==============================================================================
.data
array real4 -8.8, -3.9, 111.5, 0.5, 3.6, 1.2, 4.9, 9.9, -98.2, 0.0
r4 real4 ?
r8 real8 ?
.code
;==============================================================================
;------------------------------------------------------------------------
; This is Abel's version of a Park-Miller-Carta generator, details here:
; http://www.masm32.com/board/index.php?topic=6558.0
; Modified to return a floating-point value in the interval [0,1) at
; the top of the FPU stack in ST(0), as per the normal convention.
;
; The period of the core generator is 2147483646 (tested), and it runs
; in 23 cycles on a P3, including the call overhead and a fstp to store
; the result to memory. Note that setting frnd_divider to the period
; instead of to a power of 2 caused a 2x slowdown.
;------------------------------------------------------------------------
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
align 4
frnd proc
.data
align 8
abel_rand_seed dd 1
frnd_divider dq 2147483648
.code
mov eax, abel_rand_seed
mov ecx, 16807 ; a = 7^5
mul ecx ; edx:eax == a*seed == D:A
mov ecx, 7fffffffh ; ecx = m
add edx, edx ; edx = 2*D
cmp eax, ecx ; eax = A
jna @F
sub eax, ecx ; if A>m, A = A - m
@@:
add eax, edx ; eax = A + 2*D
jns @F
sub eax, ecx ; If (A + 2*D)>m
@@:
mov abel_rand_seed, eax ; save new seed
fild abel_rand_seed
fild frnd_divider
fdiv
ret
frnd endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;==============================================================================
fltmax proc p:dword, n:dword
local _max:real4
;xor esi, esi
mov ecx, n
dec ecx
mov edx, p
;fld4 -FLT_MAX
fld real4 ptr [edx+ecx*4]
fstp _max
L0:
;inc esi
fld real4 ptr [edx+ecx*4]
fld _max
fcomip st, st(1)
ja L1
;inc esi
fst _max
L1:
fstp st
sub ecx, 1
jns L0
fld _max
;printf("max:%d\n",esi)
ret
fltmax endp
;==============================================================================
fltmin proc p:dword, n:dword
local _min:real4
;xor esi, esi
mov ecx, n
dec ecx
mov edx, p
;fld4 FLT_MAX
fld real4 ptr [edx+ecx*4]
fstp _min
L0:
;inc esi
fld real4 ptr [edx+ecx*4]
fld _min
fcomip st, st(1)
jb L1
;inc esi
fst _min
L1:
fstp st
sub ecx, 1
jns L0
fld _min
;printf("min:%d\n",esi)
ret
fltmin endp
;==============================================================================
start:
;==============================================================================
invoke Sleep, 3000
mov esi, alloc(10000000*4)
xor ebx, ebx
invoke GetTickCount
movzx edi, ax
.WHILE ebx < edi
invoke frnd
fstp r4
add ebx, 1
.ENDW
xor ebx, ebx
.WHILE ebx < 10000000
invoke frnd
fld4 999.0
fmul
fld4 498.0
fsub
fstp r4
mov eax, dword ptr r4
mov [esi+ebx*4], eax
add ebx, 1
.ENDW
invoke GetTickCount
push eax
invoke fltmin, esi, 9999999
fstp r8
;printf("%.1f\n",r8)
invoke fltmax, esi, 9999999
fstp r8
;printf("%.1f\n",r8)
invoke GetTickCount
pop edx
sub eax, edx
printf("%d\n",eax)
free esi
inkey
exit
;==============================================================================
end start
The only advantage I can see of setting min/max to the first element (or any other particular element) instead of setting it to +/- FLT_MAX would be if there is a significant error in the value of FLT_MAX, and the array contains values that are close to +/- FLT_MAX.
MichaelW,
Thank you for testing this cases.
Meanwhile, my question is why to find out the max value
(if it is 3.402823466e+38 or 3.402843566e+38 or
or 1.7976931348623158e+308 or 1.7976941348623158e+308) ?
Why if we have a set of N values and we can start with
one of them ? I have one answer: to invent a different way.
To me it makes no sense, simply. I never use it.
Second question is this: In the following fltmax procedure
you set _max with real4 ptr [edx+ecx*4]
fld real4 ptr [edx+ecx*4]
fstp _max
and next you compare the same values
fld real4 ptr [edx+ecx*4]
fld _max
fcomip st, st(1)
and next you set real4 ptr [edx+ecx*4] to _max again
fst _max
because it is the same first value
Quote
fltmax proc p:dword, n:dword
local _max:real4
;xor esi, esi
mov ecx, n
dec ecx
mov edx, p
;fld4 -FLT_MAX
fld real4 ptr [edx+ecx*4]
fstp _max ; put real4 ptr [edx+ecx*4] to _max
L0:
;inc esi
fld real4 ptr [edx+ecx*4]
fld _max
fcomip st, st(1) ; compare real4 ptr [edx+ecx*4] with _max
ja L1
;inc esi
fst _max ; put it again
L1:
fstp st
sub ecx, 1
jns L0
fld _max
;printf("max:%d\n",esi)
ret
fltmax endp
It should be
Quote
fltmax proc p:dword, n:dword
local _max:real4
;xor esi, esi
mov ecx, n
dec ecx
mov edx, p
;fld4 -FLT_MAX
fld real4 ptr [edx+ecx*4]
fstp _max
dec ecx
L0:
;inc esi
fld real4 ptr [edx+ecx*4]
fld _max
fcomip st, st(1) ; compare the first with another
ja L1
;inc esi
fst _max
L1:
fstp st
sub ecx, 1
jns L0
fld _max
;printf("max:%d\n",esi)
ret
fltmax endp
Quote from: jj2007 on June 14, 2012, 03:56:15 AM
The problem is really that Win7-64 trashes lots of xmm regs. For now, I have identified SetConsoleCP (used in Print) and QueryPerformanceFrequency (used in NanoTimer) as culprits - the latter trashes xmm0...xmm5. Win7-32 does not do such nasty things. Will keep you posted :biggrin:
Hey jj, remember the x64 calling convention? XMM0..XMM5 are all volatile. Register Usage (http://msdn.microsoft.com/en-us/library/9z1stfyw.aspx)
Quote from: sinsi on June 14, 2012, 08:59:54 PM
Hey jj, remember the x64 calling convention? XMM0..XMM5 are all volatile. Register Usage (http://msdn.microsoft.com/en-us/library/9z1stfyw.aspx)
Hey sinsi,
The link is good but they are talking x64 architecture, and the code runs in x32 mode. Under Win7-32, none of the XMM regs is ever being trashed, now five of them are zeroed. I bet that must break quite a bit of existing code ::)
Ouch, a problem I guess. Although what does the Microsoft x86 calling convention say about xmm registers?
MichaelW,
I installed masm32 v10 and now i can run the example
With
fld4 FLT_MAX
fld4 -FLT_MAX
i get 63
Without it i get only 47
But with DEC ECX only 46
Ok 47 to 46 is not too much
but my logic says we dont need to compare the same
values 2 times.
I have P4 3GHz
Quote from: MichaelW on June 14, 2012, 02:36:02 PM
The only advantage I can see of setting min/max to the first element (or any other particular element) instead of setting it to +/- FLT_MAX would be if there is a significant error in the value of FLT_MAX, and the array contains values that are close to +/- FLT_MAX.
Michael,
If the array held the values 1,2,3,4,5, then setting the max to +FLT_MAX would end up with a max value of +FLT_MAX when it should be 5. Always seed max and min with the first value, predecrement the index, then loop with jnz (since the first element was the basis of min/max) instead of jns. Jns is required if you are summing an array from the end to the start and need to include the first element.
Dave.
I preset max to -FLT_MAX and min to +FLT_MAX. Since FLT_MAX came from the Microsoft header files, I'm assuming that it is correct.
I will concede that decrementing the count, seeding with the first value, and looping with jnz will have a significant speed advantage on very small arrays.
Quote from: MichaelW on June 15, 2012, 11:50:20 AM
I preset max to -FLT_MAX and min to +FLT_MAX. Since FLT_MAX came from the Microsoft header files, I'm assuming that it is correct.
Michael,
I think the opposite should be true: preset min to -FLT_MAX and max to +FLT_MIN, that way the first entry compared will be saved as either a max or min. You will notice, though, that you will have two useless compares and conditional jumps over the number of instructions required to just set max and min to the first entry.
Another speed up, especially for huge arrays of large numbers, would be to just save the index (or pointer to the value) of the max and/or min value, and then only save the value when the end of the array was found (or first entry if scanning in reverse).
Dave.
Quote from: KeepingRealBusy on June 15, 2012, 01:16:42 PMI think the opposite should be true: preset min to -FLT_MAX and max to +FLT_MIN...
i think you have that backwards
if you set the "current min" to +FLT_MAX, any value will replace it
but, pre-loading the first value to initialize min and max seems like a better way to go
I just knew that I would screw it up. But saving the first as min and max is correct.
Dave.
Quote from: KeepingRealBusy on June 15, 2012, 01:16:42 PM
Another speed up .. would be to just save the index (or pointer to the value) of the max and/or min value, and then only save the value when the end of the array was found (or first entry if scanning in reverse).
Dave.
Against what would you compare each element then?
I converted qword's code to a procedure and applied Dave's modifications (except the last) to my code, and did a cycle count comparison.
;==============================================================================
include \masm32\include\masm32rt.inc
.686
include \masm32\macros\timers.asm
;==============================================================================
.data
array real4 -8.8, -3.9, 111.5, 0.5, 3.6, 1.2, 4.9, 9.9, -98.2, 0.0
r4 real4 ?
r8 real8 ?
r8min real8 ?
r8max real8 ?
.code
;==============================================================================
fltmax proc p:dword, n:dword
local _max:real4
mov ecx, n
dec ecx
mov edx, p
fld real4 ptr [edx+ecx*4]
fstp _max
L0:
fld real4 ptr [edx+ecx*4]
fld _max
fcomip st, st(1)
ja L1
fst _max
L1:
fstp st
sub ecx, 1
jnz L0
fld _max
ret
fltmax endp
;==============================================================================
fltmin proc p:dword, n:dword
local _min:real4
mov ecx, n
dec ecx
mov edx, p
fld real4 ptr [edx+ecx*4]
fstp _min
L0:
fld real4 ptr [edx+ecx*4]
fld _min
fcomip st, st(1)
jb L1
fst _min
L1:
fstp st
sub ecx, 1
jnz L0
fld _min
ret
fltmin endp
;==============================================================================
fltminmax proc p:dword, n:dword
mov edx, p
mov ecx, 1
mov eax, n
dec eax
fld REAL4 ptr [edx]
fld st
.while ecx < eax
fld REAL4 ptr [edx+ecx*8]
fxch st(1)
fcomi st,st(1)
fcmovnbe st,st(1)
fxch st(2)
fcomi st,st(1)
fcmovb st,st(1)
fstp st(1)
fxch
lea ecx,[ecx+1]
.endw
ret
fltminmax endp
;fstp r8Min
;fstp r8Max
;==============================================================================
start:
;==============================================================================
invoke GetCurrentProcess
invoke SetProcessAffinityMask, eax, 1
invoke fltmin, addr array, lengthof array
fstp r8
printf("%.1f\t",r8)
invoke fltmax, addr array, lengthof array
fstp r8
printf("%.1f\n",r8)
invoke fltminmax, addr array, lengthof array
fstp r8min
fstp r8max
printf("%.1f\t%.1f\n\n", r8min, r8max)
invoke Sleep, 4000
counter_begin 10000000, REALTIME_PRIORITY_CLASS
counter_end
printf("%d cycles\n", eax)
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke fltmin, addr array, lengthof array
fstp r8
counter_end
printf("%d cycles\n", eax)
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke fltmin, addr array, lengthof array
fstp r8
counter_end
printf("%d cycles\n", eax)
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke fltminmax, addr array, lengthof array
fstp r8
fstp r8
counter_end
printf("%d cycles\n\n", eax)
inkey
exit
;==============================================================================
end start
Typical on a P3:
0 cycles
90 cycles
90 cycles
90 cycles
Typical on a P4 (Northwood):
0 cycles
53 cycles
49 cycles
152 cycles
For my code I suspect that retaining the working min and max values in the FPU and would be faster, but I can't as yet see any way to do that without a substantial increase in the number of FPU instructions.
Another possibility that I played with, that I think should be much faster, is to do the comparisons entirely with integer instructions using the methods described here:
http://www.cygnus-software.com/papers/comparingfloats/Comparing%20floating%20point%20numbers.htm
MichaelW,
See your code. You dont test fltmax but
fltmin 2 times
Meanwhile
Here is the results on my P4 3GHz
-98.2 111.5
-98.2 111.5
0 cycles
106 cycles
105 cycles
247 cycles
Attached See reply #42 for a version including MichaelW's fltminmax (with a tiny correction: fld REAL4 ptr [edx+ecx*4]).
AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
Getting min & max for 10000000 REAL8 values, version B:
66868 µs for FPU
50010 µs for ArrayMinMax, REAL4
56519 µs for ArrayMinMax, REAL8
29226 µs for SSE2, REAL8
59826 µs for fltminmax
66922 µs for FPU
50006 µs for ArrayMinMax, REAL4
56466 µs for ArrayMinMax, REAL8
29395 µs for SSE2, REAL8
59925 µs for fltminmax
66974 µs for FPU
50069 µs for ArrayMinMax, REAL4
56522 µs for ArrayMinMax, REAL8
29286 µs for SSE2, REAL8
60750 µs for fltminmax
Results:
ArrayMinMax= -888.887715/999.998946
r4MinMax= -888.887756/999.998779
r8MinMax= -888.887786/999.998822
SSE2Min= -888.887910/999.998720
prescott w/htt
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
Getting min & max for 10000000 REAL8 values, version B:
103484 µs for FPU
28553 µs for ArrayMinMax, REAL4
46333 µs for ArrayMinMax, REAL8
27718 µs for SSE2, REAL8
102134 µs for fltminmax
104330 µs for FPU
25788 µs for ArrayMinMax, REAL4
44979 µs for ArrayMinMax, REAL8
27742 µs for SSE2, REAL8
101988 µs for fltminmax
106032 µs for FPU
28141 µs for ArrayMinMax, REAL4
47526 µs for ArrayMinMax, REAL8
27644 µs for SSE2, REAL8
101950 µs for fltminmax
Here is Mymin and Mymax. I explained what i did
Here the results:
-198.4 421.7
-198.4 421.7
-198.4 302223021437139660000000000000000.0
0 cycles
658 cycles ; Mymin
673 cycles ; Mymax
854 cycles ; fltmin
751 cycles ; fltmax
5957 cycles
Press any key to continue ...
Quote
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
Mymin proc p:dword, n:dword
mov ecx, [esp+8] ;n
sub ecx, 1
mov edx, [esp+4] ;p
fld real4 ptr [edx+ecx*4] ; set st(0) to MIN value
sub ecx, 1 ; point to the next value
L0:
fld real4 ptr [edx+ecx*4]
fcomi st, st(1) ; compare st(1)=MIN with st(0)
jae L1
fstp st(1) ; remove the MIM, st(0) is the MIN
sub ecx, 1
jns L0 ; if ecx>0 or ecx=0 loop to L0
ret 8
L1: fstp st
sub ecx, 1
jns L0 ; if ecx>0 or ecx=0 loop to L0
ret 8
Mymin endp
;...***---
Mymax proc p:dword, n:dword
mov ecx, [esp+8] ;n
sub ecx, 1
mov edx, [esp+4] ;p
fld real4 ptr [edx+ecx*4] ; set st(0) to MAX value
sub ecx, 1 ; point to the next value
L0:
fld real4 ptr [edx+ecx*4] ; point to the next value
fcomi st, st(1) ; compare st(1)=MAX with st(0)
jbe L1
fstp st(1) ; remove the MAx, st(0) is the MAX
sub ecx, 1
jns L0 ; if ecx>0 or ecx=0 loop to L0
ret 8
L1:
fstp st
sub ecx, 1
jns L0 ; if ecx>0 or ecx=0 loop to L0
ret 8
Mymax endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;==============================================================================
include \masm32\include\masm32rt.inc
.686
include \masm32\macros\timers.asm
;==============================================================================
lenarray equ 90
.data
align 4
array real4 -8.8, -3.9, 111.5, 0.5, 3.6, 1.2, 4.9, 9.9, -98.2, 0.0
array1 real4 -8.7, -3.5, 121.5, 1.5, 8.6, 2.3, 5.9, 19.9, -198.3, 1.0
array2 real4 -6.7, -2.5, 221.5, 4.5, 7.6, 1.3, 6.9, 0.9, -97.4, 2.0
array3 real4 -8.9, -3.5, 121.5, 3.5, -2.6, 1.5, 5.9, 1.9, -98.4, 1.1
array4 real4 -5.7, -1.5, 421.7, 1.6, 2.6, 4.3, 15.9, 7.9,-198.4, 1.2
array5 real4 -8.7, -3.5, 121.5, 1.5, 8.6, 2.3, 5.9, 19.9, -198.3, 1.0
array6 real4 -6.7, -2.5, 221.5, 4.5, 7.6, 1.3, 6.9, 0.9, -97.4, 2.0
array7 real4 -8.9, -3.5, 121.5, 3.5, -2.6, 1.5, 5.9, 1.9, -98.4, 1.1
array8 real4 -5.7, -1.5, 421.7, 1.6, 2.6, 4.3, 15.9, 7.9,-198.4, 1.2
r4 real4 ?
r8 real8 ?
r8min real8 ?
r8max real8 ?
.code
;==============================================================================
;==============================================================================
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
Mymin proc p:dword, n:dword
mov ecx, [esp+8] ;n
sub ecx, 1
mov edx, [esp+4] ;p
fld real4 ptr [edx+ecx*4] ; set st(0) to MIN value
sub ecx, 1 ; point to the next value
L0:
fld real4 ptr [edx+ecx*4]
fcomi st, st(1) ; compare st(1)=MIN with st(0)
jae L1
fstp st(1) ; remove the MIM, st(0) is the MIN
sub ecx, 1
jns L0 ; if ecx>0 or ecx=0 loop to L0
ret 8
L1: fstp st
sub ecx, 1
jns L0 ; if ecx>0 or ecx=0 loop to L0
ret 8
Mymin endp
Mymax proc p:dword, n:dword
mov ecx, [esp+8] ;n
sub ecx, 1
mov edx, [esp+4] ;p
fld real4 ptr [edx+ecx*4] ; set st(0) to MAX value
sub ecx, 1 ; point to the next value
L0:
fld real4 ptr [edx+ecx*4] ; point to the next value
fcomi st, st(1) ; compare st(1)=MAX with st(0)
jbe L1
fstp st(1) ; remove the MAx, st(0) is the MAX
sub ecx, 1
jns L0 ; if ecx>0 or ecx=0 loop to L0
ret 8
L1:
fstp st
sub ecx, 1
jns L0 ; if ecx>0 or ecx=0 loop to L0
ret 8
Mymax endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
fltmax proc p:dword, n:dword
local _max:real4
mov ecx, n
dec ecx
mov edx, p
fld real4 ptr [edx+ecx*4]
fstp _max
;dec ecx
L0:
fld real4 ptr [edx+ecx*4]
fld _max
fcomip st, st(1)
ja L1
;jae L1
fst _max
L1:
fstp st
sub ecx, 1
jnz L0 ; jnz ? IF ecx = 0 we must compare !
fld _max
ret
fltmax endp
;==============================================================================
fltmin proc p:dword, n:dword
local _min:real4
mov ecx, n
dec ecx
mov edx, p
fld real4 ptr [edx+ecx*4]
fstp _min
;dec ecx
L0:
fld real4 ptr [edx+ecx*4]
fld _min
fcomip st, st(1)
jb L1
;jbe L1
fst _min
L1:
fstp st
sub ecx, 1
jnz L0 ; jnz ? IF ecx = 0 we must compare !
fld _min
ret
fltmin endp
;==============================================================================
fltminmax proc p:dword, n:dword
mov edx, p
mov ecx, 1
mov eax, n
dec eax
fld REAL4 ptr [edx]
fld st
.while ecx < eax
fld REAL4 ptr [edx+ecx*8]
fxch st(1)
fcomi st,st(1)
fcmovnbe st,st(1)
fxch st(2)
fcomi st,st(1)
fcmovb st,st(1)
fstp st(1)
fxch
lea ecx,[ecx+1]
.endw
ret
fltminmax endp
;fstp r8Min
;fstp r8Max
;==============================================================================
start:
;==============================================================================
invoke GetCurrentProcess
invoke SetProcessAffinityMask, eax, 1
invoke Mymin, addr array, lenarray ;lengthof array
fstp r8
printf("%.1f\t",r8)
invoke Mymax, addr array, lenarray ;lengthof array
fstp r8
printf("%.1f\n",r8)
;----------------------------------------
invoke fltmin, addr array, lenarray ;lengthof array
fstp r8
printf("%.1f\t",r8)
invoke fltmax, addr array, lenarray ;lengthof array
fstp r8
printf("%.1f\n",r8)
invoke fltminmax, addr array, lenarray ;lengthof array
fstp r8min
fstp r8max
printf("%.1f\t%.1f\n\n", r8min, r8max)
invoke Sleep, 4000
counter_begin 10000000, REALTIME_PRIORITY_CLASS
counter_end
printf("%d cycles\n", eax)
;------------------------------
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke Mymin, addr array, lenarray ;lengthof array
fstp r8
counter_end
printf("%d cycles\n", eax)
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke Mymax, addr array, lenarray ;lengthof array
fstp r8
counter_end
printf("%d cycles\n", eax)
;--------------------------------
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke fltmin, addr array, lenarray ;lengthof array
fstp r8
counter_end
printf("%d cycles\n", eax)
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke fltmin, addr array, lenarray ;lengthof array
fstp r8
counter_end
printf("%d cycles\n", eax)
counter_begin 10000000, REALTIME_PRIORITY_CLASS
invoke fltminmax, addr array, lenarray ;lengthof array
fstp r8
fstp r8
counter_end
printf("%d cycles\n\n", eax)
inkey
exit
;==============================================================================
end start
Quote from: RuiLoureiro on June 15, 2012, 11:24:48 PM
Here is Mymin and Mymax.
Thanks, added to testbed:
AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
Getting min & max for 10000000 REAL8 values, version B:
67004 µs for FPU
49927 µs for ArrayMinMax, REAL4
56748 µs for ArrayMinMax, REAL8
29678 µs for SSE2, REAL8
59847 µs for fltminmax
20046 µs for Mymin
28936 µs for Mymin
50009 µs for Mymin+Mymax
66939 µs for FPU
50074 µs for ArrayMinMax, REAL4
56960 µs for ArrayMinMax, REAL8
29261 µs for SSE2, REAL8
60256 µs for fltminmax
20170 µs for Mymin
29049 µs for Mymin
48802 µs for Mymin+Mymax
Results (numbers are always a bit different because the pseudo random generator produces a new set every time):
ArrayMinMax= -888.887786/999.998822
r4MinMax= -888.887695/999.998962
r4MinMax= -888.887634/999.998535
r8MinMax= -888.887485/999.998558
SSE2Min= -888.887614/999.998955
:biggrin:
Jochen,
Thanks for Mymin and Mymax ;)
You are too fast !
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
Getting min & max for 10000000 REAL8 values, version B:
121636 µs for FPU
43742 µs for ArrayMinMax, REAL4
61251 µs for ArrayMinMax, REAL8
27210 µs for SSE2, REAL8
117418 µs for fltminmax
28273 µs for Mymin
28194 µs for Mymin
52216 µs for Mymin+Mymax
116571 µs for FPU
60052 µs for ArrayMinMax, REAL4
93684 µs for ArrayMinMax, REAL8
38531 µs for SSE2, REAL8
113058 µs for fltminmax
27492 µs for Mymin
24843 µs for Mymin
50303 µs for Mymin+Mymax
Results:
ArrayMinMax= -888.887786/999.998822
r4MinMax= -888.887695/999.998962
r4MinMax= -888.887634/999.998535
r8MinMax= -888.887485/999.998558
SSE2Min= -888.887614/999.998955
Quote from: jj2007 on June 15, 2012, 11:36:13 PMnumbers are always a bit different because the pseudo random generator produces a new set every time
seems like they ought to test the same array :redface:
Quote from: dedndave on June 15, 2012, 11:50:17 PM
Quote from: jj2007 on June 15, 2012, 11:36:13 PMnumbers are always a bit different because the pseudo random generator produces a new set every time
seems like they ought to test the same array :redface:
Dave,
Could you show what you get ?
Here is Myminmax
On exit st(0) = MIN
and st(1) = MAX
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
Myminmax proc p:dword, n:dword
mov ecx, [esp+8] ;n
sub ecx, 1
mov edx, [esp+4] ;p
fld real4 ptr [edx+ecx*4] ; set st(1) to MAX value
fld st(0) ; set st(0) to MIN value
sub ecx, 1 ; point to the next value
L0:
fld real4 ptr [edx+ecx*4]
fcomi st, st(1) ; compare st(1)=MIN with st(0)
jae L1
fxch st(1)
jb L2
L1: fcomi st, st(2) ; compare st(2)=MAX with st(0)
jbe L2
fxch st(2)
L2: fstp st
sub ecx, 1
jns L0 ; if ecx>0 or ecx=0 loop to L0
ret 8
Myminmax endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Quote from: RuiLoureiro on June 16, 2012, 12:18:27 AMDave,
Could you show what you get ?
sure :biggrin:
prescott w/htt
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
Getting min & max for 10000000 REAL8 values, version B:
105308 µs for FPU
32522 µs for ArrayMinMax, REAL4
46646 µs for ArrayMinMax, REAL8
27849 µs for SSE2, REAL8
101967 µs for fltminmax
21209 µs for Mymin
20266 µs for Mymin
43820 µs for Mymin+Mymax
103278 µs for FPU
29161 µs for ArrayMinMax, REAL4
48483 µs for ArrayMinMax, REAL8
27742 µs for SSE2, REAL8
101931 µs for fltminmax
20891 µs for Mymin
22931 µs for Mymin
39875 µs for Mymin+Mymax
Results:
ArrayMinMax= -888.887786/999.998822
r4MinMax= -888.887695/999.998962
r4MinMax= -888.887634/999.998535
r8MinMax= -888.887485/999.998558
SSE2Min= -888.887614/999.998955
Quote from: dedndave on June 15, 2012, 11:50:17 PM
Quote from: jj2007 on June 15, 2012, 11:36:13 PMnumbers are always a bit different because the pseudo random generator produces a new set every time
seems like they ought to test the same array :redface:
RandFill proc uses ebx
mov MbRndSeed, Mirror$("Ciao")
... but it doesn't make your algos faster :biggrin:
Here is my laptop:
Intel(R) Pentium(R) 4 CPU 3.20GHz (SSE2)
Getting min & max for 10000000 REAL8 values, version B:
72043 µs for FPU
28304 µs for ArrayMinMax, REAL4
43843 µs for ArrayMinMax, REAL8
34622 µs for SSE2, REAL8
69628 µs for fltminmax
20043 µs for Mymin
19272 µs for Mymin
38684 µs for Mymin+Mymax
72595 µs for FPU
25563 µs for ArrayMinMax, REAL4
41723 µs for ArrayMinMax, REAL8
39040 µs for SSE2, REAL8
72373 µs for fltminmax
19587 µs for Mymin
21988 µs for Mymin
43626 µs for Mymin+Mymax
Results:
ArrayMinMax= -888.887786/999.998822
r4MinMax= -888.887695/999.998962
r4MinMax= -888.887634/999.998535
r8MinMax= -888.887485/999.998558
SSE2Min= -888.887614/999.998955
Dave.
Thank you all for testing it
Jochen,
Quote
but it doesn't make your algos faster
:biggrin:
No, the result would be «00009 µs for Mymin»
More ... or ... less ! :greensml:
MichaelW,
Take a look at your reply #37
Quote
I converted qword's code to a procedure and applied Dave's modifications
(except the last) to my code, and did a cycle count comparison.
You NEVER compare the first value
because you did this:
sub ecx, 1
jnz L0 ; when ECX=0 dont COMPARE !
We MUST use jns and NOT jnz
sub ecx, 1
jns L0 ; when ECX>0 or ECX= 0 COMPARE !
KeepingRealBusy
Hi Dave
Take a look at your reply #31
Quote
Always seed max and min with the first value, predecrement the index,
then loop with jnz (since the first element was the basis of min/max) instead of jns
No. We need to use
jns instead of
jnz Your error is here : "the first element was the basis of min/max"
Here, the first element (is the last) is at ECX=length_of_array-1
and not at ECX=0.
In any way, if we test from first (ECX=0) to last [ECX=length_of_array-1]
we compare ECX with length_of_array
add ecx, 1
cmp ecx, length_of_array
jne L?
Rui,
My statement is correct. When I said First, I meant First (element index 0 - whatever the array pointer points to). With using the index as both an index and a count, the first COMPARE will be the last element against the Low (if min testing) or against the high (if max testing), then decrementing the index/count, and skipping index 0 because the value was initially put into min or max (or both if testing min/max together).
Dave.
Quote from: KeepingRealBusy on June 16, 2012, 02:25:17 AM
Rui,
My statement is correct. When I said First, I meant First (element index 0 - whatever the array pointer points to). With using the index as both an index and a count, the first COMPARE will be the last element against the Low (if min testing) or against the high (if max testing), then decrementing the index/count, and skipping index 0 because the value was initially put into min or max (or both if testing min/max together).
Dave
Dave,
Yes, this statement is completely correct.
Now, i understood ! ;)
But i would say it is out of context because we are using
the last value as the first min/max and not «element index 0».
And this is why MichaelW changed jns to jnz but not
the first element.
Following your suggestion, we could write a proc without one
instruction dec ecx.
Here are the new procedures
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
Mymin proc p:dword, n:dword
mov ecx, [esp+8] ;n
;sub ecx, 1
mov edx, [esp+4] ;p
;fld real4 ptr [edx+ecx*4] ; set st(0) to MIN value
fld real4 ptr [edx] ; set st(0) to MIN value
sub ecx, 1 ; point to the next value
L0:
fld real4 ptr [edx+ecx*4]
fcomi st, st(1) ; compare st(1)=MIN with st(0)
jae L1
fstp st(1) ; remove the MIM, st(0) is the MIN
sub ecx, 1
;jns L0 ; if ecx>0 or ecx=0 loop to L0
jnz L0 ; if ecx>0 loop to L0
ret 8
L1: fstp st
sub ecx, 1
;jns L0 ; if ecx>0 or ecx=0 loop to L0
jnz L0 ; if ecx>0 loop to L0
ret 8
Mymin endp
Mymax proc p:dword, n:dword
mov ecx, [esp+8] ;n
;sub ecx, 1
mov edx, [esp+4] ;p
;fld real4 ptr [edx+ecx*4] ; set st(0) to MAX value
fld real4 ptr [edx] ; set st(0) to MIN value
sub ecx, 1 ; point to the next value
L0:
fld real4 ptr [edx+ecx*4] ; point to the next value
fcomi st, st(1) ; compare st(1)=MAX with st(0)
jbe L1
fstp st(1) ; remove the MAX, st(0) is the MAX
sub ecx, 1
;jns L0 ; if ecx>0 or ecx=0 loop to L0
jnz L0 ; if ecx>0 loop to L0
ret 8
L1:
fstp st
sub ecx, 1
;jns L0 ; if ecx>0 or ecx=0 loop to L0
jnz L0 ; if ecx>0 loop to L0
ret 8
Mymax endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
This is the new minmax
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
Myminmax proc p:dword, n:dword
mov ecx, [esp+8] ;n
;sub ecx, 1
mov edx, [esp+4] ;p
;fld real4 ptr [edx+ecx*4] ; set st(1) to MAX value
fld real4 ptr [edx] ; set st(1) to MAX value
fld st(0) ; set st(0) to MIN value
sub ecx, 1 ; point to the next value
L0:
fld real4 ptr [edx+ecx*4]
fcomi st, st(1) ; compare st(1)=MIN with st(0)
jae L1
fxch st(1)
jb L2
L1: fcomi st, st(2) ; compare st(2)=MAX with st(0)
jbe L2
fxch st(2)
L2: fstp st
sub ecx, 1
;jns L0 ; if ecx>0 or ecx=0 loop to L0
jnz L0 ; if ecx>0 loop to L0
ret 8
Myminmax endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Rui,
Under those conditions your code is correct and you need the jns to compare the first element.
You are setting min and max to the last value, and then comparing the next to the last values in turn against the min and max values.
Dave.
Dave,
Did you see the last code i posted ?
It seems it is better. The first Min and Max is now
the «element index 0», so i used jnz because
we dont need to compare that element again
(if ecx=0, exit)
Rui,
I did download some version and tested the .exe, (see above) but have not yet examined all of the code, so I'm not sure that I have your last version.
Dave.
Quote from: RuiLoureiro on June 16, 2012, 02:48:54 AM
This is the new minmax
Pretty fast, second best on my CPU:
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
Getting min & max for 10000000 REAL4 and REAL8 values:
47 ms for FPU
70 ms for ArrayMinMax, REAL4
77 ms for ArrayMinMax, REAL8
31 ms for SSE2JJ, REAL8
45 ms for fltminmax
36 ms for Myminmax, REAL4
47 ms for FPU
70 ms for ArrayMinMax, REAL4
78 ms for ArrayMinMax, REAL8
31 ms for SSE2JJ, REAL8
46 ms for fltminmax
36 ms for Myminmax, REAL4
Results:
ArrayMinMax= -888.887818/999.998966
r4MinMax= -888.887817/999.998962
r4MinMax= -888.887817/999.998962
r8MinMax= -888.887818/999.998966
SSE2Min= -888.887818/999.998966
51 bytes for fltminmax
42 bytes for Myminmax
29 bytes for SSE2JJ
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
Getting min & max for 10000000 REAL4 and REAL8 values:
120 ms for FPU
37 ms for ArrayMinMax, REAL4
54 ms for ArrayMinMax, REAL8
28 ms for SSE2a, REAL8
26 ms for SSE2b, REAL8
112 ms for fltminmax
24 ms for Mymin
30 ms for Mymin
47 ms for Mymin+Mymax
39 ms for Myminmax
109 ms for FPU
36 ms for ArrayMinMax, REAL4
81 ms for ArrayMinMax, REAL8
37 ms for SSE2a, REAL8
29 ms for SSE2b, REAL8
109 ms for fltminmax
24 ms for Mymin
24 ms for Mymin
50 ms for Mymin+Mymax
36 ms for Myminmax
Results:
ArrayMinMax= -888.887818/999.998966
r4MinMax= -888.887817/999.998962
r4MinMax= -888.887817/999.998962
r8MinMax= -888.887818/999.998966
SSE2Min= -888.887818/999.998966
one of Mymin is Mymax Jochen !
prescott w/htt
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
Getting min & max for 10000000 REAL4 and REAL8 values:
109 ms for FPU
31 ms for ArrayMinMax, REAL4
48 ms for ArrayMinMax, REAL8
28 ms for SSE2a, REAL8
29 ms for SSE2b, REAL8
103 ms for fltminmax
20 ms for Mymin
23 ms for Mymin
42 ms for Mymin+Mymax
33 ms for Myminmax
104 ms for FPU
31 ms for ArrayMinMax, REAL4
46 ms for ArrayMinMax, REAL8
28 ms for SSE2a, REAL8
26 ms for SSE2b, REAL8
105 ms for fltminmax
20 ms for Mymin
20 ms for Mymin
44 ms for Mymin+Mymax
32 ms for Myminmax
Results:
ArrayMinMax= -888.887818/999.998966
r4MinMax= -888.887817/999.998962
r4MinMax= -888.887817/999.998962
r8MinMax= -888.887818/999.998966
SSE2Min= -888.887818/999.998966
I've streamlined it a bit, and added code sizes :biggrin:
(oh, and I forgot: it's now exactly the same array - have you noticed how fast... :eusa_boohoo:)
Rui,
Here is my laptop:
Intel(R) Pentium(R) 4 CPU 3.20GHz (SSE2)
Getting min & max for 10000000 REAL4 and REAL8 values:
72 ms for FPU
24 ms for ArrayMinMax, REAL4
45 ms for ArrayMinMax, REAL8
35 ms for SSE2JJ, REAL8
70 ms for fltminmax
25 ms for Myminmax, REAL4
75 ms for FPU
26 ms for ArrayMinMax, REAL4
42 ms for ArrayMinMax, REAL8
40 ms for SSE2JJ, REAL8
73 ms for fltminmax
28 ms for Myminmax, REAL4
Results:
ArrayMinMax= -888.887818/999.998966
r4MinMax= -888.887817/999.998962
r4MinMax= -888.887817/999.998962
r8MinMax= -888.887818/999.998966
SSE2Min= -888.887818/999.998966
51 bytes for fltminmax
42 bytes for Myminmax
29 bytes for SSE2JJ
bye
I assume these functions return the max or min in st(0) or the min/max in st(0) and st(1.) The code for all three (Mymin, Mymax, Myminmax) look good except for one pesky problem, they will fail for exactly one entry in the array (they will walk off of the end of the array). You need to insert "jnz L0 ret 8" in front of L0.
I find FPU code hard to follow, I usually code it in C and copy the generated code from the .cod file (the third tenant of the programming creed "cheat lie and steal").
Dave.
Hi Dave,
I need to answer this way
1.
Quote
The code for all three (Mymin, Mymax, Myminmax) look good
Yes you are right
2.
Quote
except for one pesky problem
Well, let's go to see where is the problem !
3.
Quote
they will fail for exactly one entry in the array
Well, if THEY fail, Mymin fails (for example).
So we can talk about Mymin to be simple.
One note: Well you are saying but you dont prove nothing.
a) the array has 10 real4 numbers
b) Mymin starts with st(0)=MIN = element in ECX=0
c) It starts the loop with ECX=9
d) It starts comparing MIN with the element in [edx+ecx*4]
It means that we use the element in [edx+9*4] (=
the LAST)
e) It stops when ECX=0, it means "when ecx=0 doesnt loop"
So it is
evident,
obvious, that it uses ALL numbers in the array.
But if you have some doubts run it and print each ECX.
or try
array real4 2, 3, 4, -1 you should get MIN=-1
now try
array real4 2, 3, -1, 4 you should get MIN=-1
now try
array real4 2, -1, 3, 4 you should get MIN=-1
now try
array real4 -1, 2, 3, 4 you should get MIN=-1
4.
Quote
You need to insert "jnz L0 ret 8" in front of L0.
No
5.
Quote
I find FPU code hard to follow, I usually code it in C
It seems you have some problems in reading assembly. ;)
EDIT: see the debug file
Jochen,
Could you replace Myminmax in your ArrayMinMax_vs_FPU2
by this:
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
Myminmax proc p:dword, n:dword
mov ecx, [esp+8] ;n
mov edx, [esp+4] ;p
fld real4 ptr [edx] ; set st(1) to MAX value
fld st(0) ; set st(0) to MIN value
sub ecx, 1 ; points to the last value
L0:
fld real4 ptr [edx+ecx*4]
fcomi st, st(1) ; compare st(1)=MIN with st(0)
jae L1
fxch st(1)
fstp st
sub ecx, 1
jnz L0 ; if ecx>0 loop to L0
ret 8
L1: fcomi st, st(2) ; compare st(2)=MAX with st(0)
jbe L2
fxch st(2)
L2: fstp st
sub ecx, 1
jnz L0 ; if ecx>0 loop to L0
ret 8
Myminmax endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Rui,
You set the max/min to the first entry, decrement ecx, (ecx now 0) and then uselessly compare the first entry with min and find it is equal so you will uselessly compare the first entry with max and find it is equal, so you pop st(0) and decrement ecx again (ecx now -1, not zero), so you will loop back to L0 (fld real4 ptr [edx+ecx*4]) and will get a memory access fault somewhere along the way. It will not work correctly for an array size of 1.
Actually, the following would work quite well:
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
Myminmax proc p:dword, n:dword
mov ecx, [esp+8] ;n
mov edx, [esp+4] ;p
fld real4 ptr [edx] ; set st(1) to MAX value
fld st(0) ; set st(0) to MIN value
sub ecx, 1 ; points to the last value
jnz L0 ; not a single entry.
ret 8 ; st(0) and st(1) are set with min/max.
L0:
fld real4 ptr [edx+ecx*4]
fcomi st, st(1) ; compare st(1)=MIN with st(0)
jae L1
fxch st(1)
jmp L2
; fstp st ; this code is exactly duplicated in L2
; sub ecx, 1
; jnz L0 ; if ecx>0 loop to L0
; ret 8
L1: fcomi st, st(2) ; compare st(2)=MAX with st(0)
jbe L2
fxch st(2)
L2: fstp st
sub ecx, 1
jnz L0 ; if ecx>0 loop to L0
ret 8
Myminmax endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Dave
Hi Dave,
Oooops you are trying to rewrite it to work with an array of 1
element and then you follow with arguments one after another
ABOUT that singular case.
It is very very interesting ! Yes in 99.999 % of the cases
we define an array of 1 element and in that cases we are
VERY VERY interested the computer tell us what is the MIN and MAX
of ONE VALUE !
Without doubt, Dave!
Now i can tell you that the procedure you are showing us
doesnt work when you call
invoke Myminmax, addr array, len in the case len=0 or len=-1 ...
Yes, actually, the following would work quite well
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
Myminmax proc p:dword, n:dword
mov ecx, [esp+8] ;n
mov edx, [esp+4] ;p
fld real4 ptr [edx] ; set st(1) to MAX value
fld st(0) ; set st(0) to MIN value
;
sub ecx, 1 ; points to the last value
L0:
fld real4 ptr [edx+ecx*4]
fcomi st, st(1) ; compare st(1)=MIN with st(0)
jae L1
fxch st(1)
fstp st ; YES this code is exactly duplicated in L2 YES
sub ecx, 1
jnz L0 ; if ecx>0 loop to L0
ret 8
L1: fcomi st, st(2) ; compare st(2)=MAX with st(0)
jbe L2
fxch st(2)
L2: fstp st
sub ecx, 1
jnz L0 ; if ecx>0 loop to L0
ret 8
Myminmax endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
See the file i posted before
Assume the procedures we are writing
works for n>1
DEBUG values
The array is this:
array real4 -8.8, -3.9, 111.5, 0.5, 3.6, 1.2, 4.9, 9.9, -988.8, 0.0
2 Numbers
----------------------------------------
FPU Levels : 2
Conditional: ST > Source
Exception : e s p u o z d i
St(0) : -8.8 ««« ECX=9
St(1) : -8.8
2 Numbers
----------------------------------------
FPU Levels : 2
Conditional: ST < Source
Exception : e s P u o z d i
St(0) : -8.8 ««« ECX=8
St(1) : 0
2 Numbers
----------------------------------------
FPU Levels : 2
Conditional: ST < Source
Exception : e s P u o z d i
St(0) : -988.8 ««« ECX=7
St(1) : 0
2 Numbers
----------------------------------------
FPU Levels : 2
Conditional: ST < Source
Exception : e s P u o z d i
St(0) : -988.8 ««« ECX=6
St(1) : 9.9
2 Numbers
----------------------------------------
FPU Levels : 2
Conditional: ST < Source
Exception : e s P u o z d i
St(0) : -988.8 ««« ECX=5
St(1) : 9.9
2 Numbers
----------------------------------------
FPU Levels : 2
Conditional: ST < Source
Exception : e s P u o z d i
St(0) : -988.8 ««« ECX=4
St(1) : 9.9
2 Numbers
----------------------------------------
FPU Levels : 2
Conditional: ST < Source
Exception : e s P u o z d i
St(0) : -988.8 ««« ECX=3
St(1) : 9.9
2 Numbers
----------------------------------------
FPU Levels : 2
Conditional: ST < Source
Exception : e s P u o z d i
St(0) : -988.8 ««« ECX=2
St(1) : 9.9
2 Numbers
----------------------------------------
FPU Levels : 2
Conditional: ST < Source
Exception : e s P u o z d i
St(0) : -988.8 =MIN ««« ECX=1
St(1) : 111.5 =MAX
Rui,
I was not testing for an array size of 0 because there was no defined error return (an error could have been returned in eax) and since there was also no documented way to indicate that st(0) and st(1) were invalid upon return (loading st(0) and st(1) with fltmax and fltmin would also be deceptive since they would not be the correct max or min - there is no max or min of there is no array). I also did not check for an invalid or null pointer.
But, It should be perfectly valid to call this function with a valid pointer to an array for any size of the array, even for a single entry (max and min would be the same as the first entry in that case).
If you want to restrict this function to sizes > 1, then you should document the restriction.
My supplied fix works for all sizes > 0, and is shorter than you version. If you want to work for all sizes including 0, then you should check for 0 size and return an error in eax, otherwise set eax to the good return value and then scan the array (up to you to define these good/bad values in the documentation).
I am not trying to steal your code or take credit for it, I am just pointing out that your code, as posted and not documented otherwise, will fail for an array size of 1.
Dave.
Dave,
We are testing for speed and i wrote it
only to compare with fltmax and fltmin
procedures from VC toolkit
No need to check for null pointer,
generally it crashes.
Quote
If you want to restrict this function to sizes > 1,
then you should document the restriction.
Or you could ask me if it works for n=1.
Jochen and others developed their and
we dont know if it works for n=1 or not.
We are developing to see what it does
so it is not necessary.
Quote from: MichaelW on June 14, 2012, 01:23:11 AM
Another FPU solution, I think probably slow, with min and max as separate procedures, and since this was for graphics I guessed REAL4 instead of REAL8.
;==============================================================================
include \masm32\include\masm32rt.inc
.686
;==============================================================================
;-------------------------------------
; These from VC Toolkit 2003 float.h:
;-------------------------------------
FLT_MAX equ 3.402823466e+38
DBL_MAX equ 1.7976931348623158e+308
;==============================================================================
.data
array real4 -8.8, -3.9, 111.5, 0.5, 3.6, 1.2, 4.9, 9.9, -98.2, 0.0
r8 real8 ?
.code
;==============================================================================
fltmax proc p:dword, n:dword
local _max:real4
mov ecx, n
mov edx, p
fld4 -FLT_MAX
fstp _max
L0:
fld real4 ptr [edx+ecx*4]
fld _max
fcomip st, st(1)
ja L1
fst _max
L1:
fstp st
sub ecx, 1
jns L0
fld _max
ret
fltmax endp
;==============================================================================
fltmin proc p:dword, n:dword
local _min:real4
mov ecx, n
mov edx, p
fld4 FLT_MAX
fstp _min
L0:
fld real4 ptr [edx+ecx*4]
fld _min
fcomip st, st(1)
jb L1
fst _min
L1:
fstp st
sub ecx, 1
jns L0
fld _min
ret
fltmin endp
;==============================================================================
start:
;==============================================================================
invoke fltmin, addr array, lengthof array
fstp r8
printf("%.1f\n",r8)
invoke fltmax, addr array, lengthof array
fstp r8
printf("%.1f\n",r8)
inkey
exit
;==============================================================================
end start
MichaelW, Thanks a lot. This is cut the time converting the array from the real4 to real8.
Hi Michael, the mistake I can notice from your function is, if I had 5 array, I had to type 4 for the array count for it so it can be worked. I think that is the only bug I can notice.
(http://ompldr.org/vZXl0Yw/Test.PNG)
Have a look at the red line, it was telling us where is the lowest Vertex and the highest Vertex from an object.
I need this function to be functioning so I will be able to extract each edge and build a shdow volume from it.
Here is how I used it
fShadowProcessArrayX proc uses esi edi lpArray:dword,nVertexCount:dword
LOCAL x_arr,y_arr,z_arr:dword
LOCAL data_offset:dword
LOCAL memNeeded:dword
LOCAL MAX:VERTEX
LOCAL MIN:VERTEX
LOCAL DLT:VERTEX
LOCAL DLT2:VERTEX
LOCAL PVTPNT:VERTEX
LOCAL arrLen,arrOffs,lpResult:dword
LOCAL lgDP:Dword
LOCAL fLen:real4
mov ecx,nVertexCount
shl ecx,2
mov memNeeded,ecx
invoke mAlloc,memNeeded
mov x_arr,eax
invoke mAlloc,memNeeded
mov y_arr,eax
invoke mAlloc,memNeeded
mov z_arr,eax
mov ecx,nVertexCount
shl ecx,4
mov memNeeded,ecx
invoke mAlloc,memNeeded
mov lpResult,eax
mov esi,lpArray
xor ecx,ecx
mov data_offset,ecx
loop_extract:
push ecx
mov ecx,x_arr
add ecx,data_offset
mov eax,[esi].VERTEX.x
mov [ecx],eax
mov ecx,y_arr
add ecx,data_offset
mov eax,[esi].VERTEX.y
mov [ecx],eax
mov ecx,z_arr
add ecx,data_offset
mov eax,[esi].VERTEX.z
mov [ecx],eax
add esi,12
add data_offset,4
pop ecx
inc ecx
cmp ecx,nVertexCount
jl loop_extract
dec nVertexCount
invoke fltmax,x_arr,nVertexCount
fstp MAX.x
invoke fltmax,y_arr,nVertexCount
fstp MAX.y
invoke fltmax,z_arr,nVertexCount
fstp MAX.z
dec nVertexCount
invoke fltmin,x_arr,nVertexCount
fstp MIN.x
invoke fltmin,y_arr,nVertexCount
fstp MIN.y
invoke fltmin,z_arr,nVertexCount
fstp MIN.z
inc nVertexCount
invoke glColorMask,GL_TRUE,GL_TRUE,GL_TRUE,GL_TRUE
invoke glEnable,GL_COLOR_MATERIAL
invoke glDisable,GL_TEXTURE_2D
invoke glDisable,GL_STENCIL_TEST
invoke glColor4f,FP4(1.0f),FP4(0.0f),FP4(0.0f), FP4(1.)
invoke glBegin,GL_LINES
invoke glVertex3fv,addr MIN
invoke glVertex3fv,addr MAX
invoke glEnd
invoke glEnable,GL_STENCIL_TEST
invoke glColorMask,GL_FALSE,GL_FALSE,GL_FALSE,GL_FALSE
invoke GlobalFree,x_arr
invoke GlobalFree,y_arr
invoke GlobalFree,z_arr
invoke Vec_Sub,addr DLT,addr MAX,addr MIN
invoke Vec_Normalize,addr PVTPNT,addr DLT
invoke Vec_DotProduct,addr DLT,addr DLT
FDIV FP4(2.)
fstp fLen
invoke Vec_Scale,addr DLT,fLen
mov ecx,nVertexCount
shl ecx,2
invoke mAlloc,ecx
mov arrLen,eax
mov esi,lpArray
xor ecx,ecx
mov arrOffs,ecx
mov data_offset,ecx
loop_get_len:
push ecx
invoke Vec_Sub,addr DLT2,esi,addr DLT
invoke Vec_DotProduct,addr DLT2,addr DLT2
mov ecx,arrLen
add ecx,data_offset
fsqrt
fstp dword ptr[ecx]
add esi,12
add data_offset,4
pop ecx
inc ecx
cmp ecx,nVertexCount
jl loop_get_len
dec nVertexCount
invoke fltmax,arrLen,nVertexCount
fstp lgDP
mov edi,arrLen
loop_get_longest:
push ecx
FCMP dword ptr[edi],lgDP
jz done_loop
add edi,4
pop ecx
inc ecx
cmp ecx,nVertexCount
jl loop_get_longest
done_loop:
invoke GlobalFree,arrLen
ret
fShadowProcessArrayX endp
Thanks for your time,
Onan Farabi