;-----------------------------------------------------------------------------------------------
; fsmVecSub - packed single-precision subtract: [lpVDest] = [lpVA] - [lpVB]
; In:    lpVDest = address that receives the 16-byte result (4 x REAL4)
;        lpVA    = address of the 16-byte minuend (4 x REAL4)
;        lpVB    = address of the 16-byte subtrahend (4 x REAL4)
; Out:   eax = lpVDest
; Clobb: eax, xmm0, xmm1
; Notes: movups is used for every memory access, so no pointer needs 16-byte alignment.
;        Both operands are loaded before the store, so lpVDest may alias lpVA or lpVB.
;        All addressing goes through eax, so no register has to be saved and restored.
;-----------------------------------------------------------------------------------------------
fsmVecSub proc lpVDest:dword,lpVA:dword,lpVB:dword
    mov     eax,lpVA
    movups  xmm0,[eax]              ; xmm0 = A
    mov     eax,lpVB
    movups  xmm1,[eax]              ; xmm1 = B
    mov     eax,lpVDest             ; loaded last so eax still returns lpVDest
    subps   xmm0,xmm1               ; xmm0 = A - B, lane by lane
    movups  [eax],xmm0
    ret
fsmVecSub endp
;-----------------------------------------------------------------------------------------------
; fsmVecAdd - packed single-precision add: [lpVDest] = [lpVA] + [lpVB]
; In:    lpVDest = address that receives the 16-byte result (4 x REAL4)
;        lpVA    = address of the first 16-byte operand (4 x REAL4)
;        lpVB    = address of the second 16-byte operand (4 x REAL4)
; Out:   eax = lpVDest
; Clobb: eax, xmm0, xmm1
; Notes: movups is used for every memory access, so no pointer needs 16-byte alignment.
;        Both operands are loaded before the store, so lpVDest may alias lpVA or lpVB.
;        All addressing goes through eax, so no register has to be saved and restored.
;-----------------------------------------------------------------------------------------------
fsmVecAdd proc lpVDest:dword,lpVA:dword,lpVB:dword
    mov     eax,lpVA
    movups  xmm0,[eax]              ; xmm0 = A
    mov     eax,lpVB
    movups  xmm1,[eax]              ; xmm1 = B
    mov     eax,lpVDest             ; loaded last so eax still returns lpVDest
    addps   xmm0,xmm1               ; xmm0 = A + B, lane by lane
    movups  [eax],xmm0
    ret
fsmVecAdd endp
;-----------------------------------------------------------------------------------------------
; fsmVecMul - packed single-precision element-wise multiply: [lpVDest] = [lpVA] * [lpVB]
; In:    lpVDest = address that receives the 16-byte result (4 x REAL4)
;        lpVA    = address of the first 16-byte operand (4 x REAL4)
;        lpVB    = address of the second 16-byte operand (4 x REAL4)
; Out:   eax = lpVDest
; Clobb: eax, xmm0, xmm1
; Notes: movups is used for every memory access, so no pointer needs 16-byte alignment.
;        Both operands are loaded before the store, so lpVDest may alias lpVA or lpVB.
;        All addressing goes through eax, so no register has to be saved and restored.
;-----------------------------------------------------------------------------------------------
fsmVecMul proc lpVDest:dword,lpVA:dword,lpVB:dword
    mov     eax,lpVA
    movups  xmm0,[eax]              ; xmm0 = A
    mov     eax,lpVB
    movups  xmm1,[eax]              ; xmm1 = B
    mov     eax,lpVDest             ; loaded last so eax still returns lpVDest
    mulps   xmm0,xmm1               ; xmm0 = A * B, lane by lane (not a dot or cross product)
    movups  [eax],xmm0
    ret
fsmVecMul endp
I'm sorry for wasting your time, but can anyone time this function for me? It seems I messed up my timer code, and it always yields a result of -1. I want to know how many microseconds it takes for 100 million loops. It should be about half a second.
As always, you can get Michael Webster's timers.asm from the first thread of the Laboratory.
;###############################################################################################
; Timing harness template: assembler setup, includes and the loop-count constant used by _main.
;###############################################################################################
; Cross-reference and listing output are switched off while the includes are processed,
; then the listing is re-enabled with .List below.
.XCREF
.NoList
INCLUDE \Masm32\Include\Masm32rt.inc
; Enable the 686 (privileged), MMX and SSE instruction sets for the code that follows.
.686p
.MMX
.XMM
; Timers.asm supplies the counter_begin / counter_end macros used in _main.
INCLUDE \Masm32\Macros\Timers.asm
.List
;###############################################################################################
Loop_Count = 10000 ;adjust the loop count so that each pass takes about 0.5 seconds
;###############################################################################################
; No initialized data is needed by the template itself.
.DATA
;***********************************************************************************************
; No uninitialized data is needed by the template itself.
.DATA?
;###############################################################################################
.CODE
;***********************************************************************************************
; _main - program entry point (named by the END directive below).
; Runs the timed section five times, printing one result per pass separated by spaces,
; then prints a newline, waits via inkey and exits with code 0.
_main PROC
; Restrict the process to the processor selected by affinity mask 1.
INVOKE GetCurrentProcess
INVOKE SetProcessAffinityMask,eax,1
; Pause 750 ms before measuring.
INVOKE Sleep,750
; ecx = number of timing passes still to run.
mov ecx,5
; The pass counter is kept on the stack for the duration of each pass
; (presumably because the macros below do not preserve ecx - confirm against Timers.asm).
Loop00: push ecx
; Start of the timed region: Loop_Count and HIGH_PRIORITY_CLASS are handed to the macro.
counter_begin Loop_Count,HIGH_PRIORITY_CLASS
;put your code to be timed here
; End of the timed region; the value printed next is taken from eax.
counter_end
; Print eax as a decimal string followed by a space (character code 32).
print str$(eax),32
pop ecx
dec ecx
jnz Loop00
; Finish the output line with CR/LF.
print chr$(13,10)
inkey
INVOKE ExitProcess,0
_main ENDP
;###############################################################################################
END _main
I did, but each time I used mul eax, the timer failed and returned -1.
Based on my timing, it took half a second for 100 million loops. Multiply that by 8 and it reaches almost 1 GFLOPS. Not bad. Making a 3D software raytracer should be possible.