; Software breakpoint: stops in the attached debugger right before the call below.
int 3
; Prints the value returned by gsl_stats_variance(ecx, 1, ebx) using the "%5f" format.
; NOTE(review): presumably ecx = pointer to the data, 1 = stride, ebx = element count - confirm against the GSL prototype.
Print Str$("\ns_variance= \t %5f", gsl_stats_variance(ecx, 1, ebx))
Address Hex dump Command Comments
630E7AB0 Ú> À55 push ebp
630E7AB1 ³. 8BEC mov ebp, esp
...
630E7ACC ³. F20F1045 F8 movsd xmm0, [ebp-8] ; <<<< SIMD #######
630E7AD1 ³. F20F110424 movsd [esp], xmm0 ; ÚArg4_5
630E7AD6 ³. 53 push ebx ; ³Arg3 => [ARG.3]
630E7AD7 ³. FF75 0C push dword ptr [ebp+0C] ; ³Arg2 => [ARG.2]
630E7ADA ³. FF75 08 push dword ptr [ebp+8] ; ³Arg1 => [ARG.1]
630E7ADD ³. E8 AECAFFFF call 630E4590 ; Àgsl.630E4590
630E7AE2 ³. 660F6ECB movd xmm1, ebx ; <<<< SIMD #######
630E7AE6 ³. 8BC3 mov eax, ebx
630E7AE8 ³. F30FE6C9 cvtdq2pd xmm1, xmm1 ; <<<< SIMD #######
630E7AEC ³. C1E8 1F shr eax, 1F
630E7AEF ³. 83C4 14 add esp, 14
630E7AF2 ³. F20F580CC5 00B91263 addsd xmm1, [eax*8+6312B900]
630E7AFB ³. 8D43 FF lea eax, [ebx-1]
630E7AFE ³. 660F6EC0 movd xmm0, eax
630E7B02 ³. F30FE6C0 cvtdq2pd xmm0, xmm0 ; <<<< SIMD #######
630E7B06 ³. C1E8 1F shr eax, 1F
630E7B09 ³. 5B pop ebx
630E7B0A ³. F20F5804C5 00B91263 addsd xmm0, [eax*8+6312B900]
630E7B13 ³. F20F5EC8 divsd xmm1, xmm0 ; <<<< SIMD #######
630E7B17 ³. F20F114D F8 movsd [ebp-8], xmm1 ; <<<< SIMD #######
630E7B1C ³. DC4D F8 fmul qword ptr [ebp-8]
630E7B1F ³. 8BE5 mov esp, ebp
630E7B21 ³. 5D pop ebp
630E7B22 À. C3 retn
Quote: But if you want speed, GSL is the best choice: http://www.gnu.org/software/gsl/
Vc-master\attic\sse\casts.h Vc-master\attic\sse\const.h Vc-master\attic\sse\const_data.h Vc-master\attic\sse\debug.h Vc-master\attic\sse\deinterleave.tcc Vc-master\attic\sse\detail.h Vc-master\attic\sse\helperimpl.h Vc-master\attic\sse\intrinsics.h Vc-master\attic\sse\limits.h Vc-master\attic\sse\macros.h Vc-master\attic\sse\mask.h Vc-master\attic\sse\mask.tcc Vc-master\attic\sse\math.h Vc-master\attic\sse\prefetches.tcc Vc-master\attic\sse\shuffle.h Vc-master\attic\sse\simd_cast.h Vc-master\attic\sse\simd_cast_caller.tcc Vc-master\attic\sse\type_traits.h Vc-master\attic\sse\types.h Vc-master\attic\sse\vector.h Vc-master\attic\sse\vector.tcc Vc-master\attic\sse\vectorhelper.h Vc-master\attic\sse\vectorhelper.tcc |
Quote from: hutch-- on November 19, 2017, 05:04:52 PM
I wonder how hard it would be to make a 64 bit SSE or later library to perform a range of common maths tasks ? I know you can do it up to 80 bit FP using the FP registers and mnemonics but I have done little work with maths in SSE or later and the aim with such an idea is to take advantage of the extra speed of SSE and later as against the older FP registers and mnemonics.
.code
;-----------------------------------------------------------------------
; XMScalarCos - scalar single-precision cosine approximation.
; Posted as the MASM equivalent of the DirectX XMScalarCos.
;
; In:    xmm0 (low REAL4) = input angle
; Out:   xmm0 (low REAL4) = sign * polynomial in y^2 (see below)
; Uses:  eax, xmm1, xmm2, xmm3, flags
;
; The _XM_* and _Constant?_XMScalarCos operands are REAL4 memory
; constants defined outside this procedure.
; NOTE(review): their values are assumed to be what the names suggest
; (1/(2*pi), 0.5, 2*pi, pi/2, ...) - confirm against the definitions.
; ABOVEEQUAL?, ABOVE? and BELOW? are not built-in MASM flag names; they
; are assumed to be defined elsewhere - confirm.
;-----------------------------------------------------------------------
XMScalarCos proc public
        ; --- Step 1: remove whole turns -------------------------------
        movaps  xmm2, xmm0                      ; xmm2 = working copy for the quotient
        movaps  xmm1, xmm0                      ; xmm1 = input, becomes y below
        mulss   xmm2, _XM_1DIV2PI               ; xmm2 = input * _XM_1DIV2PI

        ; Push the quotient half a unit away from zero so that the
        ; truncating conversion below rounds it to the nearest integer.
        comiss  xmm0, _XM_REAL4ZERO
        .if ABOVEEQUAL?
                addss   xmm2, _XM_REAL4HALF
        .else
                subss   xmm2, _XM_REAL4HALF
        .endif
        cvttss2si eax, xmm2                     ; eax  = quotient truncated to integer
        cvtsi2ss xmm2, eax                      ; xmm2 = that integer as REAL4

        mulss   xmm2, _XM_2PI
        subss   xmm1, xmm2                      ; xmm1 = y = input - quotient * _XM_2PI

        ; --- Step 2: fold y and pick the sign -------------------------
        ; On exit: xmm1 = folded y, xmm2 = sign factor.
        comiss  xmm1, _XM_PIDIV2
        .if ABOVE?
                movss   xmm2, _XM_PI
                subss   xmm2, xmm1
                movss   xmm1, xmm2              ; y = _XM_PI - y
                movss   xmm2, _XM_MINUSONE
        .else
                comiss  xmm1, _XM_MINUSPIDIV2
                .if BELOW?
                        movss   xmm2, _XM_MINUSPI
                        subss   xmm2, xmm1
                        movss   xmm1, xmm2      ; y = _XM_MINUSPI - y
                        movss   xmm2, _XM_MINUSONE
                .else
                        movss   xmm2, _XM_PLUSONE
                .endif
        .endif

        ; --- Step 3: polynomial in y^2, evaluated Horner-style --------
        ; p = ((((C1*y2 + C2)*y2 - C3)*y2 + C4)*y2 - HALF)*y2 + PLUSONE
        mulss   xmm1, xmm1                      ; xmm1 = y2 = y * y
        movss   xmm3, _Constant1_XMScalarCos
        mulss   xmm3, xmm1
        addss   xmm3, _Constant2_XMScalarCos
        mulss   xmm3, xmm1
        subss   xmm3, _Constant3_XMScalarCos
        mulss   xmm3, xmm1
        addss   xmm3, _Constant4_XMScalarCos
        mulss   xmm3, xmm1
        subss   xmm3, _XM_REAL4HALF
        mulss   xmm3, xmm1
        addss   xmm3, _XM_PLUSONE               ; xmm3 = p

        ; --- Step 4: apply the sign and return in xmm0 ----------------
        mulss   xmm2, xmm3
        movss   xmm0, xmm2
        ret
XMScalarCos endp
Quote from: aw27 on November 19, 2017, 08:25:09 PM: As an example, the MASM equivalent of the DirectX XMScalarCos (64-bit).
Quote from: aw27 on November 18, 2017, 12:35:00 AM: When I post examples or code it is always complete and ready to be built ::)
Quote from: MSDN: DirectXMath supports vectors of four single-precision floating-point or four 32-bit (signed or unsigned) values.
Quote: It is attached.
Apparently I have forgotten how to compile C#. Would it be possible for you to upload testApp.exe?
Quote: I have also done all the collision functions (BoundingBox, BoundingFrustum, BoundingOrientedBox, BoundingSphere), which are not part of DirectXMath proper, and have tested them with a modification of the Collision sample from the DirectX SDK. (You can download the attachment in the next message because it goes over 512 KB here.) It outperforms the original.
only a little limited if follow DirectXMath
Quote from: HSE on November 28, 2017, 01:22:16 AM: I know there is a problem on some systems, but it is an issue of the Microsoft sample itself, which may have been fixed in the meantime in the GitHub repository. I really have no clue.
Advanced graphics are beyond my scope, but I see the collision idea. Text (buttons and listbox) is not visible on the computer where I ran the program.