;-----------------------------------------------------------------------
; fMat4fMulMatrixWithVertex - transform a vertex in place (x87 FPU)
;
; For each of the first three matrix rows (rows are 4*4 = 16 bytes
; apart) the dot product of the row's x/y/z entries with the vertex's
; x/y/z is computed:
;
;     v'.x = row0.x*v.x + row0.y*v.y + row0.z*v.z
;     v'.y = row1.x*v.x + row1.y*v.y + row1.z*v.z
;     v'.z = row2.x*v.x + row2.y*v.y + row2.z*v.z
;
; Only these three products per row are summed; no fourth term is added.
; Matrix rows are addressed through the VERTEX field offsets
; (assumes VERTEX.x/y/z are the leading REAL4 fields - TODO confirm
; against the VERTEX declaration).
;
; In:    lpMatrix = address of the matrix (read only)
;        lpVertex = address of the VERTEX, read and then overwritten
; Out:   [lpVertex].x/.y/.z replaced by the transformed values
; Saved: esi, edi (via USES)
; FPU:   needs four free x87 stack slots on entry; the stack depth is
;        the same on exit as on entry.
;
; The three sums are kept on the x87 stack until all of them are
; complete, because the vertex is both input and output. Partial sums
; are no longer written to and re-read from stack locals, so they are
; rounded only by the final stores to the vertex.
;-----------------------------------------------------------------------
fMat4fMulMatrixWithVertex proc uses esi edi lpMatrix:dword,lpVertex:dword
    mov     esi,lpMatrix                ; esi -> current matrix row
    mov     edi,lpVertex                ; edi -> vertex (input and output)

    ; ---- row 0 . v  -> new x ----
    fld     [esi].VERTEX.x
    fmul    [edi].VERTEX.x
    fld     [esi].VERTEX.y
    fmul    [edi].VERTEX.y
    faddp   st(1),st(0)
    fld     [esi].VERTEX.z
    fmul    [edi].VERTEX.z
    faddp   st(1),st(0)                 ; st0 = x'

    add     esi,4*4                     ; next row: 4 entries * 4 bytes

    ; ---- row 1 . v  -> new y ----
    fld     [esi].VERTEX.x
    fmul    [edi].VERTEX.x
    fld     [esi].VERTEX.y
    fmul    [edi].VERTEX.y
    faddp   st(1),st(0)
    fld     [esi].VERTEX.z
    fmul    [edi].VERTEX.z
    faddp   st(1),st(0)                 ; st0 = y', st1 = x'

    add     esi,4*4                     ; next row

    ; ---- row 2 . v  -> new z ----
    fld     [esi].VERTEX.x
    fmul    [edi].VERTEX.x
    fld     [esi].VERTEX.y
    fmul    [edi].VERTEX.y
    faddp   st(1),st(0)
    fld     [esi].VERTEX.z
    fmul    [edi].VERTEX.z
    faddp   st(1),st(0)                 ; st0 = z', st1 = y', st2 = x'

    ; All reads of the old vertex are done; pop the results into it.
    fstp    [edi].VERTEX.z
    fstp    [edi].VERTEX.y
    fstp    [edi].VERTEX.x
    ret
fMat4fMulMatrixWithVertex endp
This multiplies your matrix by a vertex; this is the CPU version. I have not tested the speed, but the GPU version has enough latency to become slow if it is used too many times. With this version you can call it as often as you want without worrying about overusing it. For an individual operation the GPU version is faster, but this CPU version is intended to avoid the GPU latency caused by the limitation on reading data back from video card memory on old cards.
I think you could gain a little speed by replacing
fldz
fstp rsltXYZ
with a simple
mov rsltXYZ,0
Quote from: Jean-Marie on January 06, 2014, 03:56:30 AM
I think you could gain a few speed by replacing
fldz
fstp rsltXYZ
with a simple
mov rsltXYZ,0
that's for sure.
Farabi,
on the other hand, your Vertex is similar to a normal vector. Why not use SIMD?
Gunther
Quote from: Jean-Marie on January 06, 2014, 03:56:30 AM
I think you could gain a few speed by replacing
fldz
fstp rsltXYZ
with a simple
mov rsltXYZ,0
Thanks. I'll note it for the next source code release and try to optimize it.
Quote from: Gunther on January 06, 2014, 04:44:31 AM
Quote from: Jean-Marie on January 06, 2014, 03:56:30 AM
I think you could gain a few speed by replacing
fldz
fstp rsltXYZ
with a simple
mov rsltXYZ,0
that's for sure.
Farabi,
on the other hand, your Vertex is similar to a normal vector. Why not using SIMD?
Gunther
I'll try to make the SIMD version after I have finished backing up my data to my new system. I just bought a new laptop.
Farabi,
Quote from: Farabi on January 06, 2014, 08:06:10 PM
I'll try to make the SIMD version after I done the backup data to my new system. I just bough a new laptop.
No hurry. Take it easy.
Gunther
; Excerpt: first row only. Computes row.x*v.x + row.y*v.y + row.z*v.z,
; accumulating the running sum in the local rsltX.
mov esi,lpMatrix                ; esi = matrix pointer
mov edi,lpVertex                ; edi = vertex pointer
; fldz
; fstp rsltX
mov rsltX,0                     ; integer store of zero in place of fldz/fstp
                                ; NOTE(review): rsltX is overwritten by the first
                                ; fstp below before it is ever read, so this store
                                ; has no effect on the result
fld [esi].VERTEX.x
fmul [edi].VERTEX.x             ; st0 = row.x * v.x
fstp rsltX                      ; rsltX = first product
fld [esi].VERTEX.y
fmul [edi].VERTEX.y             ; st0 = row.y * v.y
fadd rsltX                      ; add the running sum from memory
fstp rsltX
fld [esi].VERTEX.z
fmul [edi].VERTEX.z             ; st0 = row.z * v.z
fadd rsltX
fstp rsltX                      ; rsltX = complete three-term sum
this code can be changed to
; movq xmm0,[esi]
; movq xmm1,[edi]
; mulps xmm0,xmm1
But how do I add all the elements of xmm0 in a single instruction?
Farabi,
you'll need a horizontal addition like haddps.
Gunther
Quote from: Gunther on January 12, 2014, 09:35:02 PM
Farabi,
you'll need a horizontal addittion like haddps.
Gunther
Never heard of it. It's SSE3; I'm not sure my laptop supports it.
Hi Farabi,
for which type of values do you need the horizontal addition? Here is one for float. We assume that your result is in XMM0 and you would like to do a horizontal addition:
; Horizontal sum of the four floats e0..e3 held in xmm0.
; The total ends up in the low dword of xmm0; xmm1 is used as scratch.
; The upper lanes of xmm0 do not hold a meaningful value afterwards.
movhlps xmm1, xmm0 ; low 64 bits of xmm1 = bits 64 - 127 of xmm0 (e2, e3)
addps xmm0, xmm1 ; low two dwords of xmm0 = e0+e2, e1+e3
pshufd xmm1, xmm0, 01h ; low dword of xmm1 = bits 32 - 63 of xmm0 (e1+e3); pshufd is an SSE2 instruction
addss xmm0, xmm1 ; low dword of xmm0 = (e0+e2)+(e1+e3) = function result
Gunther
assuming that the matrix is saved row-wise, the following SSE1 solution (REAL4/float) should do the job:
; 4x4 matrix of REAL4 stored row after row:
;   a11,a12,a13,a14, a21,a22,a23,a24, a31,...,a44
; Field aRC is the entry in row R, column C.
M4x4 struct
    cntr_row = 1
    WHILE cntr_row LE 4                 ; rows 1..4
        cntr_clmn = 1
        WHILE cntr_clmn LE 4            ; columns 1..4 of this row
            @CatStr(<a>,%cntr_row,%cntr_clmn) REAL4 ?
            cntr_clmn = cntr_clmn + 1
        ENDM
        cntr_row = cntr_row + 1
    ENDM
M4x4 ends
; Four-component REAL4 vector; members in memory order x, y, z, w.
V4 struct
    x   REAL4 ?
    y   REAL4 ?
    z   REAL4 ?
    w   REAL4 ?
V4 ends
.code
;-----------------------------------------------------------------------
; mul_V4T_M4x4u - row vector times 4x4 matrix (SSE1, REAL4)
;                 variant for a matrix that is NOT 16-byte aligned
;
;   result.j = x*a1j + y*a2j + z*a3j + w*a4j        (j = 1..4)
;
; The matrix is read as four consecutive 16-byte rows.
;
; In:    pResult -> V4 that receives the product
;        pV4     -> V4 input vector (x, y, z, w)
;        pM4x4   -> matrix, any alignment (movups loads)
; Clobb: eax, ecx, edx, xmm0-xmm7
; Note:  all loads happen before the single store to [pResult].
;-----------------------------------------------------------------------
mul_V4T_M4x4u proc pResult: ptr V4, pV4: ptr V4, pM4x4: ptr M4x4
    mov     edx, pM4x4
    mov     ecx, pV4
    mov     eax, pResult

    ; matrix rows -> xmm4..xmm7 (unaligned loads)
    movups  xmm4, OWORD ptr [edx + 0*16]    ; a11 a12 a13 a14
    movups  xmm5, OWORD ptr [edx + 1*16]    ; a21 a22 a23 a24
    movups  xmm6, OWORD ptr [edx + 2*16]    ; a31 a32 a33 a34
    movups  xmm7, OWORD ptr [edx + 3*16]    ; a41 a42 a43 a44

    ; vector -> xmm0, then one broadcast copy per component
    movups  xmm0, OWORD ptr [ecx]           ; x y z w
    movaps  xmm3, xmm0
    movaps  xmm2, xmm0
    movaps  xmm1, xmm0
    shufps  xmm3, xmm3, 0FFh                ; w w w w
    shufps  xmm2, xmm2, 0AAh                ; z z z z
    shufps  xmm1, xmm1, 055h                ; y y y y
    shufps  xmm0, xmm0, 000h                ; x x x x

    ; scale each row by its vector component
    mulps   xmm0, xmm4                      ; x * row 1
    mulps   xmm1, xmm5                      ; y * row 2
    mulps   xmm2, xmm6                      ; z * row 3
    mulps   xmm3, xmm7                      ; w * row 4

    ; sum the scaled rows: ((r1 + r2) + r3) + r4
    addps   xmm0, xmm1
    addps   xmm0, xmm2
    addps   xmm0, xmm3

    movups  OWORD ptr [eax], xmm0
    ret
mul_V4T_M4x4u endp
;-----------------------------------------------------------------------
; mul_V4T_M4x4a - row vector times 4x4 matrix (SSE1, REAL4)
;                 variant for a 16-byte ALIGNED matrix
;
;   result.j = x*a1j + y*a2j + z*a3j + w*a4j        (j = 1..4)
;
; The matrix rows are used directly as memory operands of mulps, which
; requires pM4x4 to be 16-byte aligned.
;
; In:    pResult -> V4 that receives the product (movups store)
;        pV4     -> V4 input vector (movups load)
;        pM4x4   -> matrix, four consecutive 16-byte rows, aligned 16
; Clobb: eax, ecx, edx, xmm0-xmm3
;-----------------------------------------------------------------------
mul_V4T_M4x4a proc pResult: ptr V4, pV4: ptr V4, pM4x4: ptr M4x4
    mov     edx, pM4x4
    mov     ecx, pV4
    mov     eax, pResult

    ; movaps could be used for this load if the vector were 16-byte aligned
    movups  xmm0, OWORD ptr [ecx]           ; x y z w
    movaps  xmm3, xmm0
    movaps  xmm2, xmm0
    movaps  xmm1, xmm0

    ; broadcast one component, then scale the matching matrix row
    shufps  xmm3, xmm3, 0FFh                ; w w w w
    shufps  xmm2, xmm2, 0AAh                ; z z z z
    shufps  xmm1, xmm1, 055h                ; y y y y
    shufps  xmm0, xmm0, 000h                ; x x x x
    mulps   xmm0, OWORD ptr [edx + 0*16]    ; x * row 1
    mulps   xmm1, OWORD ptr [edx + 1*16]    ; y * row 2
    mulps   xmm2, OWORD ptr [edx + 2*16]    ; z * row 3
    mulps   xmm3, OWORD ptr [edx + 3*16]    ; w * row 4

    ; sum the scaled rows: ((r1 + r2) + r3) + r4
    addps   xmm0, xmm1
    addps   xmm0, xmm2
    addps   xmm0, xmm3

    ; movaps could be used for this store if the result were 16-byte aligned
    movups  OWORD ptr [eax], xmm0
    ret
mul_V4T_M4x4a endp
EDIT: for the case that the matrix is saved column-wise:
; 4x4 matrix of REAL4 stored column after column:
;   a11,a21,a31,a41, a12,a22,a32,a42, a13,...,a44
; Field aRC is the entry in row R, column C.
M4x4 struct
    cntr_clmn = 1
    WHILE cntr_clmn LE 4                ; columns 1..4
        cntr_row = 1
        WHILE cntr_row LE 4             ; rows 1..4 of this column
            @CatStr(<a>,%cntr_row,%cntr_clmn) REAL4 ?
            cntr_row = cntr_row + 1
        ENDM
        cntr_clmn = cntr_clmn + 1
    ENDM
M4x4 ends
; Four-component REAL4 vector; members in memory order x, y, z, w.
V4 struct
    x   REAL4 ?
    y   REAL4 ?
    z   REAL4 ?
    w   REAL4 ?
V4 ends
;-----------------------------------------------------------------------
; mul_V4T_M4x4 - row vector times 4x4 matrix (SSE1, REAL4)
;                variant for a matrix stored column after column
;
;   result.j = x*a1j + y*a2j + z*a3j + w*a4j        (j = 1..4)
;
; Each 16-byte column is multiplied element-wise by the vector, the four
; product vectors are transposed, and the transposed vectors are added
; so that lane j of the result holds the sum over column j.
;
; In:    pResult -> V4 that receives the product
;        pV4     -> V4 input vector (x, y, z, w)
;        pM4x4   -> matrix, four consecutive 16-byte columns
; Clobb: eax, ecx, edx, xmm0-xmm5
; Note:  all memory accesses use movups; movaps could be substituted
;        where the data is known to be 16-byte aligned.
;-----------------------------------------------------------------------
mul_V4T_M4x4 proc pResult: ptr V4, pV4: ptr V4, pM4x4: ptr M4x4
    mov     edx, pM4x4
    mov     ecx, pV4
    mov     eax, pResult

    movups  xmm4, OWORD ptr [ecx]           ; x y z w

    ; element-wise products, one register per matrix column
    movups  xmm0, OWORD ptr [edx + 0*16]
    mulps   xmm0, xmm4                      ; a11*x a21*y a31*z a41*w
    movups  xmm1, OWORD ptr [edx + 1*16]
    mulps   xmm1, xmm4                      ; a12*x a22*y a32*z a42*w
    movups  xmm2, OWORD ptr [edx + 2*16]
    mulps   xmm2, xmm4                      ; a13*x a23*y a33*z a43*w
    movups  xmm3, OWORD ptr [edx + 3*16]
    mulps   xmm3, xmm4                      ; a14*x a24*y a34*z a44*w

    ; transpose, step 1: interleave columns 1/2 and columns 3/4
    movaps  xmm4, xmm0
    unpcklps xmm4, xmm1                     ; a11*x a12*x a21*y a22*y
    unpckhps xmm0, xmm1                     ; a31*z a32*z a41*w a42*w
    movaps  xmm5, xmm2
    unpcklps xmm5, xmm3                     ; a13*x a14*x a23*y a24*y
    unpckhps xmm2, xmm3                     ; a33*z a34*z a43*w a44*w

    ; transpose, step 2 (x and y terms), then add them
    movaps  xmm1, xmm4
    movlhps xmm4, xmm5                      ; a11*x a12*x a13*x a14*x  (same as shufps 44h)
    shufps  xmm1, xmm5, 0EEh                ; a21*y a22*y a23*y a24*y
    addps   xmm1, xmm4                      ; x terms + y terms

    ; transpose, step 2 (z and w terms), then add them
    movaps  xmm3, xmm0
    movlhps xmm3, xmm2                      ; a31*z a32*z a33*z a34*z  (same as shufps 44h)
    shufps  xmm0, xmm2, 0EEh                ; a41*w a42*w a43*w a44*w
    addps   xmm0, xmm3                      ; z terms + w terms

    addps   xmm0, xmm1                      ; complete sums, one per column

    movups  OWORD ptr [eax], xmm0
    ret
mul_V4T_M4x4 endp
Thanks qWord and Gunther; your knowledge is well beyond mine.
I want to show off my rendering engine, which uses all of these techniques, to you. https://drive.google.com/file/d/0B4e67L-fbWVQRHZldUJjXy1UWjA/edit?usp=sharing I've cut all the dependencies and reduced the package size from 20 MB to 2 MB, and the application from 600 KB to 50 KB. The application has a dynamic shadowing system, motion capture using the BVH format, and some basic 3D math.