Multiply a vertex with a matrix

Farabi · January 05, 2014, 04:10:27 PM


; ---------------------------------------------------------------------------
; fMat4fMulMatrixWithVertex
;
; Transforms a vertex in place by the 3x3 part of a matrix:
;
;     x' = m0.x*v.x + m0.y*v.y + m0.z*v.z
;     y' = m1.x*v.x + m1.y*v.y + m1.z*v.z
;     z' = m2.x*v.x + m2.y*v.y + m2.z*v.z
;
; where m0, m1, m2 are the groups of values found at byte offsets 0, 16 and
; 32 from lpMatrix, each read through the VERTEX field layout.
;
;   lpMatrix : address of the matrix. Only the x/y/z fields of the first
;              three 16-byte groups are read; any fourth element per group
;              and the data at offset 48 are ignored.
;   lpVertex : address of the VERTEX to transform; its x, y and z fields are
;              overwritten with the result.
;
; esi and edi are preserved (uses clause). The FPU stack is balanced on
; exit; two free FPU registers are required on entry.
;
; NOTE(review): assumes VERTEX.x/.y/.z are REAL4 fields -- confirm against
; the VERTEX definition, which is not part of this listing.
;
; The running sum of each row stays on the FPU stack and is rounded to the
; 4-byte local only once, so no zero-initialisation of the locals is needed
; and intermediate sums are not truncated to single precision.
; ---------------------------------------------------------------------------
fMat4fMulMatrixWithVertex proc uses esi edi lpMatrix:dword,lpVertex:dword
	LOCAL rsltX:REAL4, rsltY:REAL4, rsltZ:REAL4

	mov esi,lpMatrix
	mov edi,lpVertex

	; The results are buffered in locals because every row needs the
	; original x, y and z of the vertex, which is updated in place.

	; ---- x' = m0 . v ----
	fld [esi].VERTEX.x
	fmul [edi].VERTEX.x
	fld [esi].VERTEX.y
	fmul [edi].VERTEX.y
	faddp st(1),st(0)
	fld [esi].VERTEX.z
	fmul [edi].VERTEX.z
	faddp st(1),st(0)
	fstp rsltX

	; ---- y' = m1 . v ----
	add esi,4*4			; advance 16 bytes to the next group
	fld [esi].VERTEX.x
	fmul [edi].VERTEX.x
	fld [esi].VERTEX.y
	fmul [edi].VERTEX.y
	faddp st(1),st(0)
	fld [esi].VERTEX.z
	fmul [edi].VERTEX.z
	faddp st(1),st(0)
	fstp rsltY

	; ---- z' = m2 . v ----
	add esi,4*4			; advance 16 bytes to the next group
	fld [esi].VERTEX.x
	fmul [edi].VERTEX.x
	fld [esi].VERTEX.y
	fmul [edi].VERTEX.y
	faddp st(1),st(0)
	fld [esi].VERTEX.z
	fmul [edi].VERTEX.z
	faddp st(1),st(0)
	fstp rsltZ

	; ---- write the transformed vertex back ----
	fld rsltX
	fstp [edi].VERTEX.x
	fld rsltY
	fstp [edi].VERTEX.y
	fld rsltZ
	fstp [edi].VERTEX.z

	ret
fMat4fMulMatrixWithVertex endp

This multiplies your matrix with a vertex; this is the CPU version. I have not tested the speed, but the GPU version has enough latency to become slow if it is used too many times, whereas this one can be called as often as you want without worrying about overusing it. For an individual operation the GPU version is faster, but this CPU version is intended to avoid the GPU latency caused by the limitation on reading data back from video card memory on old cards.

Jean-Marie · January 06, 2014, 03:56:30 AM

I think you could gain a little speed by replacing
fldz
fstp rsltXYZ

with a simple
mov rsltXYZ,0

Gunther · January 06, 2014, 04:44:31 AM

Quote from: Jean-Marie on January 06, 2014, 03:56:30 AM
I think you could gain a little speed by replacing
fldz
fstp rsltXYZ

with a simple
mov rsltXYZ,0

that's for sure.

Farabi,

on the other hand, your vertex is similar to a normal vector. Why not use SIMD?

Gunther

Farabi · January 06, 2014, 08:04:51 PM

Quote from: Jean-Marie on January 06, 2014, 03:56:30 AM
I think you could gain a little speed by replacing
fldz
fstp rsltXYZ

with a simple
mov rsltXYZ,0

Thanks. I'll note it on the next source code release, I'll try to optimize it.

Farabi · January 06, 2014, 08:06:10 PM

Quote from: Gunther on January 06, 2014, 04:44:31 AM
Quote from: Jean-Marie on January 06, 2014, 03:56:30 AM
I think you could gain a little speed by replacing
fldz
fstp rsltXYZ

with a simple
mov rsltXYZ,0

that's for sure.

Farabi,

on the other hand, your vertex is similar to a normal vector. Why not use SIMD?

Gunther

I'll try to make the SIMD version after I have finished backing up my data to my new system. I just bought a new laptop.

Gunther · January 07, 2014, 09:02:22 AM

Farabi,

Quote from: Farabi on January 06, 2014, 08:06:10 PM
I'll try to make the SIMD version after I have finished backing up my data to my new system. I just bought a new laptop.

No hurry. Take it easy.

Gunther

Farabi · January 11, 2014, 02:49:21 PM

Code Select


	; Fragment: x' = m0.x*v.x + m0.y*v.y + m0.z*v.z
	; (esi -> matrix, edi -> vertex, rsltX = 4-byte local of the enclosing proc)
	mov esi,lpMatrix
	mov edi,lpVertex

	; rsltX needs no zeroing beforehand (neither fldz/fstp nor mov rsltX,0):
	; the fstp below is its first and only write. The running sum is kept
	; on the FPU stack instead of being stored and re-added from memory.
	fld [esi].VERTEX.x
	fmul [edi].VERTEX.x
	fld [esi].VERTEX.y
	fmul [edi].VERTEX.y
	faddp st(1),st(0)
	fld [esi].VERTEX.z
	fmul [edi].VERTEX.z
	faddp st(1),st(0)
	fstp rsltX

this code can be changed to

Code Select


; Proposed SSE replacement for the three fld/fmul pairs (kept commented out).
; NOTE(review): movq transfers only 64 bits, i.e. two 4-byte values, while
; mulps multiplies four packed singles. If VERTEX.x/.y/.z are REAL4 fields
; (TODO confirm against the VERTEX definition), z is never loaded this way;
; a 128-bit load such as movups would be needed.
;	movq xmm0,[esi]
;	movq xmm1,[edi]
;	mulps xmm0,xmm1

But how can I add all the elements in xmm0 with a single instruction?

Gunther · January 12, 2014, 09:35:02 PM

Farabi,

you'll need a horizontal addition like haddps.

Gunther

Farabi · January 13, 2014, 04:08:05 PM

Quote from: Gunther on January 12, 2014, 09:35:02 PM
Farabi,

you'll need a horizontal addition like haddps.

Gunther

Never heard of it. It's SSE3; I'm not sure my laptop supports it.

Gunther · January 14, 2014, 12:20:01 AM

Hi Farabi,

which type of values do you need the horizontal addition for? Here is one for float. We assume that your result is in XMM0 and you would like to do a horizontal addition:

Code Select


        movhlps    xmm1, xmm0            ; get bits 64 - 127 from xmm0
        addps      xmm0, xmm1            ; sums in 2 dwords
        pshufd     xmm1, xmm0, 01h       ; get bits 32 - 63 from xmm0
        addss      xmm0, xmm1            ; function result in 1 dword

Gunther

qWord · January 14, 2014, 01:32:00 AM

assuming that the matrix is saved row-wise, the following SSE1 solution (REAL4/float) should do the job:

Code Select

; 4x4 matrix of REAL4 values stored row by row.
; Field aRC holds the element in row R, column C; 16 fields, 64 bytes.
M4x4 struct
    ; row 1
    a11 REAL4 ?
    a12 REAL4 ?
    a13 REAL4 ?
    a14 REAL4 ?
    ; row 2
    a21 REAL4 ?
    a22 REAL4 ?
    a23 REAL4 ?
    a24 REAL4 ?
    ; row 3
    a31 REAL4 ?
    a32 REAL4 ?
    a33 REAL4 ?
    a34 REAL4 ?
    ; row 4
    a41 REAL4 ?
    a42 REAL4 ?
    a43 REAL4 ?
    a44 REAL4 ?
M4x4 ends

; Four-component REAL4 vector, 16 bytes in total.
V4 struct
    x   REAL4 ?     ; byte offset 0
    y   REAL4 ?     ; byte offset 4
    z   REAL4 ?     ; byte offset 8
    w   REAL4 ?     ; byte offset 12
V4 ends

.code

; ---------------------------------------------------------------------------
; mul_V4T_M4x4u -- row vector times row-wise 4x4 matrix, unaligned data
;
;   pResult : receives the four REAL4 results
;   pV4     : vector (x, y, z, w)
;   pM4x4   : matrix stored row by row (a11 a12 a13 a14, a21 ...)
;
;   result[j] = ((x*a1j + y*a2j) + z*a3j) + w*a4j        j = 1..4
;
; Every memory access uses movups, so none of the pointers needs 16-byte
; alignment. All loads are issued before the single store, so pResult may
; overlap either input.
; Overwrites eax, ecx, edx, xmm0, xmm1 and xmm3-xmm7.
; ---------------------------------------------------------------------------
mul_V4T_M4x4u proc pResult: ptr V4, pV4: ptr V4, pM4x4: ptr M4x4

    mov ecx,pV4
    mov edx,pM4x4
    mov eax,pResult

    ; fetch the vector and the four matrix rows up front
    movups xmm3,OWORD ptr [ecx]             ; (x, y, z, w)
    movups xmm4,OWORD ptr [edx][0*OWORD]    ; row 1
    movups xmm5,OWORD ptr [edx][1*OWORD]    ; row 2
    movups xmm6,OWORD ptr [edx][2*OWORD]    ; row 3
    movups xmm7,OWORD ptr [edx][3*OWORD]    ; row 4

    ; xmm0 = x * row 1
    movaps xmm0,xmm3
    shufps xmm0,xmm0,000h       ; broadcast x to all lanes
    mulps xmm0,xmm4

    ; xmm0 += y * row 2
    movaps xmm1,xmm3
    shufps xmm1,xmm1,055h       ; broadcast y
    mulps xmm1,xmm5
    addps xmm0,xmm1

    ; xmm0 += z * row 3
    movaps xmm1,xmm3
    shufps xmm1,xmm1,0AAh       ; broadcast z
    mulps xmm1,xmm6
    addps xmm0,xmm1

    ; xmm0 += w * row 4 (the vector copy itself is no longer needed)
    shufps xmm3,xmm3,0FFh       ; broadcast w
    mulps xmm3,xmm7
    addps xmm0,xmm3

    movups OWORD ptr [eax],xmm0

    ret

mul_V4T_M4x4u endp

; ---------------------------------------------------------------------------
; mul_V4T_M4x4a -- row vector times row-wise 4x4 matrix, aligned matrix
;
;   pResult : receives the four REAL4 results
;   pV4     : vector (x, y, z, w)
;   pM4x4   : matrix stored row by row; used directly as a mulps memory
;             operand, so it must be 16-byte aligned
;
;   result[j] = ((x*a1j + y*a2j) + z*a3j) + w*a4j        j = 1..4
;
; The vector and the result are accessed with movups; movaps could be used
; instead if they are known to be 16-byte aligned. The store is the last
; memory access, so pResult may overlap either input.
; Overwrites eax, ecx, edx and xmm0-xmm3.
; ---------------------------------------------------------------------------
mul_V4T_M4x4a proc pResult: ptr V4, pV4: ptr V4, pM4x4: ptr M4x4

    mov ecx,pV4
    mov edx,pM4x4
    mov eax,pResult

    movups xmm3,OWORD ptr [ecx]         ; (x, y, z, w)

    ; xmm0 = x * row 1
    movaps xmm0,xmm3
    shufps xmm0,xmm0,000h               ; broadcast x
    mulps xmm0,OWORD ptr [edx][0*OWORD]

    ; xmm0 += y * row 2
    movaps xmm1,xmm3
    shufps xmm1,xmm1,055h               ; broadcast y
    mulps xmm1,OWORD ptr [edx][1*OWORD]
    addps xmm0,xmm1

    ; xmm0 += z * row 3
    movaps xmm2,xmm3
    shufps xmm2,xmm2,0AAh               ; broadcast z
    mulps xmm2,OWORD ptr [edx][2*OWORD]
    addps xmm0,xmm2

    ; xmm0 += w * row 4
    shufps xmm3,xmm3,0FFh               ; broadcast w
    mulps xmm3,OWORD ptr [edx][3*OWORD]
    addps xmm0,xmm3

    movups OWORD ptr [eax],xmm0

    ret

mul_V4T_M4x4a endp

EDIT: for the case that the matrix is saved column-wise:

Code Select

; 4x4 matrix of REAL4 values stored column by column.
; Field aRC holds the element in row R, column C; 16 fields, 64 bytes.
M4x4 struct
    ; column 1
    a11 REAL4 ?
    a21 REAL4 ?
    a31 REAL4 ?
    a41 REAL4 ?
    ; column 2
    a12 REAL4 ?
    a22 REAL4 ?
    a32 REAL4 ?
    a42 REAL4 ?
    ; column 3
    a13 REAL4 ?
    a23 REAL4 ?
    a33 REAL4 ?
    a43 REAL4 ?
    ; column 4
    a14 REAL4 ?
    a24 REAL4 ?
    a34 REAL4 ?
    a44 REAL4 ?
M4x4 ends

; Four-component REAL4 vector, 16 bytes in total.
V4 struct
    x   REAL4 ?     ; byte offset 0
    y   REAL4 ?     ; byte offset 4
    z   REAL4 ?     ; byte offset 8
    w   REAL4 ?     ; byte offset 12
V4 ends

; ---------------------------------------------------------------------------
; mul_V4T_M4x4 -- row vector times 4x4 matrix stored column by column
;
;   pResult : receives the four REAL4 results
;   pV4     : vector (x, y, z, w)
;   pM4x4   : matrix stored column by column (a11 a21 a31 a41, a12 ...)
;
;   result[j] = (a4j*w + a3j*z) + (a2j*y + a1j*x)        j = 1..4
;
; Each column is multiplied lane-wise by the vector, the four product
; vectors are transposed, and the transposed rows are added pairwise.
; All memory accesses use movups; movaps could be used where the data is
; known to be 16-byte aligned. The store is the last memory access, so
; pResult may overlap either input.
; Overwrites eax, ecx, edx and xmm0-xmm6.
; ---------------------------------------------------------------------------
mul_V4T_M4x4 proc pResult: ptr V4, pV4: ptr V4, pM4x4: ptr M4x4

    mov ecx,pV4
    mov edx,pM4x4
    mov eax,pResult

    movups xmm6,OWORD ptr [ecx]             ; (x, y, z, w)
    movups xmm0,OWORD ptr [edx][0*OWORD]    ; column 1
    movups xmm1,OWORD ptr [edx][1*OWORD]    ; column 2
    movups xmm2,OWORD ptr [edx][2*OWORD]    ; column 3
    movups xmm3,OWORD ptr [edx][3*OWORD]    ; column 4

    ; lane-wise products, pJ = column J * vector
    mulps xmm0,xmm6         ; p1 = (a11*x, a21*y, a31*z, a41*w)
    mulps xmm1,xmm6         ; p2 = (a12*x, a22*y, a32*z, a42*w)
    mulps xmm2,xmm6         ; p3 = (a13*x, ...)
    mulps xmm3,xmm6         ; p4 = (a14*x, ...)

    ; transpose step 1: interleave pairs of product vectors
    movaps xmm4,xmm0
    unpcklps xmm4,xmm1      ; xmm4 = (p1[0], p2[0], p1[1], p2[1])
    unpckhps xmm0,xmm1      ; xmm0 = (p1[2], p2[2], p1[3], p2[3])
    movaps xmm5,xmm2
    unpcklps xmm5,xmm3      ; xmm5 = (p3[0], p4[0], p3[1], p4[1])
    unpckhps xmm2,xmm3      ; xmm2 = (p3[2], p4[2], p3[3], p4[3])

    ; transpose step 2: combine 64-bit halves into the transposed rows
    movaps xmm1,xmm4        ; keep the high half before movlhps replaces it
    movlhps xmm4,xmm5       ; xmm4 = (p1[0], p2[0], p3[0], p4[0])  x terms
    movhlps xmm5,xmm1       ; xmm5 = (p1[1], p2[1], p3[1], p4[1])  y terms
    movaps xmm3,xmm0
    movlhps xmm0,xmm2       ; xmm0 = (p1[2], p2[2], p3[2], p4[2])  z terms
    movhlps xmm2,xmm3       ; xmm2 = (p1[3], p2[3], p3[3], p4[3])  w terms

    ; pairwise sums
    addps xmm5,xmm4         ; (a2j*y) + (a1j*x)
    addps xmm2,xmm0         ; (a4j*w) + (a3j*z)
    addps xmm2,xmm5         ; sum of all four terms per lane

    movups OWORD ptr [eax],xmm2

    ret

mul_V4T_M4x4 endp

Farabi · January 14, 2014, 03:05:07 PM

Thanks qWord and Gunther, your knowledge is above my knowledge.

I want to show off my rendering engine, which uses all of these techniques, to you: https://drive.google.com/file/d/0B4e67L-fbWVQRHZldUJjXy1UWjA/edit?usp=sharing I've cut all dependencies and reduced the size of the package from 20 MB to 2 MB, and of the application from 600 KB to 50 KB. The application has the dynamic shadowing system, motion capture using the BVH format, and some basic 3D math.

The MASM Forum

News: