GPU Matrix multiply

Started by Farabi, June 17, 2012, 09:04:50 PM


Farabi



fMat4fGPUMul proc uses esi edi lpdestination:dword,lpMat4x41:dword,lpMat4x42:dword

; select the modelview stack first, so the push/pop below save and
; restore the matrix we are about to overwrite
invoke glMatrixMode,GL_MODELVIEW
invoke glPushMatrix
invoke glLoadMatrixf,lpMat4x41 ; modelview = M1
invoke glMultMatrixf,lpMat4x42 ; modelview = M1 * M2
invoke glGetFloatv,GL_MODELVIEW_MATRIX,lpdestination ; read the product back
invoke glPopMatrix ; restore the caller's modelview matrix

ret
fMat4fGPUMul endp
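
For reference, a minimal usage sketch (mine, not from the engine): each buffer is 16 real4 values in OpenGL's column-major layout, and an OpenGL rendering context must be current on the calling thread, because glGetFloatv reads the result back from the driver.

.data?
matA real4 16 dup(?) ; left operand, column-major
matB real4 16 dup(?) ; right operand, column-major
matR real4 16 dup(?) ; receives matA * matB
.code
; assumes wglMakeCurrent has already been called for this thread
invoke fMat4fGPUMul,addr matR,addr matA,addr matB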



I'm impressed with the capability of my SiS card at multiplying matrices. It took 3 seconds to calculate 100 million matrix multiplies. Compared to my FPU version of matrix multiplication, which took 20 minutes, I should reconsider: something was wrong with the software, not the hardware. For a system clocked at 250 MHz, it is far superior to a dual-core 1.86 GHz processor.

Now I think we need a new term for processor quality. How can a dual core at 1.86 GHz be so slow compared to a GPU that isn't even clocked at a quarter of that?
http://farabidatacenter.url.ph/MySoftware/
My 3D Game Engine Demo.

Contact me at Whatsapp: 6283818314165

zooba

FLOPS would be the measurement you're after.
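
To put rough numbers on that (using the timings quoted above, so treat them as estimates): one 4x4 single-precision multiply is 64 multiplications plus 48 additions, or 112 floating-point operations. 100 million of them in 3 seconds works out to about 11.2e9 / 3 ≈ 3.7 GFLOPS, while the same work in 20 minutes is about 11.2e9 / 1200 ≈ 9 MFLOPS. A 400x gap like that says more about the two implementations than about the raw hardware.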

johnsa


100 million matrix multiplies shouldn't take 20 minutes.
On my Core2 Duo 2.0 GHz (about 5 years old) I can do at least 10-20 million per second using SSE2.

If you're using the FPU, it sounds like you might have a denormal issue or an FPU stack alignment problem to make it that slow.
Either way my suggestion would be to use SIMD for anything like that.

Another tip might be to store your matrices in ROW and COLUMN order simultaneously to avoid having to swizzle them on the fly for vertex/vector transforms by that matrix. Not that this usually matters much, as you only have to swizzle once per transform batch.
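
For what it's worth, here is a minimal SSE sketch of the kind of routine johnsa describes (my own code, not his): column-major like the OpenGL layout used above, the routine name is made up, and it assumes all three matrices are 16-byte aligned (use movups if they are not).

Mat4MulSSE proc uses esi edi lpDest:dword,lpM1:dword,lpM2:dword

mov esi,lpM1
mov edi,lpM2
mov edx,lpDest
movaps xmm4,[esi]    ; column 0 of m1
movaps xmm5,[esi+16] ; column 1 of m1
movaps xmm6,[esi+32] ; column 2 of m1
movaps xmm7,[esi+48] ; column 3 of m1
xor ecx,ecx          ; current column of m2 / destination
@@:
movss xmm0,dword ptr [edi]    ; m2[4j+0]
shufps xmm0,xmm0,0            ; broadcast it to all four lanes
mulps xmm0,xmm4
movss xmm1,dword ptr [edi+4]  ; m2[4j+1]
shufps xmm1,xmm1,0
mulps xmm1,xmm5
addps xmm0,xmm1
movss xmm1,dword ptr [edi+8]  ; m2[4j+2]
shufps xmm1,xmm1,0
mulps xmm1,xmm6
addps xmm0,xmm1
movss xmm1,dword ptr [edi+12] ; m2[4j+3]
shufps xmm1,xmm1,0
mulps xmm1,xmm7
addps xmm0,xmm1
movaps [edx],xmm0    ; destination column j = m1 * (column j of m2)
add edi,16
add edx,16
inc ecx
cmp ecx,4
jl @B

ret
Mat4MulSSE endp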



Farabi

Quote from: johnsa on June 19, 2012, 04:59:43 AM

100 million matrix multiplies shouldn't take 20 minutes.
On my Core2 Duo 2.0 GHz (about 5 years old) I can do at least 10-20 million per second using SSE2.

If you're using the FPU, it sounds like you might have a denormal issue or an FPU stack alignment problem to make it that slow.
Either way my suggestion would be to use SIMD for anything like that.

Another tip might be to store your matrices in ROW and COLUMN order simultaneously to avoid having to swizzle them on the fly for vertex/vector transforms by that matrix. Not that this usually matters much, as you only have to swizzle once per transform batch.

My FPU version was a bad one, but it shows me that I should just use the GPU rather than the FPU.
Quote
;Code Preview
; Oh, it is such ugly code. I thought it was all the same.

fMat4fMulMatrix proc uses esi edi lpMat4fDestination:dword,lpMat4fM1:dword,lpMat4fM2:dword
   LOCAL mtmp[16]:dword
   LOCAL ftmp:real4
   
   mov esi,lpMat4fM1
   mov edi,lpMat4fM2
   
   ;invoke fMat4fLoadZero,addr mtmp
   
   ;result.matrix[0] = (m1.matrix[0]*m2.matrix[0]) + (m1.matrix[4]*m2.matrix[1]) +(m1.matrix[8]*m2.matrix[2]) +(m1.matrix[12]*m2.matrix[3]);
   invoke fMat4fMulProcess,esi,edi,0,0,4,1,8,2,12,3
   invoke fFillMatrix4f,addr mtmp,0,eax
   ;result.matrix[4] = (m1.matrix[0]*m2.matrix[4]) + (m1.matrix[4]*m2.matrix[5]) +(m1.matrix[8]*m2.matrix[6]) +(m1.matrix[12]*m2.matrix[7]);
   invoke fMat4fMulProcess,esi,edi,0,4,4,5,8,6,12,7
   invoke fFillMatrix4f,addr mtmp,4,eax
   ;result.matrix[8] = (m1.matrix[0]*m2.matrix[8]) + (m1.matrix[4]*m2.matrix[9]) +(m1.matrix[8]*m2.matrix[10])+(m1.matrix[12]*m2.matrix[11]);
   invoke fMat4fMulProcess,esi,edi,0,8,4,9,8,10,12,11
   invoke fFillMatrix4f,addr mtmp,8,eax
   ;result.matrix[12]= (m1.matrix[0]*m2.matrix[12])+ (m1.matrix[4]*m2.matrix[13])+(m1.matrix[8]*m2.matrix[14])+(m1.matrix[12]*m2.matrix[15]);
   invoke fMat4fMulProcess,esi,edi,0,12,4,13,8,14,12,15
   invoke fFillMatrix4f,addr mtmp,12,eax
   
   ;result.matrix[1] = (m1.matrix[1]*m2.matrix[0]) +(m1.matrix[5]*m2.matrix[1]) +(m1.matrix[9]*m2.matrix[2]) +(m1.matrix[13]*m2.matrix[3]);
   invoke fMat4fMulProcess,esi,edi,1,0,5,1,9,2,13,3
   invoke fFillMatrix4f,addr mtmp,1,eax
   ;result.matrix[5] = (m1.matrix[1]*m2.matrix[4]) +(m1.matrix[5]*m2.matrix[5]) +(m1.matrix[9]*m2.matrix[6]) +(m1.matrix[13]*m2.matrix[7]);
   invoke fMat4fMulProcess,esi,edi,1,4,5,5,9,6,13,7
   invoke fFillMatrix4f,addr mtmp,5,eax
   ;result.matrix[9] = (m1.matrix[1]*m2.matrix[8]) +(m1.matrix[5]*m2.matrix[9]) +(m1.matrix[9]*m2.matrix[10])+(m1.matrix[13]*m2.matrix[11]);
   invoke fMat4fMulProcess,esi,edi,1,8,5,9,9,10,13,11
   invoke fFillMatrix4f,addr mtmp,9,eax
   ;result.matrix[13]= (m1.matrix[1]*m2.matrix[12])+(m1.matrix[5]*m2.matrix[13])+(m1.matrix[9]*m2.matrix[14])+(m1.matrix[13]*m2.matrix[15]);
   invoke fMat4fMulProcess,esi,edi,1,12,5,13,9,14,13,15
   invoke fFillMatrix4f,addr mtmp,13,eax
   
   ;result.matrix[2] = (m1.matrix[2]*m2.matrix[0]) +(m1.matrix[6]*m2.matrix[1]) +(m1.matrix[10]*m2.matrix[2]) +(m1.matrix[14]*m2.matrix[3]);
   invoke fMat4fMulProcess,esi,edi,2,0,6,1,10,2,14,3
   invoke fFillMatrix4f,addr mtmp,2,eax
   ;result.matrix[6] = (m1.matrix[2]*m2.matrix[4]) +(m1.matrix[6]*m2.matrix[5]) +(m1.matrix[10]*m2.matrix[6]) +(m1.matrix[14]*m2.matrix[7]);
   invoke fMat4fMulProcess,esi,edi,2,4,6,5,10,6,14,7
   invoke fFillMatrix4f,addr mtmp,6,eax
   ;result.matrix[10]= (m1.matrix[2]*m2.matrix[8]) +(m1.matrix[6]*m2.matrix[9]) +(m1.matrix[10]*m2.matrix[10])+(m1.matrix[14]*m2.matrix[11]);
   invoke fMat4fMulProcess,esi,edi,2,8,6,9,10,10,14,11
   invoke fFillMatrix4f,addr mtmp,10,eax
   ;result.matrix[14]= (m1.matrix[2]*m2.matrix[12])+(m1.matrix[6]*m2.matrix[13])+(m1.matrix[10]*m2.matrix[14])+(m1.matrix[14]*m2.matrix[15]);
   invoke fMat4fMulProcess,esi,edi,2,12,6,13,10,14,14,15
   invoke fFillMatrix4f,addr mtmp,14,eax
   
   ;result.matrix[3] = (m1.matrix[3]*m2.matrix[0]) +(m1.matrix[7]*m2.matrix[1]) +(m1.matrix[11]*m2.matrix[2]) +(m1.matrix[15]*m2.matrix[3]);
   invoke fMat4fMulProcess,esi,edi,3,0,7,1,11,2,15,3
   invoke fFillMatrix4f,addr mtmp,3,eax
   ;result.matrix[7] = (m1.matrix[3]*m2.matrix[4]) +(m1.matrix[7]*m2.matrix[5]) +(m1.matrix[11]*m2.matrix[6]) +(m1.matrix[15]*m2.matrix[7]);
   invoke fMat4fMulProcess,esi,edi,3,4,7,5,11,6,15,7
   invoke fFillMatrix4f,addr mtmp,7,eax
   ;result.matrix[11]= (m1.matrix[3]*m2.matrix[8]) +(m1.matrix[7]*m2.matrix[9]) +(m1.matrix[11]*m2.matrix[10])+(m1.matrix[15]*m2.matrix[11]);
   invoke fMat4fMulProcess,esi,edi,3,8,7,9,11,10,15,11
   invoke fFillMatrix4f,addr mtmp,11,eax
   ;result.matrix[15]= (m1.matrix[3]*m2.matrix[12])+(m1.matrix[7]*m2.matrix[13])+(m1.matrix[11]*m2.matrix[14])+(m1.matrix[15]*m2.matrix[15]);
   invoke fMat4fMulProcess,esi,edi,3,12,7,13,11,14,15,15
   invoke fFillMatrix4f,addr mtmp,15,eax
   
   invoke MemCopy,addr mtmp,lpMat4fDestination,16*4
   
   ret
fMat4fMulMatrix endp
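
For comparison, the 20 minutes is probably not the FPU itself: each matrix here costs 32 stdcall invokes (fMat4fMulProcess plus fFillMatrix4f), and that call overhead swamps the arithmetic. A minimal sketch of the same column-major multiply done inline on the FPU, with no per-element calls (the routine name is mine; it assumes real4 matrices and the masm32 MemCopy used above):

fMat4fMulFPU proc uses esi edi ebx lpDest:dword,lpM1:dword,lpM2:dword
LOCAL mtmp[16]:real4

mov esi,lpM1
mov edi,lpM2
xor ecx,ecx                ; ecx = 4*j, start of destination column j
col_loop:
xor edx,edx                ; edx = i, row inside that column
row_loop:
fldz                       ; accumulator for dest[4j+i]
xor ebx,ebx                ; ebx = k
dot_loop:
mov eax,ebx
shl eax,2
add eax,edx                ; index i + 4k
fld real4 ptr [esi+eax*4]  ; m1[i + 4k]
lea eax,[ecx+ebx]          ; index 4j + k
fld real4 ptr [edi+eax*4]  ; m2[4j + k]
fmulp st(1),st
faddp st(1),st             ; accumulate the product
inc ebx
cmp ebx,4
jl dot_loop
lea eax,[ecx+edx]          ; index 4j + i
fstp mtmp[eax*4]           ; store the finished element
inc edx
cmp edx,4
jl row_loop
add ecx,4
cmp ecx,16
jl col_loop

invoke MemCopy,addr mtmp,lpDest,16*4

ret
fMat4fMulFPU endp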

Anyway, if you want to multiply a vector, you might want this code



fMat4fGPUMulVector proc uses esi edi lpMat4f:dword,lpVector:dword
LOCAL mat[16]:dword

mov esi,lpVector
; select the modelview stack first, so the push/pop save and restore it
invoke glMatrixMode,GL_MODELVIEW
invoke glPushMatrix
invoke glLoadMatrixf,lpMat4f ; modelview = M (glLoadIdentity is not needed, the load replaces the matrix)
invoke glTranslatef,[esi].VERTEX.x,[esi].VERTEX.y,[esi].VERTEX.z ; modelview = M * T(x,y,z)
invoke glGetFloatv,GL_MODELVIEW_MATRIX,addr mat
invoke glPopMatrix

; copy the fourth column of the result, elements 12..15, back over the vertex
lea edi,mat
add edi,12*4
invoke MemCopy,edi,esi,4*4

ret
fMat4fGPUMulVector endp
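
A short note on why the translate trick works (my reading of the code, not something stated in the thread): with OpenGL's column-major matrices, glTranslatef post-multiplies the current matrix by T(x,y,z), and the fourth column of M * T(x,y,z) is exactly M * (x,y,z,1). That column sits in elements 12..15, which is what the MemCopy pulls out, so the vertex is overwritten with its transformed homogeneous coordinates. It does assume the VERTEX structure has at least four consecutive real4 fields, since 16 bytes are copied back over it.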


That is the fastest I can get.
http://farabidatacenter.url.ph/MySoftware/
My 3D Game Engine Demo.

Contact me at Whatsapp: 6283818314165