;-----------------------------------------------------------------------
; fMat4fGPUMul
; Multiplies two 4x4 float matrices by routing them through the OpenGL
; MODELVIEW matrix stack and reading the product back.
;
; In:   lpdestination -> buffer that glGetFloatv fills with the
;                        resulting GL_MODELVIEW_MATRIX
;       lpMat4x41     -> matrix loaded with glLoadMatrixf
;       lpMat4x42     -> matrix applied with glMultMatrixf
; Out:  [lpdestination] = product as left on top of the MODELVIEW stack
; Note: GL_MODELVIEW is selected by this routine and stays selected on
;       return; the MODELVIEW stack contents themselves are restored.
;       NOTE(review): needs a current OpenGL context - confirm callers.
;-----------------------------------------------------------------------
fMat4fGPUMul proc lpdestination:dword,lpMat4x41:dword,lpMat4x42:dword
    ; Select the stack BEFORE pushing. Pushing first would save whatever
    ; stack happened to be current, while the pop below would hit the
    ; MODELVIEW stack - unbalancing both whenever the mode differed.
    invoke glMatrixMode,GL_MODELVIEW
    invoke glPushMatrix

    ; glLoadMatrixf replaces the current matrix outright, so no
    ; glLoadIdentity is needed in front of it.
    invoke glLoadMatrixf,lpMat4x41
    invoke glMultMatrixf,lpMat4x42
    invoke glGetFloatv,GL_MODELVIEW_MATRIX,lpdestination

    invoke glPopMatrix                  ; put the caller's MODELVIEW back
    ret
fMat4fGPUMul endp
I'm impressed with the capability of my SiS card at multiplying matrices. It took 3 seconds to calculate 100 million matrix multiplications. Compared to my FPU version of matrix multiplication, which took 20 minutes, I should consider that something was wrong with the software, not the hardware. For a system clocked at 250 MHz, it is far superior to a dual-core 1.86 GHz processor.
Now, I think we need a new term for processor quality. How can a dual-core processor running at many hundreds of megahertz be so slow compared to a GPU whose clock is not even a quarter of that?
FLOPS (http://en.wikipedia.org/wiki/Flops) would be the measurement you're after.
100 million matrix multiplies shouldn't take 20min..
On my Core2 Duo 2.0 GHz (about 5 years old) I can do at least 10-20 million per second using SSE2.
If you're using the FPU, it sounds like you might have a denormal issue or an FPU stack alignment problem to make it that slow.
Either way my suggestion would be to use SIMD for anything like that.
Another tip might be to store your matrices in ROW and COLUMN order simultaneously to avoid having to swizzle them on the fly for vertex/vector transforms by that matrix. Not that this
usually matters much as you only have to swizzle once per transform batch.
Quote from: johnsa on June 19, 2012, 04:59:43 AM
100 million matrix multiplies shouldn't take 20min..
On my Core2 Duo 2.0 GHz (about 5 years old) I can do at least 10-20 million per second using SSE2.
If you're using the FPU, it sounds like you might have a denormal issue or an FPU stack alignment problem to make it that slow.
Either way my suggestion would be to use SIMD for anything like that.
Another tip might be to store your matrices in ROW and COLUMN order simultaneously to avoid having to swizzle them on the fly for vertex/vector transforms by that matrix. Not that this
usually matters much as you only have to swizzle once per transform batch.
My FPU version was a bad one, but it showed me that I should just use the GPU rather than the FPU.
Quote
;Code Preview
; Oh, it is such ugly code; I thought it was all the same
;-----------------------------------------------------------------------
; fMat4fMulMatrix
; CPU 4x4 matrix multiply built on the helpers fMat4fMulProcess and
; fFillMatrix4f (both defined elsewhere).
;
; In:   lpMat4fDestination -> receives the 16-dword result
;       lpMat4fM1          -> first  source matrix (m1)
;       lpMat4fM2          -> second source matrix (m2)
;
; Element formula, for r = 0..3 and c = 0,4,8,12:
;   result[c+r] = m1[r]   *m2[c]   + m1[r+4] *m2[c+1]
;               + m1[r+8] *m2[c+2] + m1[r+12]*m2[c+3]
;
; NOTE(review): the helpers are not visible here. From the call pattern,
; fMat4fMulProcess is presumed to take four (m1 index, m2 index) pairs
; and return the summed products as REAL4 bits in eax, and fFillMatrix4f
; is presumed to store that value at the given element index - confirm.
;
; The result is assembled in a local scratch matrix and copied out at
; the end, so the destination pointer may equal either source pointer.
;-----------------------------------------------------------------------
fMat4fMulMatrix proc uses esi edi lpMat4fDestination:dword,lpMat4fM1:dword,lpMat4fM2:dword
    LOCAL mtmp[16]:dword                ; scratch result matrix

    mov esi,lpMat4fM1
    mov edi,lpMat4fM2

    ; Assembly-time expansion: emits the same 16 call pairs, in the same
    ; order (elements 0,4,8,12, 1,5,9,13, ...), as writing them by hand.
    mm4Row = 0
    REPEAT 4
        mm4Col = 0
        REPEAT 4
            invoke fMat4fMulProcess,esi,edi,mm4Row,mm4Col,mm4Row+4,mm4Col+1,mm4Row+8,mm4Col+2,mm4Row+12,mm4Col+3
            invoke fFillMatrix4f,addr mtmp,mm4Row+mm4Col,eax
            mm4Col = mm4Col + 4
        ENDM
        mm4Row = mm4Row + 1
    ENDM

    ; Publish the finished matrix: 16 elements * 4 bytes.
    invoke MemCopy,addr mtmp,lpMat4fDestination,16*4
    ret
fMat4fMulMatrix endp
Anyway, if you want to multiply a vector by a matrix, you might want this code:
;-----------------------------------------------------------------------
; fMat4fGPUMulVector
; Transforms a vertex by a 4x4 float matrix using the OpenGL MODELVIEW
; stack: the matrix is loaded, a translation by the vertex is applied,
; and elements 12..15 of the resulting matrix are copied back over the
; vertex. (The fourth column of M*T(x,y,z) is M*(x,y,z,1).)
;
; In:   lpMat4f  -> matrix handed to glLoadMatrixf
;       lpVector -> VERTEX read as x,y,z and overwritten with the result
; Note: GL_MODELVIEW is selected by this routine and stays selected on
;       return; the MODELVIEW stack contents themselves are restored.
;       NOTE(review): 4*4 bytes are written to lpVector - confirm that
;       VERTEX really has room for a fourth (w) float.
;-----------------------------------------------------------------------
fMat4fGPUMulVector proc uses esi edi lpMat4f:dword,lpVector:dword
    LOCAL mat[16]:dword                 ; read-back copy of the product

    mov esi,lpVector                    ; esi = vertex, live to the end

    ; Select the stack BEFORE pushing so that the push and the pop
    ; below operate on the same (MODELVIEW) stack.
    invoke glMatrixMode,GL_MODELVIEW
    invoke glPushMatrix

    ; glLoadMatrixf overwrites the current matrix, so a preceding
    ; glLoadIdentity would be wasted work.
    invoke glLoadMatrixf,lpMat4f
    invoke glTranslatef,[esi].VERTEX.x,[esi].VERTEX.y,[esi].VERTEX.z
    invoke glGetFloatv,GL_MODELVIEW_MATRIX,addr mat

    invoke glPopMatrix                  ; put the caller's MODELVIEW back

    lea edi,mat[12*4]                   ; edi -> elements 12..15
    invoke MemCopy,edi,esi,4*4          ; copy them over the vertex
    ret
fMat4fGPUMulVector endp
That is the fastest I can get.