Assuming that the matrix is stored row-wise (row-major), the following SSE1 solution (REAL4/float) should do the job:
; 4x4 matrix of REAL4 elements, fields named a<row><column>.
; The generator below emits the fields row by row, so the memory order is
; a11, a12, a13, a14, a21, ... , a44 (row-major).
M4x4 STRUCT
    cntr_row = 1
    WHILE cntr_row LE 4                     ; outer loop: one pass per row
        cntr_clmn = 1
        WHILE cntr_clmn LE 4                ; inner loop: one field per column
            @CatStr(<a>,%cntr_row,%cntr_clmn) REAL4 ?
            cntr_clmn = cntr_clmn + 1
        ENDM
        cntr_row = cntr_row + 1
    ENDM
M4x4 ENDS
; 4-component REAL4 vector; 16 bytes, components stored in the order x, y, z, w.
V4 STRUCT
    x   REAL4 ?                             ; offset  0
    y   REAL4 ?                             ; offset  4
    z   REAL4 ?                             ; offset  8
    w   REAL4 ?                             ; offset 12
V4 ENDS
.code
;-----------------------------------------------------------------------
; mul_V4T_M4x4u - row vector times 4x4 matrix; matrix may be unaligned.
;
;   result = (x, y, z, w) * M
;          = x*row1 + y*row2 + z*row3 + w*row4
;
; The four 16-byte groups at pM4x4 are treated as the rows of M
; (row-wise storage).
;
; In:    pResult -> V4 receiving the product (unaligned store)
;        pV4     -> source vector            (unaligned load)
;        pM4x4   -> source matrix            (unaligned loads)
; Out:   [pResult] = product; eax = pResult
; Clobb: ecx, edx, xmm0-xmm7
; All inputs are read before the result is written.
;-----------------------------------------------------------------------
mul_V4T_M4x4u PROC pResult: PTR V4, pV4: PTR V4, pM4x4: PTR M4x4
        mov     ecx, pV4
        mov     edx, pM4x4
        mov     eax, pResult

        movups  xmm3, OWORD PTR [ecx]                       ; xmm3 = (x, y, z, w)

        ; term 1: x * row1
        movups  xmm4, OWORD PTR [edx + 0*(SIZEOF OWORD)]    ; row1 = a11..a14
        movaps  xmm0, xmm3
        shufps  xmm0, xmm0, 00h                             ; (x, x, x, x)
        mulps   xmm0, xmm4                                  ; a11*x, a12*x, a13*x, a14*x

        ; term 2: y * row2
        movups  xmm5, OWORD PTR [edx + 1*(SIZEOF OWORD)]    ; row2 = a21..a24
        movaps  xmm1, xmm3
        shufps  xmm1, xmm1, 55h                             ; (y, y, y, y)
        mulps   xmm1, xmm5                                  ; a21*y, a22*y, ...
        addps   xmm0, xmm1                                  ; x*row1 + y*row2

        ; term 3: z * row3
        movups  xmm6, OWORD PTR [edx + 2*(SIZEOF OWORD)]    ; row3 = a31..a34
        movaps  xmm2, xmm3
        shufps  xmm2, xmm2, 0AAh                            ; (z, z, z, z)
        mulps   xmm2, xmm6                                  ; a31*z, ...
        addps   xmm0, xmm2                                  ; ... + z*row3

        ; term 4: w * row4 (xmm3 is no longer needed as the source copy)
        movups  xmm7, OWORD PTR [edx + 3*(SIZEOF OWORD)]    ; row4 = a41..a44
        shufps  xmm3, xmm3, 0FFh                            ; (w, w, w, w)
        mulps   xmm3, xmm7                                  ; a41*w, ...
        addps   xmm0, xmm3                                  ; ... + w*row4

        movups  OWORD PTR [eax], xmm0
        ret
mul_V4T_M4x4u ENDP
;-----------------------------------------------------------------------
; mul_V4T_M4x4a - row vector times 4x4 matrix; matrix must be aligned.
;
;   result = (x, y, z, w) * M
;          = x*row1 + y*row2 + z*row3 + w*row4
;
; The four 16-byte groups at pM4x4 are treated as the rows of M
; (row-wise storage). The rows are used directly as memory operands of
; mulps, so pM4x4 has to be 16-byte aligned.
;
; In:    pResult -> V4 receiving the product (unaligned store)
;        pV4     -> source vector            (unaligned load)
;        pM4x4   -> source matrix            (16-byte aligned)
; Out:   [pResult] = product; eax = pResult
; Clobb: ecx, edx, xmm0-xmm3
; All inputs are read before the result is written.
;-----------------------------------------------------------------------
mul_V4T_M4x4a PROC pResult: PTR V4, pV4: PTR V4, pM4x4: PTR M4x4
        mov     ecx, pV4
        mov     edx, pM4x4
        mov     eax, pResult

        ; movaps would be possible here if the vector were 16-byte aligned
        movups  xmm3, OWORD PTR [ecx]                       ; xmm3 = (x, y, z, w)

        ; term 1: x * row1
        movaps  xmm0, xmm3
        shufps  xmm0, xmm0, 00h                             ; (x, x, x, x)
        mulps   xmm0, OWORD PTR [edx + 0*(SIZEOF OWORD)]    ; a11*x, a12*x, a13*x, a14*x

        ; term 2: y * row2
        movaps  xmm1, xmm3
        shufps  xmm1, xmm1, 55h                             ; (y, y, y, y)
        mulps   xmm1, OWORD PTR [edx + 1*(SIZEOF OWORD)]    ; a21*y, a22*y, ...
        addps   xmm0, xmm1                                  ; x*row1 + y*row2

        ; term 3: z * row3
        movaps  xmm2, xmm3
        shufps  xmm2, xmm2, 0AAh                            ; (z, z, z, z)
        mulps   xmm2, OWORD PTR [edx + 2*(SIZEOF OWORD)]    ; a31*z, ...
        addps   xmm0, xmm2                                  ; ... + z*row3

        ; term 4: w * row4 (xmm3 is no longer needed as the source copy)
        shufps  xmm3, xmm3, 0FFh                            ; (w, w, w, w)
        mulps   xmm3, OWORD PTR [edx + 3*(SIZEOF OWORD)]    ; a41*w, ...
        addps   xmm0, xmm3                                  ; ... + w*row4

        ; movaps would be possible here if the result were 16-byte aligned
        movups  OWORD PTR [eax], xmm0
        ret
mul_V4T_M4x4a ENDP
EDIT: For the case where the matrix is stored column-wise (column-major):
; 4x4 matrix of REAL4 elements, fields named a<row><column>.
; The generator below emits the fields column by column, so the memory order
; is a11, a21, a31, a41, a12, ... , a44 (column-major).
M4x4 STRUCT
    cntr_clmn = 1
    WHILE cntr_clmn LE 4                    ; outer loop: one pass per column
        cntr_row = 1
        WHILE cntr_row LE 4                 ; inner loop: one field per row
            @CatStr(<a>,%cntr_row,%cntr_clmn) REAL4 ?
            cntr_row = cntr_row + 1
        ENDM
        cntr_clmn = cntr_clmn + 1
    ENDM
M4x4 ENDS
; 4-component REAL4 vector; 16 bytes, components stored in the order x, y, z, w.
V4 STRUCT
    x   REAL4 ?                             ; offset  0
    y   REAL4 ?                             ; offset  4
    z   REAL4 ?                             ; offset  8
    w   REAL4 ?                             ; offset 12
V4 ENDS
;-----------------------------------------------------------------------
; mul_V4T_M4x4 - row vector times 4x4 matrix; matrix stored column-wise.
;
; The four 16-byte groups at pM4x4 are treated as the columns of M.
; Each column is multiplied element-wise by (x, y, z, w); the four
; product vectors are then transposed so that a plain vertical add
; yields the horizontal sum of every column:
;
;   result[j] = a1j*x + a2j*y + a3j*z + a4j*w      (j = 1..4)
;
; In:    pResult -> V4 receiving the product (unaligned store)
;        pV4     -> source vector            (unaligned load)
;        pM4x4   -> source matrix            (unaligned loads)
;        (the unaligned moves could become movaps for 16-byte aligned data)
; Out:   [pResult] = product; eax = pResult
; Clobb: ecx, edx, xmm0-xmm5
; All inputs are read before the result is written.
;-----------------------------------------------------------------------
mul_V4T_M4x4 PROC pResult: PTR V4, pV4: PTR V4, pM4x4: PTR M4x4
        mov     ecx, pV4
        mov     edx, pM4x4
        mov     eax, pResult

        movups  xmm4, OWORD PTR [ecx]                       ; (x, y, z, w)
        movups  xmm2, OWORD PTR [edx + 0*(SIZEOF OWORD)]    ; column 1 = a11, a21, a31, a41
        movups  xmm3, OWORD PTR [edx + 1*(SIZEOF OWORD)]    ; column 2
        movups  xmm0, OWORD PTR [edx + 2*(SIZEOF OWORD)]    ; column 3
        movups  xmm1, OWORD PTR [edx + 3*(SIZEOF OWORD)]    ; column 4

        ; element-wise products, one vector per column
        mulps   xmm2, xmm4              ; P = a11*x, a21*y, a31*z, a41*w
        mulps   xmm3, xmm4              ; Q = a12*x, a22*y, ...
        mulps   xmm0, xmm4              ; R = a13*x, ...
        mulps   xmm1, xmm4              ; S = a14*x, ...

        ; transpose, stage 1: interleave pairs of product vectors
        movaps   xmm4, xmm2
        unpcklps xmm4, xmm3             ; xmm4 = P0, Q0, P1, Q1
        unpckhps xmm2, xmm3             ; xmm2 = P2, Q2, P3, Q3
        movaps   xmm5, xmm0
        unpcklps xmm5, xmm1             ; xmm5 = R0, S0, R1, S1
        unpckhps xmm0, xmm1             ; xmm0 = R2, S2, R3, S3

        ; transpose, stage 2: combine the 64-bit halves
        movaps  xmm1, xmm4              ; keep P0, Q0, P1, Q1 for row 2
        movlhps xmm4, xmm5              ; row 1 = P0, Q0, R0, S0
        movhlps xmm5, xmm1              ; row 2 = P1, Q1, R1, S1
        movaps  xmm3, xmm2
        movlhps xmm3, xmm0              ; row 3 = P2, Q2, R2, S2
        movhlps xmm0, xmm2              ; row 4 = P3, Q3, R3, S3

        ; vertical adds = horizontal sums of the original product vectors
        addps   xmm5, xmm4              ; (a11*x)+(a21*y) , (a12*x)+(a22*y) , ...
        addps   xmm0, xmm3              ; (a31*z)+(a41*w) , (a32*z)+(a42*w) , ...
        addps   xmm0, xmm5              ; full sum per column

        movups  OWORD PTR [eax], xmm0
        ret
mul_V4T_M4x4 ENDP