I ran some experiments with UpdateModel, Stretch, and Squash. SSE2 is good for executing calculations in parallel; otherwise it seems to be outperformed by the FPU. Note that with the FPU you don't have the overhead of the conversion instructions (cvtdq**).
I already use a lookup table for the log2 function. Replacing the whole Squash formula with a LUT could be done in theory, but the table would have to be huge, since p0 ranges from -16.999999999... to +15.999999999..., and reducing the mantissa would hurt compression.
;Update the mixer weights in the logistic domain:
;  Wi = Wi + LearningRate * ((1 - Symbol) - Squash/2^32) * Stretch_i
;
;Inputs, as used by the code below:
;  ST(0)           = Squash value; it is popped into Temp
;  Temp            = scratch; must be a 64-bit float (Temp+4 and movsd are used on it)
;  Symbol          = dword read as an INTEGER (presumably the coded bit, 0 or 1 - confirm)
;  MatchPrediction = 2 means "no match found"; WeightM is then left untouched
;  @@LearningRate  = 64-bit float constant (operand of mulsd)
;Clobbers: xmm1, xmm3, Temp, EFLAGS.
;
;The flags produced by "cmp MatchPrediction,2" are consumed by "je @@IF1" at the end.
;Nothing between the two may write EFLAGS (the SSE/SSE2 instructions used here do not).
;NOTE(review): @@IF1 is not defined in this excerpt; without a jump the code falls
;through into whatever follows the WeightM update.
        fstp    Temp                      ;pop Squash into Temp
        pcmpeqd xmm1,xmm1                 ;xmm1 = all ones
        psrld   xmm1,31                   ;every dword of xmm1 = integer 1
        sub     dword ptr Temp+4,2000000H ;subtract 32 from the double's exponent field: Temp=Squash/2^32
                                          ;(assumes Squash is a normal, non-zero double - TODO confirm)
        cmp     MatchPrediction,2         ;flags are used by "je @@IF1" below
        ;1-Symbol must be an INTEGER subtraction, because xmm1 is converted with
        ;cvtdq2pd afterwards. The previous "subss xmm1,Symbol" only gave the right
        ;bit pattern through denormal float arithmetic, which breaks when
        ;denormals-are-zero is enabled in MXCSR and is slow on many CPUs.
        movd    xmm3,dword ptr Symbol     ;xmm3 = Symbol (xmm3 is free until the movsd below)
        psubd   xmm1,xmm3                 ;xmm1 low dword = 1-Symbol (integer), other dwords stay 1
        movsd   xmm3,Temp                 ;xmm3 = Squash/2^32 (float 64 bits)
        cvtdq2pd xmm1,xmm1                ;xmm1 = 1-Symbol (float 64 bits)
        subsd   xmm1,xmm3                 ;xmm1 = (1-Symbol)-(Squash/2^32)
        mulsd   xmm1,@@LearningRate       ;xmm1 = ((1-Symbol)-(Squash/2^32))*Learning Rate
        cvtsd2ss xmm1,xmm1                ;xmm1 converted to 32 bits precision
        movapd  xmm3,xmm1                 ;xmm3 = scaled error, reused for every weight

        mulss   xmm1,Stretch6             ;Weight6 += error*Stretch6
        addss   xmm1,Weight6
        movss   Weight6,xmm1

        movapd  xmm1,xmm3
        mulss   xmm1,Stretch4             ;Weight4 += error*Stretch4
        addss   xmm1,Weight4
        movss   Weight4,xmm1

        movapd  xmm1,xmm3
        mulss   xmm1,Stretch3             ;Weight3 += error*Stretch3
        addss   xmm1,Weight3
        movss   Weight3,xmm1

        movapd  xmm1,xmm3
        mulss   xmm1,Stretch2             ;Weight2 += error*Stretch2
        addss   xmm1,Weight2
        movss   Weight2,xmm1

        movapd  xmm1,xmm3
        mulss   xmm1,Stretch1             ;Weight1 += error*Stretch1
        addss   xmm1,Weight1
        movss   Weight1,xmm1

        je      @@IF1                     ;MatchPrediction=2: skip update of Match weight
        movapd  xmm1,xmm3
        mulss   xmm1,StretchM             ;WeightM += error*StretchM
        addss   xmm1,WeightM
        movss   WeightM,xmm1
;Stretch: returns log2(p0) - log2(2^32 - p0) on the FPU stack.
;
;Entry points:
;  @@Stretch   edx = address of a 16-bit value (per the original note: the bit
;              history of an Order-x table); it is loaded and used as the index
;  @@Stretch2  edx = index, already loaded
;Other inputs:
;  edi = base of a dword table; each entry packs two 16-bit counters
;        (low word = frequency of bit 0, high word = frequency of bit 1)
;  ebx = base of a float table indexed by n0+n1
;        (per the original note it holds 2^32/TableCount - confirm)
;  esi = base of a float table indexed by the top 14 mantissa bits,
;        giving log2 of the mantissa
;  @@TwoPower32 = 2^32 as a 64-bit float
;Output:  ST(0) = result (float 32 bits); the same value is left in Temp
;Clobbers: eax, ecx, edx, xmm1, xmm2, xmm3, EFLAGS
;
;log2 of a positive float is taken as (unbiased exponent) + LUT[mantissa>>9].
;"shr eax,23" yields the biased exponent only while the sign bit is clear.
@@Stretch:
        movzx   edx,word ptr [edx]      ;edx = 16-bit index read through the pointer
@@Stretch2:
        mov     edx,[edi+edx*4]         ;edx = packed counters (n1:n0)
        movzx   eax,dx                  ;eax = n0, frequency of bit 0
        shr     edx,16                  ;edx = n1, frequency of bit 1
        movd    xmm1,eax
        lea     ecx,[eax+edx]           ;ecx = n0+n1
        cvtdq2ps xmm1,xmm1              ;xmm1 = n0 as float
        mulss   xmm1,[ecx*4+ebx]        ;xmm1 = p0 = n0 * table[n0+n1]
        cvtps2pd xmm2,xmm1              ;xmm2 = p0 (float 64 bits), kept for 2^32-p0

        ;--- log2(p0) -> xmm3
        movd    eax,xmm1                ;eax = raw bits of p0
        mov     edx,eax
        and     edx,7FFFFFH             ;edx = 23-bit mantissa
        shr     eax,23                  ;eax = biased exponent
        sub     eax,127                 ;eax = exponent
        shr     edx,9                   ;edx = mantissa reduced to 14 bits
        cvtsi2ss xmm3,eax               ;xmm3 = exponent (float)
        movd    xmm1,[edx*4+esi]        ;xmm1 = log2(mantissa)
        addss   xmm3,xmm1               ;xmm3 = log2(p0)

        ;--- log2(2^32-p0) -> xmm2
        movsd   xmm1,@@TwoPower32       ;xmm1 = 2^32 (float 64 bits)
        subsd   xmm1,xmm2               ;xmm1 = 2^32-p0
        cvtsd2ss xmm1,xmm1              ;narrow to float 32 bits
        movd    eax,xmm1                ;eax = raw bits of 2^32-p0
        mov     edx,eax
        and     edx,7FFFFFH             ;edx = 23-bit mantissa
        shr     eax,23                  ;eax = biased exponent
        sub     eax,127                 ;eax = exponent
        shr     edx,9                   ;edx = mantissa reduced to 14 bits
        cvtsi2ss xmm2,eax               ;xmm2 = exponent (float)
        movd    xmm1,[edx*4+esi]        ;xmm1 = log2(mantissa)
        addss   xmm2,xmm1               ;xmm2 = log2(2^32-p0)

        subss   xmm3,xmm2               ;xmm3 = log2(p0)-log2(2^32-p0)
        movss   Temp,xmm3               ;pass the result through memory...
        fld     dword ptr Temp          ;...and load it onto the FPU stack
        ret