The max-digits measure is a bit misleading, as it doesn't correspond directly to the actual output, which may contain more digits than were used in the calculation. Here's a simple conversion which shifts out the integer part and reduces the actual number to 32 bits, padding the output with zeros.
; Constants used by ftoa to pick apart its REAL4 argument, plus the size
; of its work buffer.

; ftoa shifts right by F_SIGBITS-1 (23) to bring the exponent field to bit 0.
F_SIGBITS equ 24
; ftoa shifts left by F_EXPBITS to move the significand to the top of the register.
F_EXPBITS equ 8
; All-ones exponent field: (1 shl 8) - 1 = 255.
F_EXPMASK equ (1 shl F_EXPBITS) - 1
; 255 shr 1 = 127; ftoa treats a biased exponent >= this as "has an integer part".
F_EXPBIAS equ F_EXPMASK shr 1
; 255 - 127 = 128; not referenced by the code visible in this listing.
F_EXPMAX equ F_EXPMASK - F_EXPBIAS
; Size in bytes of the local scratch buffer ftoa builds the text in.
BUFFERSIZE equ 256
;-----------------------------------------------------------------------
; ftoa - convert a REAL4 to a zero-terminated decimal string
;
; In:    string = destination buffer; receives the text and its
;                 terminating zero (the caller must make it large enough)
;        float  = value to convert (examined as its 32-bit pattern)
; Out:   eax    = string
; Regs:  esi, edi, ebx are saved/restored by the `uses` clause;
;        ecx, edx, xmm0, xmm1 and the flags are modified.
;
; The text is built backwards from the end of a local scratch buffer and
; then copied to `string`:
;   * special values print as "+NaN" / "-NaN";
;   * otherwise [-]<integer digits>.<7 fraction digits>, shortened to
;     ".0" when all seven fraction digits are zero.
;
; Special values are detected by magnitude: every pattern whose absolute
; value is >= 0x7F7FFFFF (FLT_MAX, +/-Inf and all NaN payloads) takes the
; "NaN" path.  This is a superset of the bit patterns that were
; previously listed one by one, so +Inf and NaNs with arbitrary payloads
; no longer fall through into the digit conversion.
;-----------------------------------------------------------------------
ftoa proc uses esi edi ebx string:ptr, float:real4
    local buffer[BUFFERSIZE]:byte

    mov eax,float
    .repeat                             ; executed once; .break skips the conversion

        ;
        ; FLT_MAX, infinities and NaNs
        ;
        mov ecx,eax
        and ecx,0x7FFFFFFF              ; magnitude without the sign bit
        .if ecx >= 0x7F7FFFFF
            mov ecx,'NaN+'              ; low byte holds the sign character
            lea esi,buffer[BUFFERSIZE-5]
            .if eax & 0x80000000
                mov cl,'-'
            .endif
            mov [esi],ecx
            mov buffer[BUFFERSIZE-1],0
            .break
        .endif

        ;
        ; integer part -> edx, number of trailing '0' digits -> ebx,
        ; fraction left in xmm0
        ;
        and eax,0x7FFFFFFF
        movd xmm0,eax                   ; xmm0 = |value|
        mov ecx,eax
        shr ecx,F_SIGBITS-1             ; ecx = biased exponent
        xor edx,edx ; integer value
        xor ebx,ebx ; n * '0'
        .if ecx >= F_EXPBIAS            ; |value| >= 1.0
            sub ecx,F_EXPBIAS-1         ; ecx = number of integer bits
            shl eax,F_EXPBITS           ; significand to the top of eax ...
            or eax,0x80000000           ; ... with the implicit leading 1 set
            .if cl >= 32
                ; Integer part does not fit in 32 bits: divide it by 10
                ; (shift-and-subtract, remainder in esi is dropped) until
                ; the quotient fits in eax; ebx counts the dropped digits,
                ; which are later printed as '0'.
                ; NOTE(review): looks reliable only while the first
                ; quotient fits in 64 bits - confirm for very large inputs.
                mov edx,eax ; shift left 32
                xor eax,eax
                .repeat
                    xor esi,esi
                    .repeat
                        shl eax,1
                        rcl edx,1
                        rcl esi,1
                        .if esi >= 10
                            sub esi,10
                            inc eax     ; set the quotient bit just shifted in
                        .endif
                    .untilcxz           ; ecx passes: integer bits first, then 64
                    mov ecx,64
                    inc ebx
                .until !edx
                cvtsi2ss xmm1,eax       ; pass the remaining digits through
                cvtss2si edx,xmm1       ; single precision
                subss xmm0,xmm0         ; no fraction at this magnitude
            .else
                shld edx,eax,cl         ; edx = top cl bits = integer part
                cvtsi2ss xmm1,edx
                subss xmm0,xmm1         ; xmm0 = fraction
            .endif
        .endif
        push ebx
        push edx

        ;
        ; seven fraction digits
        ;
        mov eax,10000000.0
        movd xmm1,eax
        mulss xmm0,xmm1
        cvtss2si eax,xmm0               ; eax = fraction scaled by 10^7
        lea esi,buffer[BUFFERSIZE-1]
        mov byte ptr [esi],0
        mov ecx,10
        mov ebx,7                       ; bl = digits left, bh = OR of all digits
        .repeat
            xor edx,edx
            div ecx
            dec esi
            or bh,dl
            add dl,'0'
            mov [esi],dl
            dec bl
        .untilz
        dec esi
        mov byte ptr [esi],'.'
        .if !bh                         ; fraction is all zeros: keep only ".0"
            mov byte ptr [esi+2],0
        .endif

        ;
        ; integer digits (ecx is still 10)
        ;
        pop eax                         ; integer value pushed from edx
        pop ebx                         ; number of trailing zeros
        .if eax
            .while ebx
                dec esi
                mov byte ptr [esi],'0'
                dec ebx
            .endw
            .repeat
                xor edx,edx
                div ecx
                dec esi
                add dl,'0'
                mov [esi],dl
            .until !eax
        .else
            dec esi
            mov byte ptr [esi],'0'
        .endif
        .if byte ptr float[3] & 0x80    ; sign bit of the original argument
            dec esi
            mov byte ptr [esi],'-'
        .endif
    .until 1

    ;
    ; copy the text, including its terminator, to the caller's buffer
    ;
    mov edi,string
    lea ecx,buffer[BUFFERSIZE]
    sub ecx,esi                         ; bytes from esi to the end of buffer
    mov eax,edi                         ; return value
    rep movsb
    ret
ftoa endp
This rounds the numbers similarly to sprintf and has more or less the same limitations.
; Test fragment: each line converts one value with ftoa into the local
; 64-byte buffer (address kept in edi) and prints the resulting string,
; followed in parentheses by the value as it was written in the source.
local buffer[64]:sbyte
lea edi,buffer
; ordinary values
printf("%s (1.0)\n", ftoa(edi, 1.0))
printf("%s (333.3)\n", ftoa(edi, 333.3))
printf("%s (1.6777216)\n", ftoa(edi, 1.6777216))
; values with large integer parts
printf("%s (-9999999999.9)\n", ftoa(edi, -9999999999.9))
printf("%s (100000000000.0)\n", ftoa(edi, 100000000000.0))
printf("%s (-99999999999999999999.9)\n", ftoa(edi, -99999999999999999999.9))
printf("%s (-100000000000000000000.0)\n", ftoa(edi, -100000000000000000000.0))
printf("%s (-1.0e20)\n", ftoa(edi, -1.0e20))
; integer arguments: presumably passed through as raw 32-bit patterns
; (0x7FBFFFFF, and -1 = 0xFFFFFFFF), both of which ftoa reports as NaN
printf("%s (0x7FBFFFFF)\n", ftoa(edi, 0x7FBFFFFF))
printf("%s (-1)\n", ftoa(edi, -1))
ret
1.0 (1.0)
333.2999878 (333.3)
1.6777216 (1.6777216)
-10000000000.0 (-9999999999.9)
100000000000.0 (100000000000.0)
-100000000000000000000.0 (-99999999999999999999.9)
-100000000000000000000.0 (-100000000000000000000.0)
-100000000000000000000.0 (-1.0e20)
+NaN (0x7FBFFFFF)
-NaN (-1)
I added it to the test and increased the buffer to 64. Apparently ASMC does not accept movaps xmm0,m64, so a few other changes were made. Well, it is obviously slower than the SSE version but somewhat faster than sprintf.
SIMD Real4 to ASCII conversion by Siekmanski 2018.
1000000 calls per Run for the Cycle counter and the Routine timer.
Intel(R) Core(TM) i5-6500T CPU @ 2.50GHz
Routine timers starting now....
ftoa Cycles: 130 RoutineTime: 0.053859066 seconds
Real4_2_ASCII Cycles: 24 RoutineTime: 0.010993086 seconds
sprintf Cycles: 1124 RoutineTime: 0.451840946 seconds
Result ftoa : 1.6777216
Result Real4_2_ASCII: 1.6777216
Result sprintf : 1.677722