Hi Rui,
You are right, this is not what we want.  :(
The only logical thing I can think of right now is that the 8 digit calculation is not enough to cover the 32bit floating-point rounding phenomena.
Have to think this all over, suggestions are welcome of course.
Hi Siekmanski,
It seems that you need to study the problem, or you could try another algorithm to get the digits. There seems to be a problem in deciding when it prints the string in scientific format and when it doesn't. When we multiply -12345.678 by -123456.78 it gives 000000005, but the result should be 1.5241577E+9 (last digit rounded). So I think you need some time and a little bit of work. Try another algorithm.  :t

Tomorrow I'll try to do something else; my purpose was to write it in SSE only, with 32-bit float calculations (max 8 digits).
1.5241577E+7 fits inside a real4, but 1.5241577E+9 doesn't.

AFAIK the largest number that fits inside a real4 is 16777215 (24bit), maybe I'm wrong here?
If so I need to do calculations for more than 8 digits.
You are not right: in real4 the decimal exponent goes from -38 to +38. So the converter should show numbers from d.xxxxxxE-38 to d.xxxxxxE+38 (see simplyFPU). So if it shows 1.5241577E+9, that is far from the limit. You don't need to do calculations for more than 8 digits, but you do need to decode the exponent part. It seems that you don't do that.

Thanks Rui,

Now I know what to do.  :t
The maximum-digits figure is a bit misleading, as it doesn't correspond directly to the actual output, which may be longer than the number of digits used in the calculation. Here's a simple conversion which shifts out the integer part and reduces the actual number to 32 bits by adding zeros.
Code: [Select]
`    ; IEEE-754 single-precision (REAL4) layout constants.
    F_SIGBITS   equ 24                          ; significand bits incl. the implicit leading 1
    F_EXPBITS   equ 8                           ; exponent field width
    F_EXPMASK   equ (1 shl F_EXPBITS) - 1       ; 0xFF
    F_EXPBIAS   equ F_EXPMASK shr 1             ; 127
    F_EXPMAX    equ F_EXPMASK - F_EXPBIAS       ; 128
    BUFFERSIZE  equ 256                         ; size of the local scratch buffer

;-----------------------------------------------------------------------
; ftoa - convert a REAL4 to a zero-terminated decimal string
;        (fixed notation, up to 7 fractional digits).
;
; In:    string = destination buffer
;        float  = value to convert (raw bits are inspected directly)
; Out:   eax    = string
; Saved: esi, edi, ebx (via `uses`); ecx, edx, xmm0, xmm1 are overwritten
;
; The text is built backwards from the end of a local buffer and then
; copied to `string`.  The copy always runs up to the end of the local
; buffer, so when a zero fraction is shortened to ".0" a few bytes after
; the terminator are copied as well.
;
; The bit patterns listed in the .switch are not formatted as numbers;
; they produce "+NaN" or "-NaN" according to the sign bit.
;-----------------------------------------------------------------------
ftoa proc uses esi edi ebx string:ptr, float:real4

  local buffer[BUFFERSIZE]:byte

    mov eax,float
    .repeat                                     ; single pass; .break leaves the block early
        .switch eax
          .case 0xFFFFFFFF  ; floating-point exceptions
          .case 0xFFC00001
          .case 0xFFBFFFFF
          .case 0xFF800001
          .case 0xFFC00000
          .case 0xFF800000
          .case 0xFF7FFFFF
          .case 0x7F7FFFFF
          .case 0x7F800000  ; +Infinity: was missing although 0xFF800000 (-Infinity) is listed
          .case 0x7F800001
          .case 0x7FBFFFFF
          .case 0x7FC00000
          .case 0x7FFFFFFF
            mov ecx,'NaN+'                      ; stored as the bytes "+NaN"
            lea esi,buffer[BUFFERSIZE-5]
            .if eax & 0x80000000
                mov cl,'-'                      ; first character becomes '-'
            .endif
            mov [esi],ecx
            mov buffer[BUFFERSIZE-1],0
            .break
        .endsw

        and eax,0x7FFFFFFF                      ; work on the absolute value
        movd xmm0,eax                           ; xmm0 = |float|
        mov ecx,eax
        shr ecx,F_SIGBITS-1                     ; ecx = biased exponent
        xor edx,edx ; integer value
        xor ebx,ebx ; n * '0'

        .if ecx >= F_EXPBIAS                    ; |float| >= 1.0: there is an integer part
            sub ecx,F_EXPBIAS-1                 ; ecx = number of integer bits
            shl eax,F_EXPBITS
            or  eax,0x80000000                  ; eax = significand, left-aligned, leading 1 set
            .if cl >= 32
                ; Integer part does not fit in 32 bits: divide edx:eax by 10
                ; bit by bit (esi = running remainder) until the quotient
                ; fits in eax.  ebx counts the divisions, i.e. the number of
                ; low digits later emitted as '0'.  Remainders are discarded.
                ; NOTE(review): when ecx > 64, quotient bits shifted out of
                ; edx re-enter esi - confirm behaviour for very large exponents.
                mov edx,eax ; shift left 32
                xor eax,eax
                .repeat
                    xor esi,esi
                    .repeat
                        shl eax,1
                        rcl edx,1
                        rcl esi,1
                        .if esi >= 10
                            sub esi,10
                            inc eax
                        .endif
                    .untilcxz
                    mov ecx,64                  ; later passes process all 64 bits
                    inc ebx
                .until !edx
                cvtsi2ss xmm1,eax               ; round the quotient through REAL4 precision
                cvtss2si edx,xmm1               ; edx = leading integer digits
                subss xmm0,xmm0                 ; no fractional part in this range
            .else
                shld edx,eax,cl                 ; edx = integer part (top cl bits of significand)
                cvtsi2ss xmm1,edx
                subss xmm0,xmm1                 ; xmm0 = fractional part
            .endif
        .endif

        push ebx                                ; save count of padding zeros
        push edx                                ; save integer part

        ; Fraction: scale by 10^7 and emit exactly 7 digits, last digit first.
        mov  eax,10000000.0
        movd xmm1,eax
        mulss xmm0,xmm1
        cvtss2si eax,xmm0
        lea esi,buffer[BUFFERSIZE-1]
        mov byte ptr [esi],0                    ; string terminator
        mov ecx,10                              ; divisor, also used for the integer digits below
        mov ebx,7                               ; bl = digit counter, bh = OR of all digit values
        .repeat
            xor edx,edx
            div ecx
            dec esi
            or  bh,dl
            add dl,'0'
            mov [esi],dl
            dec bl
        .untilz
        dec esi
        mov byte ptr [esi],'.'
        .if !bh                                 ; all fractional digits were zero
            mov byte ptr [esi+2],0              ; keep only ".0"
        .endif

        pop eax                                 ; integer part
        pop ebx                                 ; number of padding zeros
        .if eax
            .while ebx
                dec esi
                mov byte ptr [esi],'0'
                dec ebx
            .endw
            .repeat
                xor edx,edx
                div ecx
                dec esi
                add dl,'0'
                mov [esi],dl
            .until !eax
        .else
            dec esi
            mov byte ptr [esi],'0'              ; integer part is zero
        .endif

        .if byte ptr float[3] & 0x80            ; sign bit of the original argument
            dec esi
            mov byte ptr [esi],'-'
        .endif
    .until 1

    ; Copy from esi up to the end of the local buffer into the caller's buffer.
    mov edi,string
    lea ecx,buffer[BUFFERSIZE]
    sub ecx,esi
    mov eax,edi                                 ; return value = string
    rep movsb
    ret

ftoa endp`
This will round the numbers similar to sprintf and have more or less the same limitations.

Code: [Select]
`  local buffer[64]:sbyte

    ; Format a series of sample values with ftoa into the same local buffer
    ; and print each result next to the literal that was passed in.
    lea edi,buffer
    printf("%s (1.0)\n", ftoa(edi, 1.0))
    printf("%s (333.3)\n", ftoa(edi, 333.3))
    printf("%s (1.6777216)\n", ftoa(edi, 1.6777216))
    printf("%s (-9999999999.9)\n", ftoa(edi, -9999999999.9))
    printf("%s (100000000000.0)\n", ftoa(edi, 100000000000.0))
    printf("%s (-99999999999999999999.9)\n", ftoa(edi, -99999999999999999999.9))
    printf("%s (-100000000000000000000.0)\n", ftoa(edi, -100000000000000000000.0))
    printf("%s (-1.0e20)\n", ftoa(edi, -1.0e20))
    ; The last two arguments are raw integer bit patterns, not float literals.
    printf("%s (0x7FBFFFFF)\n", ftoa(edi, 0x7FBFFFFF))
    printf("%s (-1)\n", ftoa(edi, -1))
    ret`
1.0 (1.0)
333.2999878 (333.3)
1.6777216 (1.6777216)
-10000000000.0 (-9999999999.9)
100000000000.0 (100000000000.0)
-100000000000000000000.0 (-99999999999999999999.9)
-100000000000000000000.0 (-100000000000000000000.0)
-100000000000000000000.0 (-1.0e20)
+NaN (0x7FBFFFFF)
-NaN (-1)

I added it to the test and increased the buffer to 64. Apparently ASMC does not accept movaps xmm0,m64, so a few other changes were added. Well, it is obviously slower than the SSE version but somewhat faster than sprintf.
Code: [Select]
`SIMD Real4 to ASCII conversion by Siekmanski 2018.1000000 calls per Run for the Cycle counter and the Routine timer.Intel(R) Core(TM) i5-6500T CPU @ 2.50GHz Routine timers starting now....ftoa          Cycles: 130 RoutineTime: 0.053859066 secondsReal4_2_ASCII Cycles: 24 RoutineTime: 0.010993086 secondssprintf       Cycles: 1124 RoutineTime: 0.451840946 secondsResult ftoa         : 1.6777216Result Real4_2_ASCII:  1.6777216Result sprintf      : 1.677722`

Thanks nidud,  :t

My mistake was that I thought there would be no more than 8 digits in the largest value.
I misunderstood the real4 format.

Thanks to Rui I'm a bit wiser now.

So far I tested this in masm to see what the largest real4 value would be:

masm real4 3.40282356E+38     maximum input for a real4 value
sprintf    340282306073709650000000000000000000000.000000 39:6 digits this is the result from sprintf
sprintf    3.40282e+038 scientific notation

The maximum possible number of digits before the decimal point is 39, of which the 7 most significant digits are reliable values; the following digits are just garbage (but still need to be counted as digits to present the number), and the rest are just zeros.
sprintf, prints the first 7-8 digits, then followed by 9 or 10 garbage numbers, the rest are zeros.

If the number fits as a whole in 8 digits, i'll print it as such else, I will print it as scientific notation with 8 digits.
Hi Siekmanski,
>> If the number fits as a whole in 8 digits, i'll print it as such else, I will print it as scientific notation with 8 digits. (which means 7 decimal places)

Very well, seems to be a good decision (we dont need to see garbage) :t

In the previous sources I calculated with 8 digits which causes the occasional rounding errors.
And I didn't have enough knowledge of the internal workings of the floating-point format.

In this new routine 7 digits are used for the calculations and does the job without errors ( so far as I have tested it, no errors occurred ).
And it now covers the whole range -1.175494E-38 to 3.402823E+38

I'm still not happy with the speed of the maximum digits count routine.
It now uses a fast Log10(x)+1 approximation routine but, it needs a few checks to get the exact number of digits from the float.
For now it only prints in scientific notation, but it's a fast one and works without memory swaps to insert the decimal point when constructing the string.
The decimal point is now integrated into the ascii converter constant.

I'll continue and try to write the fastest possible float to ascii routine.
I still have another idea to write a maximum digits count routine in a totally other way and hope it will be faster than the Log10(x)+1 approach.
Next week I'll start coding it and will see if it is faster or not.
Will be continued.

I have posted the fully commented source code in the first post. http://masm32.com/board/index.php?topic=7441.msg81351#msg81351

Code: [Select]
`SIMD Real4 to ASCII conversion by Siekmanski 2018.1000000 calls per Run for the Cycle counter and the Routine timer.Intel(R) Core(TM) i7-4930K CPU @ 3.40GHz Routine timers starting now....Real4_2_ASCII Cycles: 69 RoutineTime: 0.022400193 secondssprintf       Cycles: 1955 RoutineTime: 0.600757429 secondsResult Real4_2_ASCII:  1.234567e+14Result sprintf      : 1.234567e+014Press any key to continue...`
Code: [Select]
`SIMD Real4 to ASCII conversion by Siekmanski 2018.1000000 calls per Run for the Cycle counter and the Routine timer.Intel(R) Core(TM) i5-4210U CPU @ 1.70GHz Routine timers starting now....Real4_2_ASCII Cycles: 61 RoutineTime: 0.028886304 secondssprintf       Cycles: 1866 RoutineTime: 3.438464466 secondsResult Real4_2_ASCII:  1.234567e+14Result sprintf      : 1.234567e+014Press any key to continue...`

Code: [Select]
`AMD A6-3500 APU with Radeon(tm) HD GraphicsReal4_2_ASCII Cycles: 98 RoutineTime: 0.049566970 secondssprintf       Cycles: 2630 RoutineTime: 1.251230629 seconds` :t

Rui found a typo in the floating-point exceptions list for the +Infinity message.

change 0FF800000h to 07F800000h
Should be:

cmp         eax,07F800000h
je            message_Inf

These lines are easier to read:
Code: [Select]
`    ; Dispatch on floating-point special values.
    ; check: compare eax against one bit pattern and jump to the matching
    ; handler label when they are equal (eax is loaded outside this snippet).
    check macro bits, target
        cmp         eax, bits
        je          target
    endm

    ; negative patterns
    check 0FFFFFFFFh, message_QnegNaN
    check 0FFC00001h, message_QnegNaN
    check 0FFBFFFFFh, message_SnegNaN
    check 0FF800001h, message_SnegNaN
    check 0FFC00000h, message_Indeterm
    check 0FF800000h, message_NegInf
    check 0FF7FFFFFh, message_NegNorm
    ; positive patterns
    check 07F7FFFFFh, message_Norm
    check 07F800000h, message_Inf
    check 07F800001h, message_SNaN
    check 07FBFFFFFh, message_SNaN
    check 07FC00000h, message_QNaN
    check 07FFFFFFFh, message_QNaN`

These lines are easier to read:
Code: [Select]
`    ; Dispatch on floating-point special values.
    ; check: compare eax against one bit pattern and jump to the matching
    ; handler label when they are equal (eax is loaded outside this snippet).
    check macro bits, target
        cmp         eax, bits
        je          target
    endm

    ; negative patterns
    check 0FFFFFFFFh, message_QnegNaN
    check 0FFC00001h, message_QnegNaN
    check 0FFBFFFFFh, message_SnegNaN
    check 0FF800001h, message_SnegNaN
    check 0FFC00000h, message_Indeterm
    check 0FF800000h, message_NegInf
    check 0FF7FFFFFh, message_NegNorm
    ; positive patterns
    check 07F7FFFFFh, message_Norm
    check 07F800000h, message_Inf
    check 07F800001h, message_SNaN
    check 07FBFFFFFh, message_SNaN
    check 07FC00000h, message_QNaN
    check 07FFFFFFFh, message_QNaN`

:t
Hi.

Code: [Select]
`= = =Redirect to file.SIMD Real4 to ASCII conversion by Siekmanski 2018.1000000 calls per Run for the Cycle counter and the Routine timer.Intel(R) Pentium(R) M processor 1.70GHz Routine timers starting now....Real4_2_ASCII Cycles: 106 RoutineTime: 0.082408239 secondssprintf       Cycles: 4925 RoutineTime: 3.293436177 secondsResult Real4_2_ASCII:  1.234567e+14Result sprintf      : 1.234567e+014Press any key to continue...= = =Screen CaptureF:\TEMP\TEST>REAL4_2_.EXESIMD Real4 to ASCII conversion by Siekmanski 2018.1000000 calls per Run for the Cycle counter and the Routine timer.Intel(R) Pentium(R) M processor 1.70GHz Routine timers starting now....Real4_2_ASCII Cycles: 107 RoutineTime: 0.066058396 secondssprintf       Cycles: 4941 RoutineTime: 2.925534111 secondsResult Real4_2_ASCII:  1.234567e+14Result sprintf      : 1.234567e+014Press any key to continue...= = =SIMD Real4 to ASCII conversion by Siekmanski 2018.1000000 calls per Run for the Cycle counter and the Routine timer.Intel(R) Core(TM) i3-4005U CPU @ 1.70GHz Routine timers starting now....Real4_2_ASCII Cycles: 68 RoutineTime: 0.043186625 secondssprintf       Cycles: 2923 RoutineTime: 1.514711201 secondsResult Real4_2_ASCII:  1.234567e+14Result sprintf      : 1.234567e+014Press any key to continue...`
Some timing difference between a redirect results to a file
and a screen capture of the results.

HTH,

Steve N.