I tested BiteRider's code. VS2022 doesn't like the macro statements.
For reference, SPrintf() manages ~100 MB/s on random xorshift64 unsigned long long values.
The naive 20-digit "divide by 10" loop achieves 287 MB/s and removes the leading zeros.
This "SWAR" algorithm is the fastest scalar version yet at ~973 MB/s, although it's still limited to 16 digits and keeps the leading zeros.
;==============================================================
;Integer to String using SWAR method. Entry point IntToChar_SWAR: RCX=num RDX=str
;==============================================================
;--------------------------------------------------------------
; EncodeTens - split two small integers into eight decimal digits,
;              one digit per byte, using 64-bit SWAR arithmetic.
; In  : rcx = hi, rdx = lo. The result is a clean digit string only
;       when both values are below 10000 (four digits each).
; Out : rax = eight bytes, each 0..9 (binary, not ASCII):
;             byte 0..3 = digits of hi, most significant first
;             byte 4..7 = digits of lo, most significant first
; Uses: rax, rcx, rdx, r8 and flags only; no stack, no other registers.
;--------------------------------------------------------------
EncodeTens proc ;rcx,rdx
        shl     rdx, 32
        or      rcx, rdx                ; merged = hi | (lo << 32), one value per 32-bit lane

        ; Divide both lanes by 100 at once: 20972 / 2^21 == 10486 / 2^20 ~= 1/100.
        imul    rax, rcx, 20972
        shr     rax, 21
        mov     r8, 7F0000007Fh         ; (0x7F << 32) | 0x7F: drop bits leaked from the other lane
        and     rax, r8                 ; top = per-lane quotient by 100

        imul    rdx, rax, 100
        sub     rcx, rdx                ; bottom = per-lane remainder by 100

        shl     rcx, 16
        add     rcx, rax                ; hundreds = four base-100 digits, one per 16-bit lane

        ; Divide the four 16-bit lanes by 10 at once: 103 / 2^10 ~= 1/10.
        imul    rax, rcx, 103
        shr     rax, 10
        mov     r8, 000F000F000F000Fh   ; keep the 4-bit quotient of every lane
        and     rax, r8                 ; tens = per-lane quotient by 10

        lea     rdx, [rax+rax*4]
        add     rdx, rdx                ; rdx = 10 * tens
        sub     rcx, rdx                ; ones = per-lane remainder by 10
        shl     rcx, 8                  ; ones move into the odd bytes
        add     rax, rcx                ; tens in even bytes, ones in odd bytes
        ret
EncodeTens endp
;--------------------------------------------------------------
; IntToChar_SWAR - write an unsigned integer as 16 ASCII decimal digits.
; In  : rcx = num. Only values below 10^16 convert correctly: each
;             8-digit half is split with a divide-by-10000 reciprocal
;             and EncodeTens handles four digits per lane.
;       rdx = str, destination buffer of at least 16 bytes.
; Out : [str+0 .. str+15] = digits, most significant first, padded on
;       the left with '0'. No terminating NUL is written.
;       rax = the qword stored at [str+8] (ASCII of the low 8 digits).
; Uses: rax, rcx, rdx, r8, r9, r10, r11, flags. rsp is not modified.
; Note: relies on EncodeTens touching only rax, rcx, rdx and r8, so that
;       r9 (low half), r10 (ASCII bias) and r11 (str) survive the calls.
;--------------------------------------------------------------
IntToChar_SWAR proc
        mov     r11, rdx                        ; r11 = str (rdx is overwritten by mul)
        mov     r9, rcx                         ; r9  = num

        ; num / 10^8 via reciprocal multiplication. Plain MUL is used because
        ; only the high half of the product is needed and MUL runs on every
        ; x86-64 CPU (MULX would require BMI2).
        mov     rax, 12379400392853802749       ; ceil(2^90 / 10^8)
        mul     rcx                             ; rdx = high 64 bits of num * reciprocal
        shr     rdx, 26                         ; rdx = top = num / 10^8
        imul    rax, rdx, 100000000
        sub     r9, rax                         ; r9 = bottom = num mod 10^8

        mov     r10, 3030303030303030h          ; ASCII '0' in every byte

        ; Upper eight digits -> str[0..7]
        mov     ecx, 3518437209                 ; ceil(2^45 / 10^4)
        imul    rcx, rdx
        shr     rcx, 45                         ; rcx = top / 10^4
        imul    eax, ecx, 10000
        sub     edx, eax                        ; rdx = top mod 10^4 (32-bit op zero-extends)
        call    EncodeTens                      ; rax = eight binary digits, one per byte
        add     rax, r10                        ; binary digits -> ASCII
        mov     qword ptr [r11], rax

        ; Lower eight digits -> str[8..15]
        mov     ecx, 3518437209                 ; ceil(2^45 / 10^4)
        imul    rcx, r9
        shr     rcx, 45                         ; rcx = bottom / 10^4
        imul    eax, ecx, 10000
        mov     edx, r9d
        sub     edx, eax                        ; rdx = bottom mod 10^4
        call    EncodeTens
        add     rax, r10
        mov     qword ptr [r11+8], rax
        ret
IntToChar_SWAR endp
;==============================================================