Hello all,
I have hunted around for a qword integer to ascii converter in vain. I know that once upon a time Raymond created something he called qwta, but I've been unable to find it. No doubt someone else has taken the trouble to do this too, but I've come up empty handed.
Thanks,
Mark Allyn
that's probably my favorite algo to work on - and see the code of others :P
a lot of time is spent on it because it is fun and interesting
here is one for unsigned integers by Drizz that is pretty fast...
here is an update by Drizz..
comment #
- fixed point arithmetic conversion of Unsigned 64 integer to string -
- Multiply 64bit unsigned integer by 2^127/10^19
- make correction to prevent precision loss
- first digit will be at bit offset 127 (0 or 1)
- [1]8446744073709551615, first digit can only be 0 or 1
- after handling the first digit, we get subsequent digits multiplying by 10
- the "point" is moved to end so all other digits are constructed from shifted "out" bits (multiplying by 10)
#
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
_U64ToStrNLZ proc val:QWORD, pbuff:PTR BYTE
__locals equ 4*4+4*4+4;;//4*4=regs + 4*4=128bit temp
__saveregs equ <dword ptr [esp]>
__128bittmp equ <dword ptr [esp+4*4]>
__NLZmask equ <dword ptr [esp+4*4+4*4]>
__val_hi equ <dword ptr [esp+__locals+2*4]>
__val_lo equ <dword ptr [esp+__locals+1*4]>
__pbuff equ <dword ptr [esp+__locals+3*4]>
__cnst_hi equ 0EC1E4A7Dh
__cnst_lo equ 0B69561A5h
__correction_lo equ 0E30437FBh
__correction_hi equ 0AD7B4F1Bh
sub esp,__locals
mov __saveregs[0*4],ebp
mov __saveregs[1*4],esi
mov __saveregs[2*4],edi
mov __saveregs[3*4],ebx
;;// 64bit x 64bit = 128bit result
mov eax,__cnst_lo;//__y_lo; = b0
mul __val_lo;;// get a0*b0 = d1:d0
mov esi,eax ;mov __128bittmp[0*4],eax;;//d0
mov ecx,edx;;//d1
mov eax,__cnst_lo;//__y_lo; = b0
xor ebx,ebx
mul __val_hi;;// get a1*b0 = e1:e0
add ecx,eax;;//e0
adc ebx,edx;;//e1
mov eax,__cnst_hi;//__y_hi; =b1
mul __val_lo;;// get a0*b1 = f1:f0
add ecx,eax;;//f0
adc ebx,edx;;//f1
mov edi,ecx;mov __128bittmp[1*4],ecx
mov ecx,0
mov eax,__cnst_hi;//__y_hi; =b1
adc ecx,ecx
mul __val_hi;;// get a1*b1 = g1:g0
add eax,ebx;;//g0
adc edx,ecx;;//g1
mov ebp,__pbuff
;// -------------------------------
;// -------------------------------
xor ebx,ebx
add esi,__correction_lo
adc edi,__correction_hi
adc eax,ebx
adc edx,ebx
;// first digit 0 or 1, zero not written
mov ebx,edx
sar ebx,31
lea ecx,[ebx+'2']
mov byte ptr [ebp],cl
sub ebp,ebx
mov __NLZmask,ebx
;// second digit, 128 bits needed
;// shift 2:
;// - account for the first digit
;// - multiply by 2
shld ebx,edx,2
shld edx,eax,2
shld eax,edi,2
shld edi,esi,2
shl esi,2
and ebx,1; only bit 0
;// mul by 5
mov __128bittmp[0*4],esi
mov __128bittmp[1*4],edi
mov __128bittmp[2*4],eax
mov __128bittmp[3*4],edx
mov ecx,ebx
shld ebx,edx,2
shld edx,eax,2
shld eax,edi,2
shld edi,esi,2
shl esi,2
add esi,__128bittmp[0*4]
adc edi,__128bittmp[1*4]
adc eax,__128bittmp[2*4]
adc edx,__128bittmp[3*4]
adc ecx,ebx; X*2 + X*5
cmp ecx,1
sbb ebx,ebx
xor ebx,-1
add ecx,'0'
or __NLZmask,ebx
mov byte ptr [ebp],cl
sub ebp,__NLZmask
;// third digit, 128 bits needed
xor ebx,ebx
add esi,esi
adc edi,edi
adc eax,eax
adc edx,edx
adc ebx,ebx
mov __128bittmp[0*4],esi
mov __128bittmp[1*4],edi
mov __128bittmp[2*4],eax
mov __128bittmp[3*4],edx
mov ecx,ebx
shld ebx,edx,2
shld edx,eax,2
shld eax,edi,2
shld edi,esi,2
shl esi,2
add esi,__128bittmp[0*4]
adc edi,__128bittmp[1*4]
adc eax,__128bittmp[2*4]
adc edx,__128bittmp[3*4]
adc ecx,ebx
cmp ecx,1
sbb ebx,ebx
xor ebx,-1
add ecx,'0'
or __NLZmask,ebx
mov byte ptr [ebp],cl
sub ebp,__NLZmask
;// mul by 10 the rest not using lower qword anymore
;184
; 46744073709551615
REPT 16
xor ecx,ecx
add eax,eax;
adc edx,edx;
adc ecx,ecx
mov esi,eax;
mov edi,edx;
mov ebx,ecx
shld ecx,edx,2;
shld edx,eax,2;
shl eax,2;
add eax,esi;
adc edx,edi;
adc ecx,ebx
cmp ecx,1
sbb ebx,ebx
xor ebx,-1
add ecx,'0'
or __NLZmask,ebx
mov byte ptr [ebp],cl
sub ebp,__NLZmask
ENDM
xor ecx,ecx
add eax,eax;
adc edx,edx;
adc ecx,ecx
mov esi,eax;
mov edi,edx;
mov ebx,ecx
shld ecx,edx,2;
shld edx,eax,2;
shl eax,2;
add eax,esi;
adc edx,edi;
adc ecx,ebx
cmp ecx,1
sbb ebx,ebx
xor ebx,-1
add ecx,'0'
mov edx,__pbuff
or __NLZmask,ebx
mov word ptr [ebp],cx
sub ebp,__NLZmask
mov eax,ebp
;// -------------------------------
mov ebp,__saveregs[0*4]
mov esi,__saveregs[1*4]
mov edi,__saveregs[2*4]
mov ebx,__saveregs[3*4]
add esp,__locals
sub eax,edx
ret 8+4
_U64ToStrNLZ endp
and another update by Drizz...
;*****************************************************************
;
; uint64-to-string:
;
; split the uint64 value to two 10-digit numbers by magic divider method
; if possible use uint32-to-string nested subroutine, upper 10-digits will
; always fit to 32bit, but lower 10-digit must be adjusted dividing by 10
; (again with magic divider method), then the remainder can be passed to
; uint32-to-string.
;
; uint32-to-string:
;
; split the uint32 value to two 5-digit numbers by magic divider method,
; then scale the 5-digit part by 2^32/10000 (68DB9h) to get decimal
; digit in edx (upper dword) after multiplication, subsequent digits
; are got multiplying by 10
;
;*****************************************************************
New_U64ToStr proc Value:QWORD, lpszBuffer:DWORD
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
locals = 4*4
sub esp,locals
mov [esp+0*4],ebp
mov [esp+1*4],esi
mov [esp+2*4],edi
mov [esp+3*4],ebx
mov esi,[esp+1*4][locals]; a0
mov edi,[esp+2*4][locals]; a1
mov ebp,[esp+3*4][locals]; lpszBuffer
test edi,edi
jnz @F
call DOFULL32NLZ
mov eax,ebp; lpszBuffer
mov ebp,[esp+0*4]
mov esi,[esp+1*4]
mov edi,[esp+2*4]
mov ebx,[esp+3*4]
add esp,locals
sub eax,[esp+3*4]
ret 3*4
@@:
_mul_64x64_top64 0BDEDD5BFh, 0DBE6FECEh; /10000000000
shr edi,1
mov esi,edi
jz TOP10ZERO
call DOFULL32NLZ
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; *1000000000000 == 02_540BE400h
;_mul_32x64_no_overflow
mov eax,0540BE400h; = b0
mul edi
lea edx,[2*edi+edx]; hidword is 2
;endm
mov esi,[esp+1*4][locals]; a0
mov edi,[esp+2*4][locals]; a1
sub esi,eax
sbb edi,edx
mov [esp+1*4][locals],esi; a0
mov [esp+2*4][locals],edi; a1
mov ebx,esi; a0
jnz @F
; the rest also fits to 32bit **
call DOFULL32LZ
mov eax,ebp; lpszBuffer
mov ebp,[esp+0*4]
mov esi,[esp+1*4]
mov edi,[esp+2*4]
mov ebx,[esp+3*4]
add esp,locals
sub eax,[esp+3*4]
ret 3*4
; ** If it does not there will be no leading zero
; so below code also works in this case
TOP10ZERO:
mov esi,[esp+1*4][locals]; a0
mov edi,[esp+2*4][locals]; a1
mov ebx,esi
@@:
add esi,1
adc edi,0
_mul_64x64_top64 01B478423h, 0A7C5AC47h; /100000
shrd esi,edi,16
mov eax,100000
mov ecx,esi
mov esi,ebx
mul ecx
sub esi,eax
mov eax,68DB9h;2^32/10000
mul ecx
mov ebx,ecx
mov ecx,10
call DOFULL32NLZSTART
shrd esi,edi,16
mov eax,100000
mul esi
sub ebx,eax
mov ecx,10
mov eax,68DB9h;2^32/10000
mul esi
mov esi,ebx
call DOFULL32NLZSTART
mov eax,ebp; lpszBuffer
mov ebp,[esp+0*4]
mov esi,[esp+1*4]
mov edi,[esp+2*4]
mov ebx,[esp+3*4]
add esp,locals
sub eax,[esp+3*4]
ret 3*4
;; esi == value
;; ebp == buffer
DOFULL32NLZ::
mov eax,0A7C5AC47h; magic div /100000
mul esi
add eax,0A7C5AC47h; correction for 0FFFFFFFFh
adc edx,0
mov ecx,10
shr edx,16
imul eax,edx,100000
sub esi,eax;/100000 remainder
mov ebx,edx
mov eax,68DB9h;2^32/10000
mul edx
DOFULL32NLZSTART:
test ebx,ebx
jz NEXT5
cmp ebx,9999
ja DIGIT0
cmp ebx,999
ja DIGIT1
add eax,eax
cmp ebx,99
lea eax,[eax*4+eax]
ja DIGIT2
add eax,eax
cmp ebx,9
lea eax,[eax*4+eax]
ja DIGIT3
add eax,eax
test ebx,ebx
lea eax,[eax*4+eax]
jnz DIGIT4
NEXT5:
mov eax,68DB9h;2^32/10000
mul esi
cmp esi,9999
ja DIGIT5
cmp esi,999
ja DIGIT6
add eax,eax
cmp esi,99
lea eax,[eax*4+eax]
ja DIGIT7
add eax,eax
cmp esi,9
lea eax,[eax*4+eax]
ja DIGIT8
lea edx,[esi+'0']
jmp DIGIT9
DOFULL32LZ:
mov eax,0A7C5AC47h; magic div /100000
mul esi
add eax,0A7C5AC47h; correction for 0FFFFFFFFh
adc edx,0
mov ecx,10
shr edx,16
imul eax,edx,100000
sub esi,eax;/100000 remainder
mov ebx,edx
mov eax,68DB9h;2^32/10000
mul edx
DIGIT0:
add dl,'0'
mov [ebp],dl
inc ebp
i = 1
rept 7
@CatStr(<DIGIT>,%(i)):
if i eq 0; first five
elseif i eq 5; next five
mov eax,68DB9h;2^32/10000
mul esi
elseif i gt 0
mul ecx
endif
add dl,'0'
mov [ebp],dl
inc ebp
i = i + 1
endm
DIGIT8:
mul ecx
add dl,'0'
mov [ebp],dl
inc ebp
mul ecx
add dl,'0'
DIGIT9:
mov [ebp],dx
inc ebp
retn
OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF
New_U64ToStr endp
%echo New_U64ToStr: @CatStr(%($-New_U64ToStr)) bytes
; edi::esi *= x
_mul_64x64_top64 macro __x_lo:req, __x_hi:req
mov eax,dword ptr __x_lo; = b0
mul esi;__y_lo; get a0*b0 = d1:d0
mov ecx,edx; d1
mov eax,dword ptr __x_hi; = b0
mul esi;__y_lo; get a1*b0 = e1:e0
add ecx,eax;e0
mov esi,0
adc esi,edx;e1
mov eax,dword ptr __x_lo; =b1
mul edi;__y_hi; get a0*b1 = f1:f0
add ecx,eax;f0
mov eax,dword ptr __x_hi; =b1
adc esi,edx;f1
mov edx,edi
mov edi,0
adc edi,edi
mul edx;__y_hi; get a1*b1 = g1:g0
add esi,eax
adc edi,edx
endm
New_Int64ToStr proc QwValue:QWORD,pBuffer:DWORD
mov eax,[esp+1*4];U64.Lo
mov edx,[esp+2*4];U64.Hi
mov ecx,[esp+3*4];buff
test edx,edx
jns New_U64ToStr
mov byte ptr [ecx],'-'
xor eax,-1
xor edx,-1
inc ecx
add eax,1
adc edx,0
invoke New_U64ToStr,edx::eax,ecx
inc eax
ret 3*4
New_Int64ToStr endp
New_Int32ToStr proc dwValue,pBuffer
mov eax,[esp+4];dwValue
mov edx,[esp+8];pBuffer
test eax,eax
jns New_U32ToStr
mov byte ptr [edx],'-'
neg eax
push ebp
push ebx
push esi
mov esi,eax
lea ebp,[edx+1]
call DOFULL32NLZ
mov eax,ebp;lpszBuffer
pop esi
pop ebx
pop ebp
sub eax,[esp+2*4]; strlen
ret 2*4
New_Int32ToStr endp
New_U32ToStr proc dwValue,pBuffer
push ebp
push ebx
push esi
mov esi,[esp+1*4][3*4]; a0
mov ebp,[esp+2*4][3*4]; lpszBuffer
call DOFULL32NLZ
mov eax,ebp;lpszBuffer
pop esi
pop ebx
pop ebp
sub eax,[esp+2*4]; strlen
ret 2*4
New_U32ToStr endp
and here is one by Paul Dixon, with some speed-ups by lingo...
supposedly, slightly faster that Drizz's - but i like Drizz's methods a little better
here are some routines that i wrote that will handle integers of variable sizes
http://masm32.com/board/index.php?topic=222.0 (http://masm32.com/board/index.php?topic=222.0)
Quote from: allynm on June 06, 2012, 02:36:32 AM
I have hunted around for a qword integer to ascii converter in vain.
Which kind of qword?
include \masm32\MasmBasic\MasmBasic.inc
.data
MyQw QWORD 1234567890123456789
Init
Print Str$("Plain mem:\t%i\n", MyQw)
mov eax, dword ptr MyQw
mov edx, dword ptr MyQw[4]
Print Str$("Reg pair:\t%i\n", edx::eax)
movlps xmm0, MyQw
Print Str$("Xmm reg:\t%i\n", xmm0)
fild MyQw
Inkey Str$("Fpu reg:\t%i\n", ST(0))
Exit
end start
Hi Dedndave and JJ,
As always, you guys are extraordinarily helpful. Plenty to chew on in what you kindly sent. Not entirely sure what JJ's question about "what kind of qword" I had in mind quite meant. I was thinking specifically of a positive or negative integer or unsigned integer, not a REAL8.
Regards,
Mark
i think he meant signed or unsigned
normally, you can make an unsigned routine do signed integers with a little modification
if the value is negative, you can either:
1) remove the sign bit, evaluate as unsigned, then place the minus sign in front, if negative
or
2) invert the value and add 1 if it is negative - then do the sign thing as in (1)
If you have access to the latest version of the Fpulib, it contains umqtoa (for unsigned qword to ascii) and smqtoa (for signed qword to ascii). Both are described in the latest Fpulib help file. The source code of the modules is also included in the latest library package v2_341 downloadable from:
http://www.ray.masmcode.com/fpu.html#fpulib (http://www.ray.masmcode.com/fpu.html#fpulib)
Before you ask, the "m" stands for "multiply-instead-of-divide".
Hi Dave, Raymond, and indirectly, Drizz.
Thanks. Looking over Drizz's code it is clear that this problem is even tougher to solve than I originally thought. Incidentally, I did not have the latest copy of Fpulib so I was unaware of umqtoa or smqtoa. Thanks, Raymond.
Regards,
Mark
it really isn't as complex as it may seem
the guys that wrote these routines went to great lengths to make them as fast as is practical
that means that they use multiply-to-divide code and/or look-up-tables
here is some simple code that isn't nearly as fast as the others
.DATA
AscBuf DB '01234567890123456789',0 ;20 ASCII digits
.CODE
Asc64 PROC
;Convert 64-bit unsigned integer to ASCII decimal string
;
;Call With: EDX:EAX= QWORD value to convert
;
; Returns: EDI= Offset into AscBuf of first numchar
std
mov edi,offset AscBuf+18
mov ecx,edx
xchg eax,esi
mov ebx,100
Asc64a: xor edx,edx
xchg eax,ecx
div ebx
xchg eax,ecx
xchg eax,esi
div ebx
xchg eax,esi
xchg eax,edx
aam
xchg al,ah
or ax,3030h
stosw
mov eax,ecx
or eax,esi
jnz Asc64a
inc edi
inc edi
cld
cmp byte ptr [edi],30h ;leading 0 ?
jz Asc64b ;yes - supress it
ret ;no - done
Asc64b: inc edi
ret
Asc64 ENDP
what makes the 64-bit case interesting on 32-bit machines is the magnitude of the quotient
that is - if you choose to divide
it is really easy to run into a divide overflow problem :P
notice that, in that last one i posted, i used multiple precision division to avoid this
another thing that makes it fun...
there is no way to perform a 100% functional test
there are just too many values to run through
i think i calculated something like 400 years to run through them all on my machine :biggrin:
you have to use a certain amount of infered logic in verification
make sure all the pieces work - then make sure they are put together correctly
when you're done - test some known problematic values and move on
Quote from: allynm on June 06, 2012, 07:33:35 AMNot entirely sure what JJ's question about "what kind of qword" I had in mind quite meant. I was thinking specifically of a positive or negative integer or unsigned integer, not a REAL8.
Hi Mark,
My snippet is just a demo of the various representations of a QWORD integer: in memory, in a reg32:reg32 pair, in the lower half of an XMM register, in the FPU. If you want the Str$() macro to treat them as unsigned, replace the %i below with %u.
Under the hood is (inter alia, the routine is relatively big) a qword to ascii algo by drizz (old link (http://www.masm32.com/board/index.php?topic=9857.msg72422#msg72422)).
Str$ supports even simple algebra:
MyQw QWORD -1234567890123456789
..
mov eax, 1000000000
Print Str$("MyQw/eax=\t%If\n", MyQw/eax)
MyQw/eax = -1234567890.12345679Speed is rarely a problem. Str$() is an allrounder and as such not the fastest algo around but still much faster than the C equivalents. If you need to output a Million strings in an innermost loop, drizz' original algo is a good candidate :t
include \masm32\MasmBasic\MasmBasic.inc ; http://masmforum.com/~masm32/board/index.php?topic=94
.data
MyQw QWORD -1234567890123456789
Init
Print Str$("Plain mem:\t%i\n", MyQw)
mov eax, dword ptr MyQw
mov edx, dword ptr MyQw[4]
Print Str$("Reg pair:\t%i\n", edx::eax)
movlps xmm0, MyQw
Print Str$("Xmm reg:\t%i\n", xmm0)
fild MyQw
Inkey Str$("Fpu reg:\t%i\n", ST(0))
Exit
end start
Hi Dave, JJ,
I tried Dave's code and it runs very nicely. I'm not a speed person so this dimension doesn't matter as much to me as ease of understanding. Dave's code fits nicely inside a OLLY window and so its relatively easy to follow. This is just a practical concern for a novice like me.
JJ: I have to download MasmBasic--something I should have done some time ago. I will do so and give yours a try.
I also still have to do Raymond's.
Thanks all.
Mark
well - that is not a great routine - lol
i wrote that long ago
since then, i have found that STD and CLD are very slow instructions
you could easily convert that code so that it did not use LODS/STOS and make a much nicer version
also - that routine does not follow the ABI
and - the values are passed in register
it needs to be re-written :P
here - this one is a little better
example of use:
INVOKE Asc64,LoDword,HiDword
;EAX = address of ASCII decimal string
Asc64 PROTO :DWORD,:DWORD
.DATA
AscBuf DB '01234567890123456789',0 ;20 ASCII digits
.CODE
Asc64 PROC USES EBX ESI EDI dwLoDword:DWORD,dwHiDword:DWORD
;Convert 64-bit unsigned integer to ASCII decimal string
;
;Call With: dwLoDword = low DWORD of QWORD value to convert
; dwHiDword = high DWORD of QWORD value to convert
;
; Returns: EAX = Offset into AscBuf of first numchar
mov edi,offset AscBuf+18
mov ecx,dwHiDword
mov esi,dwLoDword
mov ebx,100
Asc64a: xor edx,edx
xchg eax,ecx
div ebx
xchg eax,ecx
xchg eax,esi
div ebx
xchg eax,esi
xchg eax,edx
aam
xchg al,ah
or ax,3030h
mov [edi],ax
mov eax,ecx
sub edi,2
or eax,esi
jnz Asc64a
inc edi
inc edi
cmp byte ptr [edi],30h ;leading 0 ?
jnz Asc64b ;no - done
inc edi ;yes - supress it
Asc64b: xchg eax,edi ;return pointer in EAX
ret
Asc64 ENDP
it could be modified so that you pass a pointer to the QWORD in memory instead of passing it directly
Hello everyo
Just an update. After figuring out how to use the EAX register in Raymonds code, the program ran beautifully....as of course one would expect. Dave mentioned "speed" several times. Seems to me it's worth running MichaelW's timers on these programs. I want to look also at Drizz's program too.
Mark
a lot depends on how it is to be used...
if you are going to convert a few values during the program session - speed isn't much of an issue
if your program uses FPU/MMX or SSE code, you may not want the routine to disturb those registers
if your program uses FPU/MMX or SSE code, and you are converting a lot of values - one like Drizz's may be what you want
if your program doesn't use FPU/MMX or SSE code, and you are converting a lot of values - use one like Ray's, or perhaps, an SSE version
There is an incredible essay titled "TRANSFER OF AN INTEGER BINARY VALUE INTO DECIMAL ASCII STRING" (http://www.andrijar.com/ascii/index.html) by Andrija Radović (http://mailto:andrija_radovic@hotmail.com). Must read :t
Hi Dave and JJ,
When I mentioned MichaelW's timer programs I was thinking of just a single conversion, and as you say, slight differences with just a single conversion might not matter but if the conversions need to be done manifold, could be another story altogether.
JJ-Thanks for the link. I will read it.
I should add that when I mentioned Raymond's code and the EAX register I should have made it clear that Raymond's documentation was very clear on the role EAX plays in his solution, it just took me awhile to recognize exactly what he meant. My bad, as they say...
Regards,
Mark
Quote from: allynm on June 08, 2012, 06:40:36 AM
When I mentioned MichaelW's timer programs...
I was trying to get the two algos by Paul Dixon and Lingo running but no luck. They are fast but don't produce the expected result - is that wrong usage?? ::)
mov esi, offset Src
mov edi, offset Dest
invoke uqword, edi, esi
Attention Lingo's b2a3264 may crash because he doesn't care for register preservation.
Quote from: jj2007 on June 08, 2012, 06:31:40 AM
There is an incredible essay titled "TRANSFER OF AN INTEGER BINARY VALUE INTO DECIMAL ASCII STRING" (http://www.andrijar.com/ascii/index.html) by Andrija Radović (http://mailto:andrija_radovic@hotmail.com). Must read :t
Converting from binary to decimal is simply a matter of base conversion. There are a few methods that can be used, as he pointed out. However, in the Horner's Rule method, he only mentioned dividing by 10. Furthermore, I don't think he noticed it was an application of Horner's Rule, or Ling Long Kai Fang.
A concept that I learned while writing the Ling Long Kai Fang routines was that the selection of bases is somewhat arbitrary. For example, many 32-bit routines might divide by 10,000, and convert 4 decimal digits to ASCII at once. Strictly speaking, this is not really conversion from binary (base 2) to decimal (base 10). Rather, it may be viewed as conversion from base 4,294,967,296 to base 10,000. While the values stored in a dword may indeed be binary, we can think of that dword as a single digit. If we were to apply the same line of thought to bytes, we might call it base 256, and words, base 65536.
I found it was important to realize the difference as I wrote the Ling Long Kai Fang routines. In the first version of the routines, I converted from base 4,294,967,296 to base 100,000,000. Each intermediate dword then held 8 decimal digits.
In the later version, I convert from base 4,294,967,296 to base 1,000,000,000. Now, each intermediate dword holds 9 decimal digits. It made the loop for conversion to ASCII considerably more tedious, but gave a signifigant performance improvement for very large integers.
Allynm,
Here is my version:
I have included a zip my QWORD conversion code (signed and unsigned) and the
test code and the timing code (included this text also). The test code is for
reference only, you need the output routines which I have not included (no, I
don't use the masm32.lib, I roll my own). The timing and conversion PROCs should
be free standing.
The following timing is extracted from a console output for one pass of 8
conversions with output, followed by 1,000,000 (a million) passes of the 8
conversions only - all in just over 2 seconds. That is fast enough for any
output I need to display.
The current time is: 8:59:14.68
18,446,744,073,709,551,615
0
4,294,967,295
4,294,967,296
9,223,372,036,854,775,807
-9,223,372,036,854,775,808
0
-1
The current time is: 8:59:16.81
The current time is: 8:59:14.68
----------
0:00:02.13
Dave.
Hi Dave,
What is the correct usage? For...
.data
Src QWORD 1234567890123456
Dest db 100 dup(?)
.code
start:
mov edx, offset Src
mov edi, offset Dest
call UBTD
... I get 15,001,234,558,140,725,952 ::)
Jochen,
after the routine has been called, EDI points to the first char
.data
Src QWORD 1234567890123456
Dest db 100 dup(?)
.code
start:
mov edx, offset Src
mov edi, offset Dest
call UBTD
print edi
i get...
0
:biggrin:
full program...
include \masm32\include\masm32rt.inc
.data
Src QWORD 1234567890123456
Dest db 100 dup(?)
ALIGN QWORD
; qHugeWork QWORD 0
q16BillionBillion QWORD 16*1000000000*1000000000
; q8BillionBillion QWORD 8*1000000000*1000000000
; q4BillionBillion QWORD 4*1000000000*1000000000
; q2BillionBillion QWORD 2*1000000000*1000000000
; q1BillionBillion QWORD 1*1000000000*1000000000
ALIGN DWORD
dHugeBillion DWORD 1000*1000*1000
dHugeBillions DWORD 0
dTop DWORD 0
ALIGN WORD
cbFirstTwo BYTE "00","01","02","03","04","05","06","07","08","09"
BYTE "10","11","12","13","14","15","16","17","18"
;
.code
start:
mov edx, offset Src
mov edi, offset Dest
call UBTD
print edi
push 0A0Dh
print esp
pop eax
exit
;-------------------------------------------------------------------------------
; UBTD - Convert unsigned 64 bit binary number to 26 digits with separators,
; (edx = QWORD pointer, edi = message buffer, returns edi pointing to first
; non-zero character or to the single zero character for a zero value).
;-------------------------------------------------------------------------------
ALIGN OWORD
UBTD PROC USES eax ebx ecx edx esi
mov eax,[edx] ; Get Low.
mov edx,[edx+4] ; Get High.
mov dTop,0 ; Clear high digits value.
mov ebx,16 ; Get the value to increment dTop.
mov ecx,5 ; Get the count of test values.
mov esi,OFFSET q16BillionBillion ; Point to the test values.
;
; Correct the input value to be below 1 billion billion
;
CorrectTop:
cmp edx,[esi+4] ; Is high too big?
jb Skip ; No.
sub eax,[esi] ; Correct the value.
sbb edx,[esi+4]
add dTop,ebx ; Increment the high digit value.
Skip:
shr ebx,1 ; Correct the increment value.
lea esi,[esi+8] ; Point to the next correction value.
dec ecx ; Decrement the test value count.
jnz CorrectTop ; Not all tested.
;
; Split the remaining 60 bit number to 2 30 bit numbers.
;
mov ebx,dHugeBillion ; Get 1 billion.
div ebx ; Convert to billions (eax) and fraction (edx).
mov dHugeBillions,eax ; Save the billions.
mov eax,edx ; Convert the Low value.
;
; Sample number result (maximum possible 64 bit number value):
;
; 18,446,744,073,709,551,615 BYTES
;
; Convert the fraction then the billions.
;
mov esi,eax ; Convert the Low value.
mov ecx,25 ; Set the character position for the last digit.
mov ebx,3 ; Set the character count for 3 digits.
;
; Convert by multiplying.
;
Cvt:
mov edx,3435973837 ; Get magic number for divisor of 10.
mul edx ; edx = (quotient * 8) + garbage.
shr edx,3 ; edx = quotient.
lea eax,[edx+edx*4] ; eax = quotient * 5.
shl eax,1 ; eax = quotient * 10.
neg eax ; eax = - quotient * 10.
lea eax,[esi+eax+"0"] ; eax = LSD.
mov [edi+ecx],al ; Save digit.
mov eax,edx ; eax = quotient.
mov esi,eax ; Save quotient.
dec ecx ; Point to prior digit space.
dec ebx ; Decrement 1000's
jnz Cnt ; Not there.
dec ecx ; Skip comma.
mov ebx,3 ; Set for the next 3 digits.
;
; Check the end of the low 9 digits.
;
Cnt:
cmp ecx,13 ; Offset for the billions?
jg Cvt ; No, not done with Low conversion.
je GetHigh ; Yes, get the high value.
cmp ecx,1 ; Total conversion complete?
jns Cvt ; No, keep converting High.
CvtTop:
mov eax,dTop ; Get the value for the top 2 digits (0 to 18).
mov ebx,OFFSET cbFirstTwo ; Point to conversion characters.
mov ax,[ebx+eax*2] ; Get the first two characters in ax (little endian will reverse them).
mov cl,',' ; Get a separator.
mov [edi],ax ; Save them (little endian will reverse them back to the correct order).
jmp Separate ; Go to add separators.
;
; Get the high value (billions) to convert.
;
GetHigh:
mov eax,dHugeBillions ; Get billions value.
mov esi,eax ; Convert the High value 4 digits.
jmp Cvt
;
; Separate with commas.
;
Separate:
mov [edi+2],cl
mov [edi+6],cl
mov [edi+10],cl
mov [edi+14],cl
mov [edi+18],cl
mov [edi+22],cl
mov cl,' ' ; Get leading blank pad.
jmp ScanNonZero ; Scan for decimal digit > 0.
;
; Blank leading commas and leading zeros, not last zero.
;
BlankFill:
mov [edi],cl ; Blank the character.
inc edi ; Point to the next character.
;
; Scan for additional digits.
;
ScanNonZero:
mov al,[edi] ; Get the character.
cmp al,"," ; Is it a leading comma?
jz BlankFill ; Yes, blank it.
cmp al,"0" ; Is it a leading '0'?
ja Exit ; No, a digit > '0'. done with blanking.
or al,al ; Is it the trailing null at the end of the string?
jnz BlankFill ; No, blank it.
dec edi ; Point to the last character.
mov BYTE PTR [edi],"0" ; Force the single '0' back.
;
; Exit UBTD.
;
Exit:
ret ; Exit PROC UBTD.
UBTD ENDP
;-------------------------------------------------------------------------------
; End of PROC UBTD.
;-------------------------------------------------------------------------------
end start
Daves,
Masm 6.14 and 6.15 don't like QWORD 1*1000000000*1000000000. It works fine with 8.0, 9.0 and JWasm. Workaround:
qHugeWork QWORD 0
if 1
q16BillionBillion QWORD 16000000000000000000
q8BillionBillion QWORD 8000000000000000000
q4BillionBillion QWORD 4000000000000000000
q2BillionBillion QWORD 2000000000000000000
q1BillionBillion QWORD 1000000000000000000
else
q16BillionBillion QWORD 16*1000000000*1000000000
q8BillionBillion QWORD 8*1000000000*1000000000
q4BillionBillion QWORD 4*1000000000*1000000000
q2BillionBillion QWORD 2*1000000000*1000000000
q1BillionBillion QWORD 1*1000000000*1000000000
endif
8)
just ling long kai fang it...
1234567890123456
15001234558140725952
20282409603651670423947251286015
Quote from: jj2007 on June 10, 2012, 06:59:53 AM
Daves,
Masm 6.14 and 6.15 don't like QWORD 1*1000000000*1000000000. It works fine with 8.0, 9.0 and JWasm. Workaround:
qHugeWork QWORD 0
if 1
q16BillionBillion QWORD 16000000000000000000
q8BillionBillion QWORD 8000000000000000000
q4BillionBillion QWORD 4000000000000000000
q2BillionBillion QWORD 2000000000000000000
q1BillionBillion QWORD 1000000000000000000
else
q16BillionBillion QWORD 16*1000000000*1000000000
q8BillionBillion QWORD 8*1000000000*1000000000
q4BillionBillion QWORD 4*1000000000*1000000000
q2BillionBillion QWORD 2*1000000000*1000000000
q1BillionBillion QWORD 1*1000000000*1000000000
endif
JJ,
Thank you for finding this. Another potential problem is the buffer. As I coded it, it expects at least a 26 character buffer with a zero terminator (and i never checked the terminator, just expected it to be a null terminator), could pose a problem when I go to print it expecting strlen to correctly size it. At a minimum I should zero the terminator, but this leaves me with the potential of a GPF for a memory access error. Oh, well! At least the code describes the buffer as 26 characters and a NULL.
"Other than that little problem, Mrs Lincoln, how did you enjoy the play?" Is the code adequately described? What about speed (not that it really matters)?
Dave.
Dave,
if you really want some constructive critisism...
passing arguments to a function in registers is old school DOS stuff (not the same as CMPSB - lol)
i do this myself - but not for "reusable" functions - only for internal functions
the routine does not preserve EDI, and thus, does not follow the ABI
also - ABI-compliant routines would return a result in EAX
although - for assembly language routines, ECX and EDX may also be used (not much good in C-callable)
at any rate - no need to preserve EAX, ECX, EDX
it does not hurt to preserve ECX and EDX if speed is not an issue
in my ling long kai fang routine, i return
EAX = status
ECX = decimal string length
EDX = buffer address
the ECX and EDX values are intended to be convenient for the (assembly language) caller
however, they are not required to use the function
so, a C-callable OBJ could be made and linked with a C program
i fixed the issue that Jochen mentioned, and i get this
8,001,234,566,890,123,456
assuming that the first few digits are extraneous, it would be this
1,234,566,890,123,456
it should be this
1,234,567,890,123,456
Quote from: dedndave on June 10, 2012, 09:48:46 AM
i do this myself - but not for "reusable" functions - only for internal functions
I suppose this is an internal function as well and thus does not follow "normal" ABI (parameters passing, regs preservation, the way of returning value).
Quote from: dedndave on June 10, 2012, 09:48:46 AM
in my ling long kai fang routine, i return
EAX = status
ECX = decimal string length
EDX = buffer address
the ECX and EDX values are intended to be convenient for the (assembly language) caller
however, they are not required to use the function
so, a C-callable OBJ could be made and linked with a C program
Hm... if you will return status and string length in one DWORD (to say
status shl 8 or length), and address in another, it is possible to grab both returned values in C(++) code - just declare the function as returning LARGE_INTEGER or ULONGLONG (__int64) - this type of return assumes value to be returned in EDX:EAX and thus could be normally accessed in HLL code :biggrin:
Dave,
I put JJ's value in my code (just replaced my max unsigned value) and it correctly converted it. I did not use JJ's fix yet. Maybe his fix does not actually create the correct constant?
I am currently using MASM 8.0 since I want to see ALL generated code and MASM 9.0 does not support the -Sg option.
Still looking.
Dave.
the listing shows the right stuff
that doesn't mean that's what's in the EXE, though :P
00000000 .data
00000000 Src QWORD 1234567890123456
000462D53C8ABAC0
00000008 00000064 [ Dest db 100 dup(?)
00
]
ALIGN QWORD
; qHugeWork QWORD 0
00000070 q16BillionBillion QWORD 16000000000000000000
DE0B6B3A76400000
; q8BillionBillion QWORD 8*1000000000*1000000000
; q4BillionBillion QWORD 4*1000000000*1000000000
; q2BillionBillion QWORD 2*1000000000*1000000000
; q1BillionBillion QWORD 1*1000000000*1000000000
ALIGN DWORD
00000078 3B9ACA00 dHugeBillion DWORD 1000000000
0000007C 00000000 dHugeBillions DWORD 0
00000080 00000000 dTop DWORD 0
ALIGN WORD
00000084 30 30 30 31 30 cbFirstTwo BYTE "00","01","02","03","04","05","06","07","08","09"
32 30 33 30 34
30 35 30 36 30
37 30 38 30 39
00000098 31 30 31 31 31 BYTE "10","11","12","13","14","15","16","17","18"
32 31 33 31 34
31 35 31 36 31
37 31 38
Quote from: Antariy on June 10, 2012, 10:03:50 AMHm... if you will return status and string length in one DWORD (to say status shl 8 or length), and address in another, it is possible to grab both returned values in C(++) code - just declare the function as returning LARGE_INTEGER or ULONGLONG (__int64) - this type of return assumes value to be returned in EDX:EAX and thus could be normally accessed in HLL code :biggrin:
my LLKF routines handle bignums
the string length may be larger than 1 Mb :biggrin:
and - the string length isn't needed to use the function
for C - all you need to know is success/fail, really
the output string is left-justified in the buffer, so they know the address
besides, who cares about those C guys, anyways - lol
if they want the string length, they can suck an egg and measure it
Dave,
My code seems to handle the conversion correctly (using MASM 8.0). I will use JJ's fix and see what I get. Very strange!
Dave.
Dave,
I tried with JJ's fix, worked fine. At least the second time. The first time I found that I had the incorrect number of zeros in dHugeBillion (too few). Fixed that and all is well?????
Dave.
Alex,
if i wanted to make a C-callable version, i could return the string length in EAX
then i could use SetLastError to reflect any error code if the string length is 0
for assembler, this would slow us down a little
i prefered to return the error code in EAX
Dave,
i have attached the program i put together
maybe you can spot where i went wrong
it only has the unsigned routine - i didn't get as far as testing the signed version
EDIT: attachment removed
Dave,
You have to fix ALL of the qxxBillionBillion constants, you only fixed the q16BillionBillion.
Dave.
:P
ok - they don't appear to be referenced in the unsigned routine
ahhh - that seems to fix it - my fault, Dave :redface:
Quote from: dedndave on June 10, 2012, 10:33:11 AM
the string length may be larger than 1 Mb :biggrin:
Then you obviously should return the string length in EDX to avoid huge performance losses on (l)strlen algo :biggrin:
It is Windows C standard: LARGE_INTEGER/LONGLONG returned value is in EDX:EAX and you can easily access it in HLL as 2 DWORDs with no performance loss.
lol
here is a little demo
this routine can handle some big numbers
signed or unsigned - mode selectable
1234567890123456
15001234558140725952
57896044618658097711785492504343953926634992332820282019728792003956564819968
-57896044618658097711785492504343953926634992332820282019728792003956564819968
let's see you return that in EDX:EAX :P
Quote from: dedndave on June 10, 2012, 11:01:52 AM
:P
ok - they don't appear to be referenced in the unsigned routine
ahhh - that seems to fix it - my fault, Dave :redface:
Dave,
They aren't, directly. The correction loop in UBTD walks through 6 QWORDS and corrects the input value to some value less than or equal to 1BillionBillion so I can do a divide. If I divide a max QWORD by a billion, I get an overflow with the remainder.
Dave.
Quote from: dedndave on June 10, 2012, 11:14:38 AM
let's see you return that in EDX:EAX :P
Read my post carefully.
EDX - is the output string length.
EAX - is the status.
No number itself is returned in regs :P :biggrin:
JJ,
I have updated the .zip with corrections found by you (assembly errors with MASM 6.15), and by me (not forcing a null terminator in the output string). I replaced the original .zip file.
Dave
Quote from: KeepingRealBusy on June 10, 2012, 09:12:49 AM
What about speed (not that it really matters)?
Dave.
Speed is fine, you are on par with TheSvin's code.
uqword is faster but I can't convince it to give a correct result
1) ::)
uqword=4362504522858521301
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
1234567890123456
4444444444444444444
1,234,567,890,123,456
1234567890123456
4444444444444444444
652 cycles for Str$
92 cycles for uqword
295 cycles for uqw2a (The Svin)
322 cycles for uqw2a (mCoder)
1013 cycles for i64toa (Towers)
341 cycles for UBTD (Dave)
633 cycles for Str$
92 cycles for uqword
297 cycles for uqw2a (The Svin)
303 cycles for uqw2a (mCoder)
1008 cycles for i64toa (Towers)
346 cycles for UBTD (Dave)
1) same for Lingo's code which yields the same wrong results and crashes in addition
uqword=4362504522858521301
Intel(R) Core(TM)2 Quad CPU Q9650 @ 3.00GHz (SSE4)
1234567890123456
4444444444444444444
1,234,567,890,123,456
1234567890123456
4444444444444444444
568 cycles for Str$
69 cycles for uqword
277 cycles for uqw2a (The Svin)
315 cycles for uqw2a (mCoder)
665 cycles for i64toa (Towers)
290 cycles for UBTD (Dave)
554 cycles for Str$
69 cycles for uqword
267 cycles for uqw2a (The Svin)
291 cycles for uqw2a (mCoder)
664 cycles for i64toa (Towers)
269 cycles for UBTD (Dave)
--- ok ---
prescott w/htt
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
1527 cycles for Str$
182 cycles for uqword
726 cycles for uqw2a (The Svin)
911 cycles for uqw2a (mCoder)
1973 cycles for i64toa (Towers)
611 cycles for UBTD (Dave)
523 cycles for Ling Long Kai Fang (DednDave)
1548 cycles for Str$
179 cycles for uqword
675 cycles for uqw2a (The Svin)
844 cycles for uqw2a (mCoder)
2007 cycles for i64toa (Towers)
604 cycles for UBTD (Dave)
522 cycles for Ling Long Kai Fang (DednDave)
i wanted to see how Ling Long Kai Fang compared :biggrin:
not as well as i had hoped
but then, it will handle all practical sizes of integers
so, kind of comparing apples with oranges
Dave, no attachment??
JJ,
I replaced the original .zip, t least I thought I did, let me check.
Dave.
JJ,
I downloaded the .zip from the original post and checked its contents - the posted .zip is correct.
Dave.
too many Dave's :biggrin:
i think he was talking to me
actually, Jochen, i did not attach specifically because the LLKF routine falls in a catagory outside those being tested
if we were testing bignum routines, it would be different
i just wanted to see how it faired along side the others
i already knew it wasn't super-fast for smaller integers
in fact, i don't think the Ling Long Kai Fang method offers an advantage unless you are doing integers larger than, say, 96 bits or so
Dave,
I will work on a fix to make my posted version ABI compliant.
Dave.
Allynm,
I have included a zip my QWORD conversion code (signed and unsigned) and the
test code and the timing code (included this text also). The test code is for
reference only, you need the output routines which I have not included (no, I
don't use the masm32.lib, I roll my own). The timing and conversion PROCs should
be free standing.
The following timing is extracted from a console output for one pass of 8
conversions with output, followed by 1,000,000 (a million) passes of the 8
conversions only - all in just over 2 seconds. That is fast enough for any
output I need to display.
18,446,744,073,709,551,615
0
4,294,967,295
4,294,967,296
9,223,372,036,854,775,807
-9,223,372,036,854,775,808
0
-1
The current time is: 8:59:16.81
The current time is: 8:59:14.68
----------
0:00:02.13
This version of my QWORD conversion code is INTEL ABI compliant with the
following timing:
The current time is: 15:58:58.73
18,446,744,073,709,551,615
0
4,294,967,295
4,294,967,296
9,223,372,036,854,775,807
-9,223,372,036,854,775,808
0
-1
The current time is: 15:59:00.75
The current time is: 15:58:58.73
-----------
00:00:02.02
Dave.
:t
Quote from: jj2007 on June 08, 2012, 06:31:40 AM
There is an incredible essay titled "TRANSFER OF AN INTEGER BINARY VALUE INTO DECIMAL ASCII STRING" (http://www.andrijar.com/ascii/index.html) by Andrija Radović (http://mailto:andrija_radovic@hotmail.com). Must read :t
JJ,
Thank you for the link, good read.
Dave.
I finally succeeded in making Lingo's b2a3264 code produce output. It needs a special kind of QWORD, and some extra pushes and pops for esi & edi to avoid GPFs, but on the bright side it is pretty fast, hehe :biggrin:
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
687 cycles for Str$
101 cycles for uqword
298 cycles for uqw2a (The Svin)
316 cycles for uqw2a (mCoder)
1056 cycles for i64toa (Towers)
412 cycles for JJ
349 cycles for UBTD (Dave)
87 cycles for b2a3264
654 cycles for Str$
97 cycles for uqword
307 cycles for uqw2a (The Svin)
313 cycles for uqw2a (mCoder)
1055 cycles for i64toa (Towers)
408 cycles for JJ
347 cycles for UBTD (Dave)
87 cycles for b2a3264
Quote from: Antariy on June 10, 2012, 11:07:35 AM
It is Windows C standard: LARGE_INTEGER/LONGLONG
There are some pitfalls in C with this "standard":
// works:
long long q=1234567890123456789;
printf("long long = %lld\n", q);
// chokes:
// LARGE_INTEGER LI=1234567890123456789;
// works:
LARGE_INTEGER LI;
LI.QuadPart=1234567890123456789;
// works:
__asm fild q;
__asm fistp qword ptr LI;
printf("large_int = %lld\n", LI); // warning #2234; output large_int = 1234567890123456789
printf("large_int = %lld\n", LI.QuadPart); // same output, no warningThat was Pelles C, but I have actually an assembly question: How can you initialise a LARGE_INTEGER?
.data
MyLI0 LARGE_INTEGER <123, 456> ; error A2179
MyLI1 LARGE_INTEGER <123456> ; error A2179
MyLI2 LARGE_INTEGER.QuadPart <123456> ; error A2008
MyLI3 LARGE_INTEGER.QuadPart 123456 ; error A2179
Quote from: jj2007 on October 18, 2013, 06:40:20 PM
... How can you initialise a LARGE_INTEGER?
.data
MyLI0 LARGE_INTEGER <123, 456> ; error A2179
MyLI1 LARGE_INTEGER <123456> ; error A2179
MyLI2 LARGE_INTEGER.QuadPart <123456> ; error A2008
MyLI3 LARGE_INTEGER.QuadPart 123456 ; error A2179
Since it is an union, you can only initialize the first member. And since the first member is a struct, you'll need 2 levels of angle brackets:
MyLI0 LARGE_INTEGER <<123, 456>>
That's not very "user-friendly", of course. Even more, there's a bug in Masm's v8+ LOW32 or HIGH32 operator, because the following work-around is rejected:
LI1 equ 123456789abcdeh
MyLI0 LARGE_INTEGER << LOW32 LI1, HIGH32 LI1 >>
And finally, this "fix" also won't work
LI1 equ 123456789abcdeh
X1 LARGE_INTEGER << LI1 and 0ffffffffh, LI1 shr 32 >>
Quote from: japheth on October 18, 2013, 07:46:42 PMSince it is an union, you can only initialize the first member. And since the first member is a struct, you'll need 2 levels of angle brackets
Thanks, that works; so the workaround could be another structure. However, even if the first member is just a QWORD, Masm (6.15, 9, 10) and JWasm want the 2 levels of brackets.
include \masm32\MasmBasic\MasmBasic.inc ; at least Version 18.10.2013 (http://masm32.com/board/index.php?topic=94.0)
LARGE_INT STRUCT
UNION
liQw QWORD ?
liLi LARGE_INTEGER <>
ENDS
LARGE_INT ENDS
.data
MyLI1 LARGE_INTEGER <<12345, 1000000000>> ; low32, hi32
MyLI2 LARGE_INT <<1234567890123456789>> ; one fat QWORD
MyLI3 LONGLONG 1234567890123456789 ; one fat QWORD
Init
Print Str$("LI1=%i\n", MyLI1) ; LI1=4294967296000012345, OK
fild MyLI2
fistp MyLI1
Print Str$("LI1=%i\n", MyLI1) ; LI1=1234567890123456789, OK
Print Str$("LI2=%i\n", MyLI2) ; LI2=1234567890123456789, OK
Inkey Str$("LI3=%i\n", MyLI3) ; LI3=1234567890123456789, OK
Exit
end start
So, I ended up here while looking for a fast qw2asc routine. Is b2a3264 still the fastest? Has there been any usability updates?
Scratch that. I couldn't get b2a3264 to work. It's not working in the last posted test program. Looks like UBTD is the champ.
What is the problem with b2a3264? It works fine for me, but the 2nd algo, uqword, is a tick faster.
Btw we did really exotic things at the time: mov esi, pQword
fld FP10(0.00000000000000000009999999999999999972)
fld FP10(10.000000000000000028)
fild qword ptr [esi]
mov ecx, 19
fmul st, st(2)
mov edi, pBuffer
push 0
.Repeat
fisub dword ptr [esp]
fmul st, st(1)
fist dword ptr [esp]
mov eax, [esp]
add eax, "0"
stosb
dec ecx
.Until Zero?
:biggrin:
Perhaps I'm just misunderstanding how it works.
I took your last post and added the following lines after each counter_end to see the results of the last run-
pusha
print offset Dest," - "
invoke RtlZeroMemory,addr Dest,100
popa
And here is my results-
Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz (MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX)
4343434343434343434
4343434343434343434 - 862 cycles for Str$
- 211 cycles for uqword
4444444444444444444 - 773 cycles for uqw2a (The Svin)
4444444444444444444 - 880 cycles for uqw2a (mCoder)
- 92 cycles for i64toa (Towers)
44444444444444/4+21 - 674 cycles for JJ
4,444,444,444,444,393,520 - 537 cycles for UBTD (Dave)
6 - 214 cycles for b2a3264
- 664 cycles for Str$
- 211 cycles for uqword
4444444444444393520 - 754 cycles for uqw2a (The Svin)
4444444444444393520 - 770 cycles for uqw2a (mCoder)
- 90 cycles for i64toa (Towers)
44444444444444/4+21 - 647 cycles for JJ
4,444,444,444,444,393,520 - 451 cycles for UBTD (Dave)
6 - 214 cycles for b2a3264
If the answer isn't in Dest, where is it?
If it is, then most of them aren't working.
edit:
Just to see if the answer was somewhere, I tried printing every printable character in Dest.
I replace my previous insertions with a macro called checkit
checkit macro
pusha
mov ebx,99
lea esi,Dest
mov edi,esi
.repeat
lodsb
.if al>31
stosb
.endif
dec ebx
.until ebx==0
mov al,0
stosb
print offset Dest," - "
invoke RtlZeroMemory,addr Dest,100
popa
endm
and for results I got:
Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz (MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX)
4343434343434343434
4343434343434343434 - 871 cycles for Str$
343434343434343434 - 212 cycles for uqword
4444444444444444444 - 735 cycles for uqw2a (The Svin)
4444444444444444444 - 882 cycles for uqw2a (mCoder)
- 92 cycles for i64toa (Towers)
44444444444444/4+21 - 670 cycles for JJ
4,444,444,444,444,393,520 - 313 cycles for UBTD (Dave)
6513854137424602010 - 214 cycles for b2a3264
- 445 cycles for Str$
343434343434343434 - 151 cycles for uqword
4444444444444393520 - 655 cycles for uqw2a (The Svin)
4444444444444393520 - 882 cycles for uqw2a (mCoder)
- 92 cycles for i64toa (Towers)
44444444444444/4+21 - 663 cycles for JJ
4,444,444,444,444,393,520 - 382 cycles for UBTD (Dave)
6513854137424602010 - 184 cycles for b2a3264
so if the answer is there somewhere, I don't see it.
I got a little better results with an earlier test program (attached as tst4)
Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz (SSE4)
Qword to Ascii algos:
1216 cycles for sprintf, result: 12345678901234567890
702 cycles for Asc64, result: 12345678901234567890
214 cycles for U64ToStr, result: 12345678901234567890
132 cycles for UBTD, result: 12,345,678,901,234,567,890
131 cycles for UBTDx, result: 12345678901234567890
36 cycles for b2a3264, result:
1172 cycles for sprintf, result: 12345678901234567890
701 cycles for Asc64, result: 12345678901234567890
207 cycles for U64ToStr, result: 12345678901234567890
138 cycles for UBTD, result: 12,345,678,901,234,567,890
125 cycles for UBTDx, result: 12345678901234567890
34 cycles for b2a3264, result:
1165 cycles for sprintf, result: 12345678901234567890
703 cycles for Asc64, result: 12345678901234567890
207 cycles for U64ToStr, result: 12345678901234567890
135 cycles for UBTD, result: 12,345,678,901,234,567,890
129 cycles for UBTDx, result: 12345678901234567890
34 cycles for b2a3264, result:
Code sizes:
Asc64 = 52
U64ToStr = 178
b2a3264 = 834 + 200 for chartable
Added umqtoa to the mix. Very respectable. Still can't get b2a3264 to work.
Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz (SSE4)
Qword to Ascii algos:
1211 cycles for sprintf, result: 12345678901234567890
306 cycles for umqtoa, result: 12345678901234567890
712 cycles for Asc64, result: 12345678901234567890
216 cycles for U64ToStr, result: 12345678901234567890
137 cycles for UBTD, result: 12,345,678,901,234,567,890
125 cycles for UBTDx, result: 12345678901234567890
34 cycles for b2a3264, result:
1200 cycles for sprintf, result: 12345678901234567890
302 cycles for umqtoa, result: 12345678901234567890
727 cycles for Asc64, result: 12345678901234567890
209 cycles for U64ToStr, result: 12345678901234567890
125 cycles for UBTD, result: 12,345,678,901,234,567,890
137 cycles for UBTDx, result: 12345678901234567890
36 cycles for b2a3264, result:
1177 cycles for sprintf, result: 12345678901234567890
298 cycles for umqtoa, result: 12345678901234567890
711 cycles for Asc64, result: 12345678901234567890
213 cycles for U64ToStr, result: 12345678901234567890
130 cycles for UBTD, result: 12,345,678,901,234,567,890
125 cycles for UBTDx, result: 12345678901234567890
33 cycles for b2a3264, result:
Code sizes:
Asc64 = 52
U64ToStr = 178
b2a3264 = 834 + 200 for chartable
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (SSE4)
Qword to Ascii algos:
1114 cycles for sprintf, result: 12345678901234567890
340 cycles for umqtoa, result: 12345678901234567890
619 cycles for Asc64, result: 12345678901234567890
259 cycles for U64ToStr, result: 12345678901234567890
154 cycles for UBTD, result: 12,345,678,901,234,567,890
168 cycles for UBTDx, result: 12345678901234567890
44 cycles for b2a3264, result:
1175 cycles for sprintf, result: 12345678901234567890
346 cycles for umqtoa, result: 12345678901234567890
638 cycles for Asc64, result: 12345678901234567890
254 cycles for U64ToStr, result: 12345678901234567890
152 cycles for UBTD, result: 12,345,678,901,234,567,890
150 cycles for UBTDx, result: 12345678901234567890
45 cycles for b2a3264, result:
1134 cycles for sprintf, result: 12345678901234567890
330 cycles for umqtoa, result: 12345678901234567890
643 cycles for Asc64, result: 12345678901234567890
261 cycles for U64ToStr, result: 12345678901234567890
150 cycles for UBTD, result: 12,345,678,901,234,567,890
154 cycles for UBTDx, result: 12345678901234567890
45 cycles for b2a3264, result:
Code sizes:
Asc64 = 52
U64ToStr = 178
b2a3264 = 834 + 200 for chartable
Quote from: jimg on July 07, 2017, 03:28:38 AM
Still can't get b2a3264 to work.
Two lines are missing:
option prologue:none ; turn it off
option epilogue:none ;
Here are few exotic ones - source attached:
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (SSE4)
39062 cycles for 100 * MB Str$()
27288 cycles for 100 * q2a fpu 1
27213 cycles for 100 * q2a fpu 2
28920 cycles for 100 * q2a fpu 3
15460 cycles for 100 * UBTDx
37583 cycles for 100 * MB Str$()
26653 cycles for 100 * q2a fpu 1
27548 cycles for 100 * q2a fpu 2
28913 cycles for 100 * q2a fpu 3
15407 cycles for 100 * UBTDx
Exotic because they use the FPU. That old thread had some problems - some algos crashed, others didn't deliver the right results. More experimental than serious.
Btw where is umqtoa in your version?
It's part of fpulib. I haven't looked at the actual code yet.
And the earlier one has option prologue none, with the same results. I'll look more closely, but I really don't understand how it's supposed to work.
I just took the code right out of you latest post of QW2Ascii, prologue stuff and all and ran it without changes, exactly as you ran it, and I can get correct answers out of it.
I can't believe I'm getting this bad. If you get the chance, and only if you feel like it, would you put together a quickie with nothing but b2a3264 doing one number and printing the results?
Jim,
The problem sits here apparently:
mov edi,[esp+2*4]
mov [ecx+18],edx
mov eax, [esp+1*4] ; added by me
retn 2*4 ; changed by me
Mine runs with this, but the result looks crappy.
Note the order, different from the others:
invoke b2a3264, addr Dest, addr Src
P.S.: Lingo's code is fast, bloated and crappy... as usual.
35821 cycles for 100 * MB Str$()
27236 cycles for 100 * q2a fpu 1
26186 cycles for 100 * q2a fpu 2
25412 cycles for 100 * q2a fpu 3
14913 cycles for 100 * UBTDx
30051 cycles for 100 * umqtoa
4865 cycles for 100 * Lingo
18 bytes for MB Str$()
99 bytes for q2a fpu 1
87 bytes for q2a fpu 2
119 bytes for q2a fpu 3
215 bytes for UBTDx
187 bytes for umqtoa
883 bytes for Lingo
Search the source (*.asc in Richmasm/Wordpad/Ms Word, or *.asm in whatever) for the string ForLingo. For unknown reasons, it required that some extra qwords needed to be filled with the source values in order to produce results. An incredible mess 8)
Thank you.
I thought I could pull out just the parts needed to run the proc once, but somewhere, I screwed it up and I can't see where. I did my best to pull out your exact code. If you get a chance, please take a look and let me know what I fouled up.
Hi Jim,
Here is one that should work. Minor modifications, and it was definitely not your fault.
Much thanks. Now the fun begins :)
Take a look at the last line in the proc.
mov [ecx+8], edx ;
add ecx, -1 ;
jne Jo5 ;
I've given up a long time ago the wish to understand Lingo's code. But it is pretty clear that ecx will never reach the value 1 8)
I've modded the proc to more normal specifications (b2a3264x5). No use of esp. Normal prologue. Normal qword input.
I'm pretty happy with the results, and it's certainly much faster than any of the other algos.
I still don't understand what it's doing, I can't even figure out what the magic numbers he's using are.
Here's my results :)
Qword to Ascii algos:
1228 cycles for sprintf, result: 18446744073709551615
305 cycles for umqtoa, result: 18446744073709551615
699 cycles for Asc64, result: 18446744073709551615
213 cycles for U64ToStr, result: 18446744073709551615
124 cycles for UBTD, result: 18,446,744,073,709,551,615
163 cycles for UBTDx, result: 18446744073709551615
42 cycles for b2a3264, result: 18446744073709551615
41 cycles for b2a3264x5, result: 18446744073709551615
1173 cycles for sprintf, result: 12345678901234567890
299 cycles for umqtoa, result: 12345678901234567890
699 cycles for Asc64, result: 12345678901234567890
207 cycles for U64ToStr, result: 12345678901234567890
137 cycles for UBTD, result: 12,345,678,901,234,567,890
175 cycles for UBTDx, result: 12345678901234567890
41 cycles for b2a3264, result: 12345678901234567890
35 cycles for b2a3264x5, result: 12345678901234567890
615 cycles for sprintf, result: 4294967296
85 cycles for umqtoa, result:
425 cycles for Asc64, result:
74 cycles for U64ToStr, result: 4294967296
160 cycles for UBTD, result: 4,294,967,296
188 cycles for UBTDx, result: 4294967296
42 cycles for b2a3264, result: 4294967296
40 cycles for b2a3264x5, result: 4294967296
601 cycles for sprintf, result: 3012345678
64 cycles for umqtoa, result:
418 cycles for Asc64, result:
61 cycles for U64ToStr, result: 3012345678
158 cycles for UBTD, result: 3,012,345,678
188 cycles for UBTDx, result: 3012345678
19 cycles for b2a3264, result: 3012345678
14 cycles for b2a3264x5, result: 3012345678
429 cycles for sprintf, result: 123456
41 cycles for umqtoa, result:
295 cycles for Asc64, result:
31 cycles for U64ToStr, result: 123456
173 cycles for UBTD, result: 123,456
195 cycles for UBTDx, result: 123456
19 cycles for b2a3264, result: 123456
13 cycles for b2a3264x5, result: 123456
191 cycles for sprintf, result: 1
13 cycles for umqtoa, result:
190 cycles for Asc64, result:
10 cycles for U64ToStr, result: 1
194 cycles for UBTD, result: 1
210 cycles for UBTDx, result: 1
14 cycles for b2a3264, result: 1
6 cycles for b2a3264x5, result: 1
189 cycles for sprintf, result: 0
12 cycles for umqtoa, result:
179 cycles for Asc64, result:
9 cycles for U64ToStr, result: 0
188 cycles for UBTD, result: 0
214 cycles for UBTDx, result: 0
15 cycles for b2a3264, result: 0
7 cycles for b2a3264x5, result: 0
Looking back over what's gone before, I find I was blinded by the apparent low number of lingo's routine, but as you said earlier, I think the Dixon routine is just as fast, and it is much better commented and logical, although they bear an uncanny resemblance to each other. I have no idea which came first, but I'd put my money on Dixon. I just wasted two days screwing around with Lingos routine but I'm switching to Dixon simply because of the documentation :)
Check here (https://translate.google.it/translate?sl=ru&tl=en&js=y&prev=_t&hl=en&ie=UTF-8&u=http%3A%2F%2Fwww.cyberforum.ru%2Fasm-beginners%2Fthread988568.html&edit-text=). It looks a bit garbled, but if you search Dixon inside the page, you'll find it.
Here (https://forum.powerbasic.com/forum/user-to-user-discussions/powerbasic-for-windows/9890-assembler-format-question) is January 2004 code by Paul Dixon (Assembler embedded in PowerBasic). Looks different but gives you an idea how old the final version might be. Hutch converted a similar one to Masm (http://masm32.com/board/index.php?topic=4095.0), also in 2010: This post is mainly for Paul Dixon as it is a modified version of his conversion algo that Ian_B modified (http://www.masmforum.com/board/index.php?topic=14642.msg119179#msg119179)
pushad 'save registers
sub esp,12 'create a bit of workspace on the stack
mov edi,esp 'point edi at workspace
fild n&& 'load the data into FPU
mov eax,n&& 'must dereference it since it was passed as a parameter
fild qword [eax]
fbstp tbyte [edi] 'convert data to BCD and save it
mov esi,xp& 'pointer to result string
mov ecx,1 'loop counter for the 2 DWORDS holding the result
mov edx,0 'need to count backwards too as string is stored the opposite way to integer
p:
mov eax,[edi+edx*4+1] 'do conversion 4 low nibbles at a time
and eax,&h0f0f0f0f
add eax,&h30303030
mov [ecx*8+esi+7],al
shr eax,8
mov [ecx*8+esi+5],al
shr eax,8
mov [ecx*8+esi+3],al
shr eax,8
mov [ecx*8+esi+1],al
shr eax,8
mov eax,[edi+edx*4+1] 'and 4 high nibbles at a time
and eax,&hf0f0f0f0
shr eax,4
add eax,&h30303030
mov [ecx*8+esi+6],al
shr eax,8
mov [ecx*8+esi+4],al
shr eax,8
mov [ecx*8+esi+2],al
shr eax,8
mov [ecx*8+esi],al
shr eax,8
inc edx
dec ecx 'finished 2 DWORDs?
jns lp
mov eax,[edi] 'yes, now do the 2 left over digits that didn't fit (18 digits)
and eax,&h0f
add eax,&h30
mov [esi+17],al
mov eax,[edi]
and eax,&hf0
shr eax,4
add eax,&h30
mov [esi+16],al
add esp,12 'remove workspace from stack
popad 'restore registers
Thanks for everything, JJ.
Otherwise I'm done. I'm happy with my cleanup of the Dixon routine.
Hopefully, this is my last post on the topic and I can move up one level in my stack.
edit:
And of course, the first time I went to use it, I needed the size of the string, so I modified the attached code to return in in eax.
Quote from: jimg on July 09, 2017, 03:17:49 AM
Thanks for everything, JJ.
My pleasure, Jim :icon14:
@jimg
Maybe I should have mentioned this earlier.
The Dixon procedure is fine as long as you realize and understand the limitations of using the fbstp instruction for the conversion. See
http://www.ray.masmcode.com/tutorial/fpuchap6.htm#fbstp
The above link also has a sub-link to an explanation of the 'packed BCD format' used by the FPU. That may help you understand what those conversion procedures are attempting to do.
Ray,
Can you explain to us mere mortals where the BCD elements are in Dixon's code? I can't see them...
uqword proc ; lpbuf:DWORD, lpNumber:DWORD ;unsigned DWORD to ASCII, Paul Dixon
mov ecx, [esp+2*4] ; lp qword number
mov eax, [ecx] ; eax->low dword
mov edx, [ecx+4] ; edx->high dword
mov ecx, [esp+1*4] ; ecx, lpbuf
or edx, edx ;if top word is not used then..
jz udword ; .. use unsigned Dword routine as it;s faster
push ebp ;save registers that need to be saved
push esi
mov ebp, eax ; save a copy of low word for later
mov esi, edx ; save a copy of high word for later
mov pAnswer, ecx ; save a copy of buffer pointer for later, don;t stack it or it;s awkward to get back
; do 64 bit multiply by 2^110\1e14+1 to make it more likely I get no rounding errors = 0B424DC35 095CD810h
; this is a 4 part operation, LOxLO, LOxHI, HIxLO, HIxHI and add the 4 results offset appropriately
mov ecx, 095CD810h ; 2^110\1e14+1 low word
mul ecx ;
mov eax, esi ; get number high word, LSBs of MUL not needed so they;re ignored
; nop
push edi
push ebx
mov edi, edx ; save high word of result
mul ecx ; now do high word mul
mov ecx, 0B424DC35h ; get ready for other half of MUL
add edi, eax ; Add low word into result
adc edx, 0 ; and handle the possible carry
mov eax, ebp ; Get low word of number again
mov ebx, edx ; save high word of answer
mul ecx ; do next part
add edi, eax ; add into answer
mov eax, esi ; get high word of number
adc ebx, edx ; add it in to answer #####? possible carry to higher word? probably not..
mul ecx ; do final mul
mov ecx, 1000000 ; ready for later
add ebx, eax ; add in result
adc edx, 0 ; and carry
add edi, 16384 ; round up last bit to decrease error to within that required.
adc ebx, 0
adc edx, 0
shrd edi, ebx, 14 ;correct for the 14 bit shift used to increase accuracy
shrd ebx, edx, 14
shr edx, 14 ; edx contains top 6 digits, edi:ebx contain the information to get the next 6 digits
; edx = 2D093h ebx= 70D42573h edi=603A5EDAh
; 64 bit multiply done
; result in edx:ebx:edi , original number in esi:ebp
; x 1000000 to get next 6 digits
mov eax, edi
mov esi, edx ;save top6 in esi
mul ecx
mov eax, ebx ;do low word x1 000 000
mov ebx, edx ;
mul ecx ;do high word x 1 000 000
add eax, ebx ;add both together
adc edx, 0
mov ebx, edx ;save 2nd 6 in ebx
; now get ((top6 x 1e6) + next6)*1e8 and sub from original number to leave last 8 digits
; since the 8 digits we want are all contained in the low word we can completely ignore the high word
; this allows imul and does away with carries to the high word
mov eax, esi ;get top 6
imul eax, ecx ;top 6 x 1 000 000
mov ecx, 100000000
add eax, ebx ;top6 x 1 000 000 + next 6
imul eax, ecx
sub ebp, eax ;ebp=last 8 digits
; 20 digits broken into 6,6,8 now display them
; esi=top6, ebx=next 6, ebp=last 8
; do top 6 digits
mov eax, esi ; get top 6 digits
mov edi, 68DB9h ; =2^32\10000+1
mul edi
mov esi, pAnswer ;offset TimeBuffer ;get pointer into answer buffer
mov ecx, 100 ;multiplier for later
jnc qnextrw1 ;if zero, supress them by ignoring
cmp edx, 9 ;1 digit or 2?
ja qZeroSupressedo ;2 digits, just continue with pairs of digits to the end
mov edx, dword ptr chartab[edx+edx] ;look up 2 digits
mov [esi], dh ;but only write the 1 we need, supress the leading zero
inc esi ;update pointer by 1
jmp QZoS1 ;continue with pairs of digits to the end
qnextrw1:
mul ecx ;get next 2 digits
jnc qnextrw2 ;if zero, supress them by ignoring
cmp edx, 9 ;1 digit or 2?
ja QZoS1a ;2 digits, just continue with pairs of digits to the end
mov edx, dword ptr chartab[edx+edx] ;look up 2 digits
mov [esi], dh ;but only write the 1 we need, supress the leading zero
inc esi ;update pointer by 1
jmp QZoS2 ;continue with pairs of digits to the end
qnextrw2:
mul ecx ;get next 2 digits
jnc qnextrw3 ;if zero, supress them by ignoring
cmp edx, 9 ;1 digit or 2?
ja QZoS2a ;2 digits, just continue with pairs of digits to the end
mov edx, dword ptr chartab[edx+edx] ;look up 2 digits
mov [esi], dh ;but only write the 1 we need, supress the leading zero
inc esi ;update pointer by 1
jmp QZoS3 ;continue with pairs of digits to the end
; next 6 digits
qnextrw3:
mov eax,ebx ;get 2nd 6 digits
mov ebx, 28F5C29h ;=2^32\100+1 ready for later
mul edi ;edi=2^32\10000+1
jnc qnextrw4 ;if zero, supress them by ignoring
cmp edx, 9 ;1 digit or 2?
ja QZSo3a ;2 digits, just continue with pairs of digits to the end
mov edx, dword ptr chartab[edx+edx] ;look up 2 digits
mov [esi], dh ;but only write the 1 we need, supress the leading zero
inc esi ;update pointer by 1
jmp QZSo4 ;continue with pairs of digits to the end
qnextrw4:
mul ecx ;get next 2 digits
jnc QZSo5 ;if zero, supress them by ignoring
cmp edx, 9 ;1 digit or 2?
ja QZSo4a ;2 digits, just continue with pairs of digits to the end
mov edx, dword ptr chartab[edx+edx] ;look up 2 digits
mov [esi], dh ;but only write the 1 we need, supress the leading zero
inc esi ;update pointer by 1
jmp QZSo5 ;continue with pairs of digits to the end
; done top 10 digits
; since we took a short cut to the DWORD routine at the start we can never get beyond here
; as the DWORD routine would handle it instead.
; At this point we are guaranteed to have exactly 10 digits to print so just jump to the relevant spot.
qZeroSupressedo:
mov edx, dword ptr chartab[edx+edx] ;look up the 2 digits
mov [esi], dx
add esi, 2
QZoS1:
mul ecx
QZoS1a:
mov edx, dword ptr chartab[edx+edx] ;look up the 2 digits
mov [esi], dx
add esi, 2
QZoS2:
mul ecx
QZoS2a:
mov edx, dword ptr chartab[edx+edx] ;look up the 2 digits
mov [esi], dx
add esi, 2
sj:
QZoS3:
;do next 6 digits
mov eax, ebx ;get 2nd 6 digits
mov ebx, 28F5C29h ;=2^32\100+1 ready for later
mul edi ;edi=2^32\10000+1
QZSo3a:
mov edx, dword ptr chartab[edx+edx]
mov [esi], dx
add esi, 2
QZSo4:
mul ecx
QZSo4a:
mov edx, dword ptr chartab[edx+edx]
mov [esi], dx
add esi, 2
QZSo5:
mul ecx
;QZSo5a:
mov edx, dword ptr chartab[edx+edx]
mov [esi], dx
add esi, 2
;do final 8 digits
mov eax, ebp ;get last 8 digits
mul ebx ;ebx=2^32\100+1
mov eax, edx
mov ebx, edx
mul edi ;edi=2^32\10000+1
mov edx, dword ptr chartab[edx+edx] ;look up next 2 digits
mov [esi], dx
add esi, 2
mul ecx
mov edx, dword ptr chartab[edx+edx] ;look up next 2 digits
mov [esi], dx
add esi, 2
mul ecx
mov edx, dword ptr chartab[edx+edx] ;look up next 2 digits
mov [esi], dx
add esi, 2
mov eax, ebx ;first 6 digits of last 8
imul eax, ecx ;x100 to shift into place
pop ebx
pop edi
mov edx, ebp ;last 8 - 6 just done gives final 2 digits
sub edx, eax ;look up final 2 digits
mov edx, dword ptr chartab[edx+edx]
mov [esi], dx
add esi, 2
mov byte ptr [esi],0 ;need to zero terminate
pop esi
pop ebp
AllDone:
mov eax, [esp+4]
ret 2*4
udword:
push edi ;save registers that need to be saved
push esi
mov esi, ecx ; sptr
mov edi,eax ;eax= x ;save a copy of the number
mov edx, 0D1B71759h ;=2^45\10000 13 bit extra shift
mul edx ;gives 6 high digits in edx
mov eax,68DB9h ;=2^32\10000+1
shr edx,13 ;correct for multiplier offset used to give better accuracy
jz short skiphighdigits ;if zero then don;t need to process the top 6 digits
mov ecx,edx ;get a copy of high digits
imul ecx,10000 ;scale up high digits
sub edi,ecx ;subtract high digits from original. EDI now = lower 4 digits
mul edx ;get first 2 digits in edx
mov ecx,100 ;load ready for later
jnc short next1 ;if zero, supress them by ignoring
cmp edx,9 ;1 digit or 2?
ja short ZeroSupressed ;2 digits, just continue with pairs of digits to the end
mov edx,dword ptr chartab[edx*2] ;look up 2 digits
mov [esi],dh ;but only write the 1 we need, supress the leading zero
inc esi ;update pointer by 1
jmp short ZS1 ;continue with pairs of digits to the end
next1:
mul ecx ;get next 2 digits
jnc short next2 ;if zero, supress them by ignoring
cmp edx,9 ;1 digit or 2?
ja short ZS1a ;2 digits, just continue with pairs of digits to the end
mov edx,dword ptr chartab[edx*2] ;look up 2 digits
mov [esi],dh ;but only write the 1 we need, supress the leading zero
inc esi ;update pointer by 1
jmp short ZS2 ;continue with pairs of digits to the end
next2:
mul ecx ;get next 2 digits
jnc short next3 ;if zero, supress them by ignoring
cmp edx,9 ;1 digit or 2?
ja short ZS2a ;2 digits, just continue with pairs of digits to the end
mov edx,dword ptr chartab[edx*2] ;look up 2 digits
mov [esi],dh ;but only write the 1 we need, supress the leading zero
inc esi ;update pointer by 1
jmp short ZS3 ;continue with pairs of digits to the end
next3:
skiphighdigits:
mov eax,edi ;get lower 4 ditigs
mov ecx,100
mov edx,28F5C29h ;2^32\100 +1
mul edx
jnc short next4 ;if zero, supress them by ignoring
cmp edx,9 ;1 digit or 2?
ja short ZS3a ;2 digits, just continue with pairs of digits to the end
mov edx,dword ptr chartab[edx*2] ;look up 2 digits
mov [esi],dh ;but only write the 1 we need, supress the leading zero
inc esi ;update pointer by 1
jmp short ZS4 ;continue with pairs of digits to the end
next4:
mul ecx ;this is the last pair so don;t supress a single zero
cmp edx,9 ;1 digit or 2?
ja short ZS4a ;2 digits, just continue with pairs of digits to the end
mov edx,dword ptr chartab[edx*2] ;look up 2 digits
mov [esi],dh ;but only write the 1 we need, supress the leading zero
mov byte ptr [esi+1],0 ;zero terminate string
jmp short xit ;all done
ZeroSupressed:
mov edx,dword ptr chartab[edx*2] ;look up 2 digits
mov [esi],dx
add esi,2 ;write them to answer
ZS1:
mul ecx ;get next 2 digits
ZS1a:
mov edx,dword ptr chartab[edx*2] ;look up 2 digits
mov [esi],dx ;write them to answer
add esi,2
ZS2:
mul ecx ;get next 2 digits
ZS2a:
mov edx,dword ptr chartab[edx*2] ;look up 2 digits
mov [esi],dx ;write them to answer
add esi,2
ZS3:
mov eax,edi ;get lower 4 digits
mov edx,28F5C29h ;2^32\100 +1
mul edx ;edx= top pair
ZS3a:
mov edx,dword ptr chartab[edx*2] ;look up 2 digits
mov [esi],dx ;write to answer
add esi,2 ;update pointer
ZS4:
mul ecx ;get final 2 digits
ZS4a:
mov edx,dword ptr chartab[edx*2] ;look them up
mov [esi],dx ;write to answer
mov byte ptr [esi+2],0 ;zero terminate string
xit:
pop esi ;restore used registers
pop edi
jmp AllDone
uqword endp
Quotesub esp,12 'create a bit of workspace on the stack
mov edi,esp 'point edi at workspace
fild n&& 'load the data into FPU
mov eax,n&& 'must dereference it since it was passed as a parameter
fild qword [eax]
fbstp tbyte [edi] 'convert data to BCD and save it
mov esi,xp& 'pointer to result string
mov ecx,1 'loop counter for the 2 DWORDS holding the result
mov edx,0 'need to count backwards too as string is stored the opposite way to integer
p:
mov eax,[edi+edx*4+1] 'do conversion 4 low nibbles at a time
Above is part of Dixon's code which you posted previously.
The first red line shows that EDI is used for pointing to a workspace reserved on the stack by the previous instruction.
The following green line indicates that the target dword is loaded on the FPU.
The next red instruction specifies that the dword gets converted to the BCD format and stored in the reserved workspace on the stack.
Then, the BCD nibbles get recovered sequentially from the stack for unpacking, as pointed to by EDI/EDX in the next red instruction (and other subsequent similar instructions).
Quote from: raymond on July 11, 2017, 02:19:33 AMAbove is part of Dixon's code which you posted previously.
Oops, I forgot - the old routine he posted in 2004. Yes, that is BCD code, of course.
:dazzled: Kind of confusing when the same author issues separate procedures for the same purpose.
Anyway, my initial comment was primarily to warn jimg about the use of such instructions without knowing about its limitations, in addition to explaining what it does.
I started out trying to use the FPU, but quickly realized it takes twice as long to do a single FINIT than the whole non-FPU routine. Rather disheartening.
Quote from: jimg on July 11, 2017, 03:19:39 PM
I started out trying to use the FPU, but quickly realized it takes twice as long to do a single FINIT than the whole non-FPU routine.
You need finit only once in your program, or at the beginning of a loop. In this earlier post (http://masm32.com/board/index.php?topic=221.msg68324#msg68324), there are three FPU routines that are, for example, faster than umqtoa. Problem is that a handful of values are, ehm, incorrect because of rounding errors. Tweaking might solve this issue. As I wrote earlier, this is quite experimental :biggrin:
So I can assume that if I call some other routine in some other library not written by me, that it will not screw up the FPU? Which implies by extension that if I write a general purpose routine, it is incumbent upon me to return the FPU in a clean state?
Good question, Jim :t
There is a thread on MSDN social (https://social.msdn.microsoft.com/Forums/expression/en-US/c7538f4f-0bd4-4838-8a5a-82d6e9e6fbc7/does-the-x86-abi-permit-the-fpu-status-register-to-be-changed?forum=vclanguage) saying "The ABI requires that the control word is preserved", and a Masm32 thread (http://masm32.com/board/index.php?topic=292.0) on "Application Binary Interface (ABI), calling conventions and the like". On SOF, they discuss Is it necessary to save the FPU state here? (https://stackoverflow.com/questions/35963861/is-it-necessary-to-save-the-fpu-state-here)
If anybody has an official ABI info on the fpu in x86/x64, please post a link.
In practice, I never have seen the fpu control word change when calling a Windows function. If that function uses the fpu, and doesn't fully restore everything, then some register contents will be gone, of course.
The MasmBasic library has 65 occurrences of ffree st(7) - the usual way to ensure that the fpu behaves well when using it. Just tested with a Window application under Win7-64, and it seems that Windows doesn't touch the fpu at all. Even after the WM_PAINT handler, all content is still there. Don't rely on that, it may differ between Windows versions.
In a program I've been working on, I always do a FINIT at the start of the several procs that use the FPU, and set the control word to truncate. If the ABI requires that the control word be preserved, then I need to restore it before leaving each of these procs?
How does ffree st(7) make the fpu behave? The description says it just sets it to empty.
Sounds like you have to worry about both ends. You can't count on what state you get, but have to leave in a clean state.
Quote from: jimg on July 12, 2017, 12:37:36 AM
In a program I've been working on, I always do a FINIT at the start of the several procs that use the FPU, and set the control word to truncate. If the ABI requires that the control word be preserved
...
TheCalculator starts with FINIT and uses FINIT in the conversion routines only - string to real10 and real10 to string. And control word is preserved. It seems it works correctly always. Whenever it finds an
invalid operation it does
FCLEX and
exit with an error code.
Most Microsoft programmers working on the Windows OS probably don't know anything about FPU instructions. However, they do use (maybe unknowingly) the FPU general registers without preserving their content. I doubt that they would change any of the control registers. If you want to see Windows in action with fpu general registers, write a test program loading some values on the FPU and then use the MessageBox API to display whatever you like. Then run that program with Ollydbg and watch the content of the fpu registers when the API is called.
THEREFORE, I consider all general and control FPU registers as free to use and would personally not preserve any of them because of other external programs which may use them. I may preserve them strictly for the internal needs of my own program. (Because the Fpulib would be used within one's program, and some of the data within registers might be required to be preserved for the user, each of the functions within that library does an initial fpu save and a fpu restore on exit.)
As for using the FINIT at the very beginning of a program, its only function would be to change the precision control from double-precision (64-bit) to extended double-precision (80-bit); all other general and control registers are provided initialized to an exe upon loading it. Obviously, if you need to change any of the controls for specific purposes, you adjust them as needed but there is no need to preserve them unless other parts of your own program expects them to have them preserved.
The ffree st(7) instruction by itself would not necessarily make the fpu behave. It would only guarantee that the next immediately following fld or fild instruction would be carried out without an exception being raised.
To all those other members following this thread without much knowledge about the FPU, you may want to have an initial quick glance at http://www.ray.masmcode.com/tutorial/index.html (http://www.ray.masmcode.com/tutorial/index.html)
Ray's tutorial is an absolute "must read" indeed. Yes, I had forgotten MessageBox, it erases ST(6) and ST(7). Which is normally not a problem, nobody uses the fpu so deeply and then launches a MsgBox while intermediate results are still in the fpu.
In short: don't worry. Do what you have to do with the fpu.
Heh; nothing like reviving a 6-year-old thread. Like Lazarus.
So here's my entry for the quadword-to-ASCII conversion contest. Won't win any prizes for speed or cleverness, but I might get an honorable mention for simplicity. Tested and it works. Uses no multiplication or division.
I got the successive-subtraction idea from a link that JJ posted (https://www.andrijar.com/ascii/index.html) a long time ago.
;===============================================
; Q2A -- Unsigned quadword to ASCII converter
;
; Uses successive subtraction (+ addition),
; no multiplication or division.
;===============================================
.data
PowersOf10 LABEL QWORD
DQ 10000000000000000000
DQ 1000000000000000000
DQ 100000000000000000
DQ 10000000000000000
DQ 1000000000000000
DQ 100000000000000
DQ 10000000000000
DQ 1000000000000
DQ 100000000000
DQ 10000000000
DQ 1000000000
DQ 100000000
DQ 10000000
DQ 1000000
DQ 100000
DQ 10000
DQ 1000
DQ 100
DQ 10
ASCIIbuffer DB 22 DUP(?)
.code
;====================================================================
; Q2A()
;
; Convert 64-bit unsigned integer (QWORD) to ASCII decimal string
;
; On entry,
; EDX:EAX = QWORD value to convert
;
; Returns:
; EAX--> buffer containing numeric string (points to 1st non-zero char.)
;
; Tested 8/17/23 --works--
;====================================================================
Q2A PROC
PUSH EBX
PUSH EDI
MOV EBX, OFFSET PowersOf10
MOV EDI, OFFSET ASCIIbuffer
MOV CH, 19 ;Loop counter.
sublp: XOR CL, CL ;Subtraction counter.
resub: INC CL
SUB EAX, DWORD PTR [EBX]
SBB EDX, DWORD PTR [EBX + SIZEOF DWORD]
JNC resub
backup: DEC CL ;CL = this digit.
ADD CL, '0'
MOV [EDI], CL
INC EDI
; Add back power of 10 to make up for overshoot:
ADD EAX, DWORD PTR [EBX]
ADC EDX, DWORD PTR [EBX + SIZEOF DWORD]
ADD EBX, SIZEOF QWORD ;Next power of 10.
DEC CH
JNZ sublp
ADD AL, '0'
XOR AH, AH
STOSW ;Store last char. + terminator.
; Leading zero suppression:
MOV EDX, OFFSET ASCIIbuffer
MOV ECX, 19
zloop: CMP BYTE PTR [EDX], '0'
JNE done
INC EDX
LOOP zloop
done: MOV EAX, EDX
POP EDI
POP EBX
RET
Q2A ENDP