I have been working an assembly support library for DOS DPMI apps, and I finally finished a part of it.
;==============================================================================
include \masm32\include\masm32rt.inc
.686
include new_counter.asm
;==============================================================================
.data
buffer1 db 36 dup(0)
buffer2 db 36 dup(0)
.code
;==============================================================================
;---------------------------------------------------
; This optimizes the divide for the two most common
; power-of-2 bases. Expects EBX=base and EDX=0.
;---------------------------------------------------
DODIV MACRO
cmp ebx, 16
jne @F
mov edx, eax
and edx, 01111b
shr eax, 4
jmp LL
@@:
cmp ebx, 2
jne @F
mov edx, eax
and edx, 01b
shr eax, 1
jmp LL
@@:
idiv ebx
LL:
ENDM
;==============================================================================
;------------------------------------------------------------
; Base (radix) must be in the range 2-36, no error checking!
;------------------------------------------------------------
itoa proc value:DWORD, pstr:DWORD, base:DWORD
push ebx
push edi
mov ebx, base ; radix
mov edi, pstr
mov eax, value
xor ecx, ecx ; digit counter
test eax, eax
jns L0
cmp ebx, 10
jne L0
mov BYTE PTR [edi], '-' ; this only for negative base 10
inc edi
neg eax
L0:
xor edx, edx ; need only 32-bit dividend
DODIV ; divide value by base
push edx ; push remainder = digit value
inc ecx
test eax, eax
jnz L0 ; continue until value = 0
add edi, ecx ; rig these to use counter as
neg ecx ; both counter and index
L1:
pop edx ; pop digit value and convert to digit
cmp edx, 10
jb L2
add edx, 'a'-10 ; 'a' instead of 'A' to match crt itoa output
jmp L3
L2:
add edx, '0'
L3:
mov [edi+ecx], dl ; store digit in string
inc ecx
jnz L1
mov BYTE PTR [edi+ecx], 0 ; append null terminator
pop edi
pop ebx
ret
itoa endp
;==============================================================================
start:
;==============================================================================
;-------------------------------------------------
; Test boundary values for the most common bases.
;-------------------------------------------------
invoke itoa, 0, ADDR buffer1, 2
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 0, ADDR buffer2, 2
printf("%s\n", ADDR buffer2)
invoke itoa, 7fffffffh, ADDR buffer1, 2
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 7fffffffh, ADDR buffer2, 2
printf("%s\n", ADDR buffer2)
invoke itoa, 80000000h, ADDR buffer1, 2
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 80000000h, ADDR buffer2, 2
printf("%s\n", ADDR buffer2)
invoke itoa, 0ffffffffh, ADDR buffer1, 2
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 0ffffffffh, ADDR buffer2, 2
printf("%s\n", ADDR buffer2)
invoke itoa, 0, ADDR buffer1, 8
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 0, ADDR buffer2, 8
printf("%s\n", ADDR buffer2)
invoke itoa, 7fffffffh, ADDR buffer1, 8
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 7fffffffh, ADDR buffer2, 8
printf("%s\n", ADDR buffer2)
invoke itoa, 80000000h, ADDR buffer1, 8
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 80000000h, ADDR buffer2, 8
printf("%s\n", ADDR buffer2)
invoke itoa, 0ffffffffh, ADDR buffer1, 8
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 0ffffffffh, ADDR buffer2, 8
printf("%s\n", ADDR buffer2)
invoke itoa, 0, ADDR buffer1, 10
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 0, ADDR buffer2, 10
printf("%s\n", ADDR buffer2)
invoke itoa, 7fffffffh, ADDR buffer1, 10
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 7fffffffh, ADDR buffer2, 10
printf("%s\n", ADDR buffer2)
invoke itoa, 80000000h, ADDR buffer1, 10
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 80000000h, ADDR buffer2, 10
printf("%s\n", ADDR buffer2)
invoke itoa, 0ffffffffh, ADDR buffer1, 10
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 0ffffffffh, ADDR buffer2, 10
printf("%s\n", ADDR buffer2)
invoke itoa, 0, ADDR buffer1, 16
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 0, ADDR buffer2, 16
printf("%s\n", ADDR buffer2)
invoke itoa, 7fffffffh, ADDR buffer1, 16
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 7fffffffh, ADDR buffer2, 16
printf("%s\n", ADDR buffer2)
invoke itoa, 80000000h, ADDR buffer1, 16
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 80000000h, ADDR buffer2, 16
printf("%s\n", ADDR buffer2)
invoke itoa, 0ffffffffh, ADDR buffer1, 16
printf("%s\n", ADDR buffer1)
invoke crt__itoa, 0ffffffffh, ADDR buffer2, 16
printf("%s\n\n", ADDR buffer2)
inkey
invoke GetCurrentProcess
invoke SetProcessAffinityMask, eax, 1
invoke Sleep, 8000
PC = REALTIME_PRIORITY_CLASS
TP = THREAD_PRIORITY_TIME_CRITICAL
REPEAT 3
counter_begin 10000000, PC, TP
invoke itoa, 12345678, ADDR buffer1, 2
counter_end
printf("\nitoa base 2 : %d cycles\n", eax)
counter_begin 10000000, PC, TP
invoke crt__itoa, 12345678, ADDR buffer2, 2
counter_end
printf("crt__itoa base 2 : %d cycles\n\n", eax)
counter_begin 10000000, PC, TP
invoke itoa, 12345678, ADDR buffer1, 10
counter_end
printf("itoa base 10 : %d cycles\n", eax)
counter_begin 10000000, PC, TP
invoke crt__itoa, 12345678, ADDR buffer2, 10
counter_end
printf("crt__itoa base 10: %d cycles\n\n", eax)
counter_begin 10000000, PC, TP
invoke itoa, 12345678, ADDR buffer1, 16
counter_end
printf("itoa base 16 : %d cycles\n", eax)
counter_begin 10000000, PC, TP
invoke crt__itoa, 12345678, ADDR buffer2, 16
counter_end
printf("crt__itoa base 16: %d cycles\n\n", eax)
ENDM
inkey
;------------------------------------------------
; Compare output to that of the CRT function for
; 1,000,000,000 random values and bases.
;------------------------------------------------
xor ebx, ebx
.WHILE ebx < 1000000000
invoke nrandom, 35 ; base must be in range 2-36
add eax, 2
mov edi, eax
invoke nrandom, -1
mov esi, eax
invoke itoa, esi, ADDR buffer1, edi
invoke crt__itoa, esi, ADDR buffer2, edi
;printf("%d\t%d\t%s\t%s\n", esi, edi, ADDR buffer1, ADDR buffer2)
mov eax, ebx
cdq
mov ecx, 1000000
div ecx
.IF edx == 0
printf("%d\n",ebx)
.ENDIF
invoke crt_strcmp, ADDR buffer1, ADDR buffer2
.IF eax != 0
printf("ERROR\n")
inkey
.ENDIF
inc ebx
.ENDW
inkey
exit
;==============================================================================
end start
char *_itoa(
int value,
char *str,
int radix
);
Optimizing the divide operation for the two most common base values 2 and 16 slowed the procedure down somewhat for the other bases, but made it much faster for bases 2 and 16. Typical cycle counts running on my P4 Northwood:
itoa base 2 : 322 cycles
crt__itoa base 2 : 1420 cycles
itoa base 10 : 515 cycles
crt__itoa base 10: 461 cycles
itoa base 16 : 91 cycles
crt__itoa base 16: 353 cycles
itoa base 2 : 322 cycles
crt__itoa base 2 : 1421 cycles
itoa base 10 : 515 cycles
crt__itoa base 10: 460 cycles
itoa base 16 : 91 cycles
crt__itoa base 16: 350 cycles
itoa base 2 : 325 cycles
crt__itoa base 2 : 1420 cycles
itoa base 10 : 515 cycles
crt__itoa base 10: 460 cycles
itoa base 16 : 87 cycles
crt__itoa base 16: 352 cycles
Michael,
thank you for providing the code. It runs fine under 64 bit. Here is the result:
itoa base 2 : 223 cycles
crt__itoa base 2 : 525 cycles
itoa base 10 : 123 cycles
crt__itoa base 10: 148 cycles
itoa base 16 : 48 cycles
crt__itoa base 16: 103 cycles
itoa base 2 : 231 cycles
crt__itoa base 2 : 553 cycles
itoa base 10 : 122 cycles
crt__itoa base 10: 148 cycles
itoa base 16 : 48 cycles
crt__itoa base 16: 103 cycles
itoa base 2 : 231 cycles
crt__itoa base 2 : 554 cycles
itoa base 10 : 122 cycles
crt__itoa base 10: 148 cycles
itoa base 16 : 48 cycles
crt__itoa base 16: 103 cycles
Press any key to continue ...
Gunther
AMD X64
itoa base 2 : 285 cycles
crt__itoa base 2 : 763 cycles
itoa base 10 : 323 cycles
crt__itoa base 10: 254 cycles
itoa base 16 : 55 cycles
crt__itoa base 16: 207 cycles
itoa base 2 : 293 cycles
crt__itoa base 2 : 763 cycles
itoa base 10 : 323 cycles
crt__itoa base 10: 254 cycles
itoa base 16 : 55 cycles
crt__itoa base 16: 207 cycles
itoa base 2 : 293 cycles
crt__itoa base 2 : 763 cycles
itoa base 10 : 323 cycles
crt__itoa base 10: 254 cycles
itoa base 16 : 55 cycles
crt__itoa base 16: 207 cycles
Press any key to continue ...
I could remark, that CRT now require support of national digit characters, so may be better to get them from local array, that fills by locale know function, as I'm using strnal(char*, size_t) for it.
this would increase the speed a little bit:
in this case it will take care of edx itself
DODIV MACRO
mov edx, eax
.if (ebx == 16)
and edx, 01111b
shr eax, 4
.elseif (ebx == 2)
and edx, 01b
shr eax, 1
.else
xor edx,edx
idiv ebx
.endif
ENDM
Quote from: habran on December 16, 2013, 06:12:16 AM
this would increase the speed a little bit:
Yes, thanks, I can now squeeze another 2 cycles out of it :biggrin:
there you go :greenclp:
if you call this macro 100 times you will squeeze 200 cycles :biggrin:
you humans are aware that no one is perfect but you never give up trying ::)