News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

Assembly version of CRT _itoa function

Started by MichaelW, December 15, 2013, 01:31:47 PM

Previous topic - Next topic

MichaelW

I have been working an assembly support library for DOS DPMI apps, and I finally finished a part of it.

;==============================================================================
include \masm32\include\masm32rt.inc
.686
include new_counter.asm
;==============================================================================
.data
    buffer1 db 36 dup(0)
    buffer2 db 36 dup(0)
.code
;==============================================================================
;---------------------------------------------------
; This optimizes the divide for the two most common
; power-of-2 bases. Expects EBX=base and EDX=0.
;---------------------------------------------------

DODIV MACRO
    cmp ebx, 16
    jne @F
    mov edx, eax
    and edx, 01111b
    shr eax, 4
    jmp LL
  @@:
    cmp ebx, 2
    jne @F
    mov edx, eax
    and edx, 01b
    shr eax, 1
    jmp LL
  @@:
    idiv ebx
  LL:
ENDM

;==============================================================================
;------------------------------------------------------------
; Base (radix) must be in the range 2-36, no error checking!
;------------------------------------------------------------

itoa proc value:DWORD, pstr:DWORD, base:DWORD
    push ebx
    push edi
    mov ebx, base               ; radix
    mov edi, pstr
    mov eax, value
    xor ecx, ecx                ; digit counter
    test eax, eax
    jns L0
    cmp ebx, 10
    jne L0
    mov BYTE PTR [edi], '-'     ; this only for negative base 10
    inc edi
    neg eax
  L0:
    xor edx, edx                ; need only 32-bit dividend
    DODIV                       ; divide value by base
    push edx                    ; push remainder = digit value
    inc ecx
    test eax, eax
    jnz L0                      ; continue until value = 0
    add edi, ecx                ; rig these to use counter as
    neg ecx                     ;  both counter and index
  L1:
    pop edx                     ; pop digit value and convert to digit
    cmp edx, 10
    jb  L2
    add edx, 'a'-10             ; 'a' instead of 'A' to match crt itoa output
    jmp L3
  L2:
    add edx, '0'
  L3:
    mov [edi+ecx], dl           ; store digit in string
    inc ecx
    jnz L1
    mov BYTE PTR [edi+ecx], 0   ; append null terminator
    pop edi
    pop ebx
    ret
itoa endp

;==============================================================================
start:
;==============================================================================

    ;-------------------------------------------------
    ; Test boundary values for the most common bases.
    ;-------------------------------------------------

    invoke itoa, 0, ADDR buffer1, 2
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 0, ADDR buffer2, 2
    printf("%s\n", ADDR buffer2)
    invoke itoa, 7fffffffh, ADDR buffer1, 2
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 7fffffffh, ADDR buffer2, 2
    printf("%s\n", ADDR buffer2)
    invoke itoa, 80000000h, ADDR buffer1, 2
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 80000000h, ADDR buffer2, 2
    printf("%s\n", ADDR buffer2)
    invoke itoa, 0ffffffffh, ADDR buffer1, 2
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 0ffffffffh, ADDR buffer2, 2
    printf("%s\n", ADDR buffer2)

    invoke itoa, 0, ADDR buffer1, 8
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 0, ADDR buffer2, 8
    printf("%s\n", ADDR buffer2)
    invoke itoa, 7fffffffh, ADDR buffer1, 8
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 7fffffffh, ADDR buffer2, 8
    printf("%s\n", ADDR buffer2)
    invoke itoa, 80000000h, ADDR buffer1, 8
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 80000000h, ADDR buffer2, 8
    printf("%s\n", ADDR buffer2)
    invoke itoa, 0ffffffffh, ADDR buffer1, 8
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 0ffffffffh, ADDR buffer2, 8
    printf("%s\n", ADDR buffer2)

    invoke itoa, 0, ADDR buffer1, 10
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 0, ADDR buffer2, 10
    printf("%s\n", ADDR buffer2)
    invoke itoa, 7fffffffh, ADDR buffer1, 10
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 7fffffffh, ADDR buffer2, 10
    printf("%s\n", ADDR buffer2)
    invoke itoa, 80000000h, ADDR buffer1, 10
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 80000000h, ADDR buffer2, 10
    printf("%s\n", ADDR buffer2)
    invoke itoa, 0ffffffffh, ADDR buffer1, 10
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 0ffffffffh, ADDR buffer2, 10
    printf("%s\n", ADDR buffer2)

    invoke itoa, 0, ADDR buffer1, 16
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 0, ADDR buffer2, 16
    printf("%s\n", ADDR buffer2)
    invoke itoa, 7fffffffh, ADDR buffer1, 16
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 7fffffffh, ADDR buffer2, 16
    printf("%s\n", ADDR buffer2)
    invoke itoa, 80000000h, ADDR buffer1, 16
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 80000000h, ADDR buffer2, 16
    printf("%s\n", ADDR buffer2)
    invoke itoa, 0ffffffffh, ADDR buffer1, 16
    printf("%s\n", ADDR buffer1)
    invoke crt__itoa, 0ffffffffh, ADDR buffer2, 16
    printf("%s\n\n", ADDR buffer2)

    inkey

    invoke GetCurrentProcess
    invoke SetProcessAffinityMask, eax, 1

    invoke Sleep, 8000

    PC = REALTIME_PRIORITY_CLASS
    TP = THREAD_PRIORITY_TIME_CRITICAL

    REPEAT 3

        counter_begin 10000000, PC, TP
            invoke itoa, 12345678, ADDR buffer1, 2
        counter_end
        printf("\nitoa base 2      : %d cycles\n", eax)
        counter_begin 10000000, PC, TP
            invoke crt__itoa, 12345678, ADDR buffer2, 2
        counter_end
        printf("crt__itoa base 2 : %d cycles\n\n", eax)

        counter_begin 10000000, PC, TP
            invoke itoa, 12345678, ADDR buffer1, 10
        counter_end
        printf("itoa base 10     : %d cycles\n", eax)
        counter_begin 10000000, PC, TP
            invoke crt__itoa, 12345678, ADDR buffer2, 10
        counter_end
        printf("crt__itoa base 10: %d cycles\n\n", eax)

        counter_begin 10000000, PC, TP
            invoke itoa, 12345678, ADDR buffer1, 16
        counter_end
        printf("itoa base 16     : %d cycles\n", eax)
        counter_begin 10000000, PC, TP
            invoke crt__itoa, 12345678, ADDR buffer2, 16
        counter_end
        printf("crt__itoa base 16: %d cycles\n\n", eax)

    ENDM

    inkey

    ;------------------------------------------------
    ; Compare output to that of the CRT function for
    ; 1,000,000,000 random values and bases.
    ;------------------------------------------------

    xor ebx, ebx
    .WHILE ebx < 1000000000
        invoke nrandom, 35      ; base must be in range 2-36
        add eax, 2
        mov edi, eax
        invoke nrandom, -1
        mov esi, eax
        invoke itoa, esi, ADDR buffer1, edi
        invoke crt__itoa, esi, ADDR buffer2, edi
        ;printf("%d\t%d\t%s\t%s\n", esi, edi, ADDR buffer1, ADDR buffer2)
        mov eax, ebx
        cdq
        mov ecx, 1000000
        div ecx
        .IF edx == 0
            printf("%d\n",ebx)
        .ENDIF
        invoke crt_strcmp, ADDR buffer1, ADDR buffer2
        .IF eax != 0
            printf("ERROR\n")
            inkey
        .ENDIF
        inc ebx
    .ENDW

    inkey
    exit

;==============================================================================
end start

char *_itoa(
   int value,
   char *str,
   int radix
);


Optimizing the divide operation for the two most common base values 2 and 16 slowed the procedure down somewhat for the other bases, but made it much faster for bases 2 and 16. Typical cycle counts running on my P4 Northwood:

itoa base 2      : 322 cycles
crt__itoa base 2 : 1420 cycles

itoa base 10     : 515 cycles
crt__itoa base 10: 461 cycles

itoa base 16     : 91 cycles
crt__itoa base 16: 353 cycles


itoa base 2      : 322 cycles
crt__itoa base 2 : 1421 cycles

itoa base 10     : 515 cycles
crt__itoa base 10: 460 cycles

itoa base 16     : 91 cycles
crt__itoa base 16: 350 cycles


itoa base 2      : 325 cycles
crt__itoa base 2 : 1420 cycles

itoa base 10     : 515 cycles
crt__itoa base 10: 460 cycles

itoa base 16     : 87 cycles
crt__itoa base 16: 352 cycles


Well Microsoft, here's another nice mess you've gotten us into.

Gunther

Michael,

thank you for providing the code. It runs fine under 64 bit. Here is the result:

itoa base 2      : 223 cycles
crt__itoa base 2 : 525 cycles

itoa base 10     : 123 cycles
crt__itoa base 10: 148 cycles

itoa base 16     : 48 cycles
crt__itoa base 16: 103 cycles

itoa base 2      : 231 cycles
crt__itoa base 2 : 553 cycles

itoa base 10     : 122 cycles
crt__itoa base 10: 148 cycles

itoa base 16     : 48 cycles
crt__itoa base 16: 103 cycles

itoa base 2      : 231 cycles
crt__itoa base 2 : 554 cycles

itoa base 10     : 122 cycles
crt__itoa base 10: 148 cycles

itoa base 16     : 48 cycles
crt__itoa base 16: 103 cycles

Press any key to continue ...


Gunther
You have to know the facts before you can distort them.

TWell

AMD X64
itoa base 2      : 285 cycles
crt__itoa base 2 : 763 cycles

itoa base 10     : 323 cycles
crt__itoa base 10: 254 cycles

itoa base 16     : 55 cycles
crt__itoa base 16: 207 cycles


itoa base 2      : 293 cycles
crt__itoa base 2 : 763 cycles

itoa base 10     : 323 cycles
crt__itoa base 10: 254 cycles

itoa base 16     : 55 cycles
crt__itoa base 16: 207 cycles


itoa base 2      : 293 cycles
crt__itoa base 2 : 763 cycles

itoa base 10     : 323 cycles
crt__itoa base 10: 254 cycles

itoa base 16     : 55 cycles
crt__itoa base 16: 207 cycles

Press any key to continue ...

Adamanteus

I could remark, that CRT now require support of national digit characters, so may be better to get them from local array, that fills by locale know function, as I'm using strnal(char*, size_t) for it.

habran

this would increase the speed a little bit:
in this case it will take care of edx itself

DODIV MACRO
    mov edx, eax
    .if (ebx == 16)
      and edx, 01111b
      shr eax, 4
    .elseif (ebx == 2)
      and edx, 01b
      shr eax, 1
    .else
      xor edx,edx
      idiv ebx
    .endif
ENDM
Cod-Father

MichaelW

Quote from: habran on December 16, 2013, 06:12:16 AM
this would increase the speed a little bit:

Yes, thanks, I can now squeeze another 2 cycles out of it :biggrin:
Well Microsoft, here's another nice mess you've gotten us into.

habran

there you go :greenclp:
if you call this macro 100 times you will squeeze 200 cycles :biggrin:
you humans are aware that no one is perfect but you never give up trying ::)
Cod-Father