News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

AnsitoUTF8

Started by guga, April 18, 2013, 09:02:38 AM

Previous topic - Next topic

guga


;;
    AnsitoUTF8

    This function converts an ansi (ascii) string to UTF8 format.
   
    Arguments:
   
        pszAscii(in):   Pointer to the input string to be converted.

        pszUTF8(out):   Pointer to a DWORD variable that receives the address of the output string.

        BomFlag:        Flag that selects whether the UTF-8 output starts with a BOM (Byte Order Mark) character.
                        If the flag is set to &TRUE, the output UTF-8 string starts with a BOM.
                        If the flag is set to &FALSE, the output UTF-8 string has no BOM.

    Return Value:   If the function succeeds, it returns the size of the encoded UTF-8 string.
                    If it fails, it returns &FALSE.
   
    Remarks:    The byte order mark (BOM) is a Unicode character used to signal the endianness (byte order) of a text file or stream.
                Its code point is U+FEFF; in UTF-8 it is encoded as the three bytes EF BB BF. BOM use is optional and, if used, it should appear at the start of the text stream.
                Beyond its specific use as a byte-order indicator, the BOM character may also indicate which of the several Unicode representations the text is encoded in.
               
                Because Unicode can be encoded as 16-bit or 32-bit integers, a computer receiving these encodings from arbitrary sources needs to know which
                byte order the integers are encoded in. The BOM gives the producer of the text a way to describe the text stream's endianness to the consumer
                of the text without requiring some contract or metadata outside of the text stream itself.
               
                Once the receiving computer has consumed the text stream, it presumably processes the characters in its own native byte order and no longer needs the BOM.
                Hence the need for a BOM arises in the context of text interchange, rather than in normal text processing within a closed environment.

    Author(s):  Beyond2000! (Adapted and fixed it to RosAsm)
                Apr/2013

    Original author: Neil Hodgson 1998-2001 for Scintilla source code edit control. The original function is inside: Uniconversion.cpp


    Example of usage:

        [StringBuff: D$ 0]

        call AnsitoUTF8 {B$ "cação", 0}, StringBuff, &TRUE

;;

Proc AnsitoUTF8:
    ; Converts a zero-terminated ANSI string (system code page, CP_ACP) to UTF-8.
    ;
    ;   @pszAscii   Pointer to the zero-terminated input string.
    ;   @pszUTF8    Pointer to a DWORD variable. The output buffer is obtained through
    ;               MemAlloc on this variable and its address is read back from it.
    ;   @BomFlag    &TRUE to prefix the output with the UTF-8 BOM (EF BB BF).
    ;
    ; Returns in eax the number of bytes written (BOM included, terminator excluded),
    ; or 0 on a null argument, an empty input, or a failed allocation/conversion.
    ; ecx, edi and edx are preserved.
    Arguments @pszAscii, @pszUTF8, @BomFlag
    Local @lenASCII, @lenUCS2, @lenUTF8, @lenUTF8Result, @pszUCS2
    Uses ecx, edi, edx

    xor eax eax
    On D@pszAscii = 0, ExitP
    On D@pszUTF8 = 0, ExitP

    mov edi D@pszUTF8
    call StrLenProc D@pszAscii
    If eax = 0
        mov B$edi 0 | ExitP
    End_If
    ; Keep the length in a local: MemAlloc only saves ebx/edx, so ecx cannot be
    ; trusted to survive the call below.
    mov D@lenASCII eax

    ; Temporary wide-char buffer. (lenASCII*3 + 1) bytes is always >= the
    ; (lenASCII + 1) * 2 bytes announced to MultiByteToWideChar, because lenASCII >= 1.
    mov ecx eax
    shl eax 1
    add eax ecx
    inc eax
    ; NOTE(review): the output variable is used as scratch storage for this first
    ; allocation; it is overwritten by the second MemAlloc further down.
    call MemAlloc edi, eax
    If eax = 0
        mov B$edi 0 | ExitP
    End_If
    mov D@pszUCS2 eax

    mov edx D@lenASCII
    mov ecx edx | inc ecx
    call 'KERNEL32.MultiByteToWideChar' &CP_ACP, 0, D@pszAscii, edx, D@pszUCS2, ecx
    If eax = 0
        ; Conversion failed: release the temporary buffer and report failure.
        ; The output variable presumably still holds the temporary buffer address
        ; at this point, so clear the whole DWORD.
        mov D$edi 0
        call MemFree D@pszUCS2
        xor eax eax | ExitP
    End_If
    mov D@lenUCS2 eax

    call UTF8Length D@pszUCS2, eax
    mov D@lenUTF8 eax               ; encoded payload only (no BOM)

    If D@BomFlag = &TRUE
        add eax 3
    End_If
    mov D@lenUTF8Result eax         ; payload + optional BOM = value returned

    inc eax                         ; room for the terminating zero
    call MemAlloc D@pszUTF8, eax
    If eax = 0
        mov B$edi 0
        call MemFree D@pszUCS2 | ExitP  ; MemFree preserves eax, so 0 is returned
    End_If

    If D@BomFlag = &TRUE
        mov D$eax 0BFBBEF           ; bytes EF BB BF 00; the buffer holds at least 4 bytes
        add eax 3
    End_If

    ; Pass the payload length only: the destination pointer is already past the BOM,
    ; so UTF8FromUCS2 puts its terminator right after the encoded text.
    call UTF8FromUCS2 D@pszUCS2, D@lenUCS2, eax, D@lenUTF8
    call MemFree D@pszUCS2

    mov eax D@lenUTF8Result
    mov ecx D$edi                   ; address of the output buffer
    add ecx eax | mov B$ecx 0       ; guarantee termination at buffer+length

EndP
___________________________________________________________________________________________
Proc UTF8Length:
    ; Computes how many bytes the UTF-8 encoding of a UCS-2 string needs.
    ;
    ;   @uptr   Pointer to the 16-bit character array.
    ;   @tlen   Maximum number of 16-bit characters to examine.
    ;
    ; Scanning stops at the first zero character or after @tlen characters,
    ; whichever comes first. Each character counts 1 byte below 080, 2 bytes
    ; below 0800 and 3 bytes otherwise. Returns the byte count in eax.
    ; ecx and edx are preserved.
    Arguments @uptr, @tlen
    Local @len, @iCounter
    Uses ecx, edx

    mov D@len 0
    mov D@iCounter 0

    ; Nothing to scan: do not touch the buffer at all.
    xor eax eax
    On D@tlen = 0, ExitP

    .Do

        mov edx D@iCounter
        mov eax D@uptr
        movzx ecx W$eax+edx*2       ; ecx = current 16-bit character
        mov eax D@len               ; return value if the terminator is hit
        On ecx = 0, ExitP

        If ecx < 080
            inc D@len
        Else_If ecx < 0800
            add D@len 2
        Else
            add D@len 3
        End_If

        inc D@iCounter
        mov ecx D@tlen
    ; Stop once exactly @tlen characters were examined (indexes 0 .. tlen-1).
    .Loop_Until D@iCounter >= ecx

    mov eax D@len

EndP
___________________________________________________________________________________________
Proc UTF8FromUCS2:
    ; Encodes a UCS-2 string as UTF-8.
    ;
    ;   @uptr   Pointer to the 16-bit character array.
    ;   @tlen   Maximum number of 16-bit characters to convert.
    ;   @putf   Destination buffer; must hold at least (@len + 1) bytes.
    ;   @len    Encoded length in bytes, as computed by UTF8Length.
    ;
    ; Conversion stops at the first zero character or after @tlen characters.
    ; A terminating zero byte is always stored at @putf + @len.
    ; No return value: eax, ecx and edx are preserved.
    Arguments @uptr, @tlen, @putf, @len
    Local @iCounter, @kCounter, @uch
    Uses ecx, edx, eax

    mov D@iCounter 0                ; index into the source characters
    mov D@kCounter 0                ; index into the destination bytes

    ; Nothing to convert: only write the terminator.
    On D@tlen = 0, jmp L1>>

    .Do

        mov edx D@iCounter
        mov eax D@uptr
        movzx ecx W$eax+edx*2       ; ecx = current 16-bit character
        mov D@uch ecx
        On ecx = 0, jmp L1>>
        .If D@uch < 080
            ; 1 byte: 0xxxxxxx
            mov edx D@putf | add edx D@kCounter
            mov B$edx cl
            inc D@kCounter

        .Else_If D@uch < 0800
            ; 2 bytes: 110xxxxx 10xxxxxx
            mov edx ecx
            shr edx 6 | or edx 0C0
            mov eax D@putf | add eax D@kCounter | mov B$eax dl
            inc D@kCounter

            mov edx ecx
            and edx 03F | or edx 080
            mov eax D@putf | add eax D@kCounter | mov B$eax dl
            inc D@kCounter

        .Else
            ; 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            mov edx ecx
            shr edx 0C | or edx 0E0
            mov eax D@putf | add eax D@kCounter | mov B$eax dl
            inc D@kCounter

            mov edx ecx
            shr edx 6 | and edx 03F | or edx 080
            mov eax D@putf | add eax D@kCounter | mov B$eax dl
            inc D@kCounter

            mov edx ecx
            and edx 03F | or edx 080
            mov eax D@putf | add eax D@kCounter | mov B$eax dl
            inc D@kCounter

        .End_If

        inc D@iCounter
        mov ecx D@tlen
    ; Stop once exactly @tlen characters were converted (indexes 0 .. tlen-1).
    .Loop_Until D@iCounter >= ecx
L1:
    mov edx D@putf | add edx D@len | mov B$edx 0

EndP


Additional functions

; Same with regs under Proc responsability:
Proc StrLenProc:
    ; Returns in eax the length, in bytes, of the zero-terminated string at
    ; @Pointer (terminator not counted). edi and ecx are preserved.
    Arguments @Pointer
    Uses edi, ecx

    mov edi D@Pointer
    mov ecx 0-1                     ; scan with no practical limit
    xor eax eax                     ; al = 0: the byte searched for
    repne scasb                     ; ecx ends at -(length + 2)
    not ecx | dec ecx               ; ecx = -2 - ecx = length
    mov eax ecx

EndP

; Thin wrapper around the VirtualAlloc macro (defined outside this listing).
;   @pOutput    Value handed to the macro as its first operand. Callers in this file
;               pass the address of a pointer variable and later read the buffer
;               address back from it - presumably the macro stores it there; confirm
;               against the macro definition.
;   @Size       Number of bytes requested.
; Callers test eax against 0 after the call and otherwise use it as the new buffer
; address - NOTE(review): this relies on what the macro leaves in eax; verify.
; ebx and edx are preserved; ecx is NOT saved here.
Proc MemAlloc:
    Arguments @pOutput, @Size
    Uses ebx, edx

    VirtualAlloc D@pOutput, D@Size

EndP

; Thin wrapper around the VirtualFree macro (defined outside this listing).
;   @pOutput    Value handed to the macro; callers in this file pass the buffer
;               address previously obtained through MemAlloc.
; ebx, edx and eax are preserved, so the caller's eax survives the call.
Proc MemFree:
    Arguments @pOutput
    Uses ebx, edx, eax

    VirtualFree D@pOutput

EndP




Masm variation of this function originally made by UlliN. Converted and adapted to RosAsm by Beyond2000!
http://www.masmforum.com/board/index.php?PHPSESSID=8d46cd4ecb1688be429ab49694ec53e6&topic=6507.0;wap2


Proc AnsitoUTF8Masm:
    ; Converts a zero-terminated ANSI string (system code page, CP_ACP) to UTF-8,
    ; letting WideCharToMultiByte do the UTF-8 encoding.
    ;
    ;   @pszAscii   Pointer to the zero-terminated input string.
    ;   @pszUTF8    Pointer to a DWORD variable. The output buffer is obtained through
    ;               MemAlloc on this variable and its address is read back from it.
    ;   @BomFlag    &TRUE to prefix the output with the UTF-8 BOM (EF BB BF).
    ;
    ; Returns in eax the number of bytes written (BOM included, terminator excluded),
    ; or 0 on a null argument, an empty input, or a failed allocation/conversion.
    ; ecx, edi and edx are preserved.
    Arguments @pszAscii, @pszUTF8, @BomFlag
    Local @lenUCS2, @lenUTF8, @pUnicode, @LenCharString, @lenUTF8Result
    Uses ecx, edi, edx

    xor eax eax
    On D@pszAscii = 0, ExitP
    On D@pszUTF8 = 0, ExitP

    mov edi D@pszUTF8
    call StrLenProc D@pszAscii
    If eax = 0
        mov B$edi 0 | ExitP
    End_If
    ; Keep the length in a local: MemAlloc only saves ebx/edx, so ecx cannot be
    ; trusted to survive the call below.
    mov D@LenCharString eax

    ; Temporary wide-char buffer. (len*3 + 1) bytes is always >= the (len + 1) * 2
    ; bytes announced to MultiByteToWideChar, because len >= 1.
    mov ecx eax
    shl eax 1
    add eax ecx
    inc eax
    ; NOTE(review): the output variable is used as scratch storage for this first
    ; allocation; it is overwritten by the second MemAlloc further down.
    call MemAlloc edi, eax
    If eax = 0
        mov B$edi 0 | ExitP
    End_If
    mov D@pUnicode eax

    mov ecx D@LenCharString | inc ecx
    call 'KERNEL32.MultiByteToWideChar' &CP_ACP, &MB_PRECOMPOSED, D@pszAscii, D@LenCharString, D@pUnicode, ecx
    If eax = 0
        ; Conversion failed: release the temporary buffer and report failure.
        ; The output variable presumably still holds the temporary buffer address
        ; at this point, so clear the whole DWORD.
        mov D$edi 0
        call MemFree D@pUnicode
        xor eax eax | ExitP
    End_If
    mov D@lenUCS2 eax

    call UTF8Length D@pUnicode, eax
    mov D@lenUTF8 eax               ; encoded payload only (no BOM)

    If D@BomFlag = &TRUE
        add eax 3
    End_If
    mov D@lenUTF8Result eax         ; payload + optional BOM = value returned

    inc eax                         ; room for the terminating zero
    call MemAlloc D@pszUTF8, eax
    If eax = 0
        mov B$edi 0
        call MemFree D@pUnicode | ExitP ; MemFree preserves eax, so 0 is returned
    End_If

    If D@BomFlag = &TRUE
        mov D$eax 0BFBBEF           ; bytes EF BB BF 00; the buffer holds at least 4 bytes
        add eax 3
    End_If

    ; The source length is the number of WIDE characters produced above, not the
    ; ANSI byte count: the two differ on multi-byte code pages.
    call 'KERNEL32.WideCharToMultiByte' &CP_UTF8, 0, D@pUnicode, D@lenUCS2, eax, D@lenUTF8, 0, 0

    call MemFree D@pUnicode

    mov eax D@lenUTF8Result
    mov ecx D$edi                   ; address of the output buffer
    add ecx eax | mov B$ecx 0       ; terminate at buffer+length

EndP
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

guga

#1
Errata: Fixed UTF8Length. The return value was wrong; the correct final instruction is "mov eax D@len".
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com