;;
AnsitoUTF8
This function converts an ANSI (system code page) string to UTF-8 format.
Arguments:
pszAscii(in): Pointer to the input string to be converted.
pszUTF8(out): Pointer to a variable that will receive the pointer to the output string.
BomFlag: Flag that selects whether the UTF-8 output starts with a BOM (Byte Order Mark) character.
If the flag is set to &TRUE, the output UTF-8 string starts with a BOM.
If the flag is set to &FALSE, the output UTF-8 string does not use a BOM.
Return Value: If the function succeeds, it returns the size of the encoded UTF-8 string.
If it fails, it returns &FALSE.
Remarks: The byte order mark (BOM) is a Unicode character used to signal the endianness (byte order) of a text file or stream.
It is encoded at U+FEFF byte order mark (BOM). BOM use is optional, and, if used, should appear at the start of the text stream.
Beyond its specific use as a byte-order indicator, the BOM character may also indicate which of the several Unicode representations the text is encoded in.
Because Unicode can be encoded as 16-bit or 32-bit integers, a computer receiving these encodings from arbitrary sources needs to know which
byte order the integers are encoded in. The BOM gives the producer of the text a way to describe the text stream's endianness to the consumer
of the text without requiring some contract or metadata outside of the text stream itself.
Once the receiving computer has consumed the text stream, it presumably processes the characters in its own native byte order and no longer needs the BOM.
Hence the need for a BOM arises in the context of text interchange, rather than in normal text processing within a closed environment.
Author(s): Beyond2000! (Adapted and fixed it to RosAsm)
apr/2.013
Original author: Neil Hodgson 1998-2.001 for Scintilla source code edit control. The original function is inside: Uniconversion.cpp
Example of usage:
[StringBuff: D$ 0]
call AnsitoUTF8 {B$ "cação", 0}, StringBuff, &TRUE
;;
; AnsitoUTF8
; Converts a zero-terminated ANSI (CP_ACP) string to a newly allocated,
; zero-terminated UTF-8 string.
;
; Arguments:
;   @pszAscii  (in)  pointer to the zero-terminated input string.
;   @pszUTF8   (out) pointer to a dword variable that receives the pointer to
;                    the allocated UTF-8 buffer. It is set to 0 on failure and
;                    for an empty input string.
;   @BomFlag   (in)  &TRUE to prefix the output with the UTF-8 BOM (EF BB BF).
;
; Returns (eax): number of UTF-8 bytes stored, including the 3 BOM bytes when
;                requested and excluding the terminator; 0 on failure.
;
; Relies on MemAlloc returning the new block in eax (0 on failure) and also
; storing it at D$pOutput, exactly as the rest of this file uses it.
Proc AnsitoUTF8:
    Arguments @pszAscii, @pszUTF8, @BomFlag
    Local @lenASCII, @lenUCS2, @lenUTF8, @pszUCS2
    Uses ecx, edi, edx

    xor eax eax
    On D@pszAscii = 0, ExitP
    On D@pszUTF8 = 0, ExitP

    mov edi D@pszUTF8
    mov D$edi 0                         ; caller sees a null pointer until we succeed

    call StrLenProc D@pszAscii
    On eax = 0, ExitP                   ; empty input: nothing to convert, return 0
    mov D@lenASCII eax

    ; Scratch UCS-2 buffer: at most one wide char per input byte, plus a terminator.
    inc eax | shl eax 1                 ; (lenASCII + 1) * 2 bytes
    mov D@pszUCS2 0
    lea ecx D@pszUCS2                   ; keep the scratch pointer out of the caller's variable
    call MemAlloc ecx, eax
    On eax = 0, ExitP
    mov D@pszUCS2 eax

    mov ecx D@lenASCII | inc ecx        ; destination capacity, in wide chars
    call 'KERNEL32.MultiByteToWideChar' &CP_ACP, 0, D@pszAscii, D@lenASCII, D@pszUCS2, ecx
    On eax = 0, jmp L1>>                ; conversion failed
    mov D@lenUCS2 eax
    mov ecx D@pszUCS2 | mov W$ecx+eax*2 0   ; terminate explicitly; do not rely on zeroed memory

    call UTF8Length D@pszUCS2, D@lenUCS2
    mov D@lenUTF8 eax                   ; payload bytes only (no BOM, no terminator)

    ; Output buffer = payload + terminator (+ 3 BOM bytes when requested).
    mov ecx eax | inc ecx
    If D@BomFlag = &TRUE
        add ecx 3
    End_If
    call MemAlloc D@pszUTF8, ecx        ; stores the buffer pointer in the caller's variable
    On eax = 0, jmp L1>>

    If D@BomFlag = &TRUE
        mov D$eax 0BFBBEF               ; bytes EF BB BF 00; the 4th byte is overwritten below
        add eax 3
    End_If

    ; @len passed here is the payload size, so the terminator written by
    ; UTF8FromUCS2 lands inside the buffer whether or not a BOM was emitted.
    call UTF8FromUCS2 D@pszUCS2, D@lenUCS2, eax, D@lenUTF8
    call MemFree D@pszUCS2

    mov eax D@lenUTF8
    If D@BomFlag = &TRUE
        add eax 3                       ; returned size includes the BOM
    End_If
    mov ecx D@pszUTF8 | mov ecx D$ecx
    mov B$ecx+eax 0                     ; final terminator right after the last data byte
    ExitP

L1:
    ; Failure after the scratch buffer was allocated: release it, report nothing.
    call MemFree D@pszUCS2
    mov ecx D@pszUTF8 | mov D$ecx 0
    xor eax eax
EndP
___________________________________________________________________________________________
; UTF8Length
; Computes how many bytes the UTF-8 encoding of a UCS-2 string needs.
;
; Arguments:
;   @uptr  pointer to the 16-bit code units.
;   @tlen  maximum number of code units to examine.
;
; Returns (eax): the UTF-8 byte count (terminator not included). Scanning stops
; after @tlen units or at the first zero unit, whichever comes first. A null
; @uptr or a zero @tlen yields 0.
;
; Each 16-bit unit is sized on its own: < 080 -> 1 byte, < 0800 -> 2 bytes,
; anything else -> 3 bytes (surrogate halves are therefore counted as 3 each).
Proc UTF8Length:
    Arguments @uptr, @tlen
    Uses ecx, edx, esi

    xor eax eax                         ; eax = running byte count
    mov esi D@uptr                      ; esi -> next unit to examine
    On esi = 0, ExitP
    mov ecx D@tlen                      ; ecx = units still allowed

L0:
    On ecx = 0, ExitP                   ; checked BEFORE reading: never touch index @tlen
    movzx edx W$esi
    On edx = 0, ExitP                   ; early stop on an embedded terminator

    If edx < 080
        inc eax
    Else_If edx < 0800
        add eax 2
    Else
        add eax 3
    End_If

    add esi 2
    dec ecx
    jmp L0<<
EndP
___________________________________________________________________________________________
; UTF8FromUCS2
; Encodes a UCS-2 string as UTF-8 into a caller-supplied buffer.
;
; Arguments:
;   @uptr  pointer to the 16-bit source code units.
;   @tlen  maximum number of source units to convert.
;   @putf  destination buffer; must hold at least @len + 1 bytes.
;   @len   number of data bytes the destination may receive.
;
; Conversion stops after @tlen units, at the first zero unit, or when the next
; encoded character would not fit in the remaining @len bytes. A zero byte is
; written right after the last encoded byte and, as before, at putf[@len].
;
; Each 16-bit unit is encoded on its own (1, 2 or 3 bytes); surrogate halves
; are not combined. All registers, eax included, are preserved.
Proc UTF8FromUCS2:
    Arguments @uptr, @tlen, @putf, @len
    Uses eax, ebx, ecx, edx, esi, edi

    mov edi D@putf                      ; edi -> next output byte
    On edi = 0, ExitP
    mov edx D@len                       ; edx = output bytes still available
    mov ebx D@tlen                      ; ebx = input units still allowed
    mov esi D@uptr                      ; esi -> next input unit
    On esi = 0, jmp L1>>

L0:
    On ebx = 0, jmp L1>>                ; checked BEFORE reading: never touch index @tlen
    movzx ecx W$esi
    On ecx = 0, jmp L1>>

    .If ecx < 080
        ; 0xxxxxxx
        On edx < 1, jmp L1>>
        mov B$edi cl
        inc edi | dec edx
    .Else_If ecx < 0800
        ; 110xxxxx 10xxxxxx
        On edx < 2, jmp L1>>
        mov eax ecx | shr eax 6 | or eax 0C0
        mov B$edi al
        mov eax ecx | and eax 03F | or eax 080
        mov B$edi+1 al
        add edi 2 | sub edx 2
    .Else
        ; 1110xxxx 10xxxxxx 10xxxxxx
        On edx < 3, jmp L1>>
        mov eax ecx | shr eax 0C | or eax 0E0
        mov B$edi al
        mov eax ecx | shr eax 6 | and eax 03F | or eax 080
        mov B$edi+1 al
        mov eax ecx | and eax 03F | or eax 080
        mov B$edi+2 al
        add edi 3 | sub edx 3
    .End_If

    add esi 2
    dec ebx
    jmp L0<<

L1:
    mov B$edi 0                         ; terminate right after the bytes actually written
    mov edx D@putf | add edx D@len | mov B$edx 0    ; original contract: putf[@len] = 0
EndP
; Additional functions
; Same, with register preservation under the Proc's responsibility:
; StrLenProc
; Returns in eax the number of bytes that precede the first zero byte of the
; string at @Pointer. edi and ecx are saved and restored by the Proc itself.
Proc StrLenProc:
    Arguments @Pointer
    Uses edi, ecx

    mov edi D@Pointer                   ; edi = scan cursor
    mov ecx edi                         ; ecx = start address, kept for the final subtraction
L0:
    cmp B$edi 0 | je L1>
    inc edi | jmp L0<
L1:
    mov eax edi | sub eax ecx           ; length = terminator address - start address
EndP
; MemAlloc
; Thin wrapper that hands both arguments to the VirtualAlloc macro (the macro
; itself is defined outside this file).
;   @pOutput  passed through as the macro's first operand.
;   @Size     passed through as the macro's second operand.
; ebx and edx are saved and restored by the Proc.
; NOTE(review): the callers in this file expect eax to hold the new block
; (0 on failure) and the same pointer to be stored at D$pOutput - confirm
; against the VirtualAlloc macro definition.
Proc MemAlloc:
Arguments @pOutput, @Size
Uses ebx, edx
VirtualAlloc D@pOutput, D@Size
EndP
; MemFree
; Thin wrapper that hands its argument to the VirtualFree macro (the macro
; itself is defined outside this file).
;   @pOutput  passed through as the macro's operand; callers in this file pass
;             the block pointer itself.
; ebx, edx and eax are saved and restored by the Proc, so the caller's eax
; is unchanged on return.
Proc MemFree:
Arguments @pOutput
Uses ebx, edx, eax
VirtualFree D@pOutput
EndP
; MASM variation of this function, originally written by UlliN. Converted and adapted to RosAsm by Beyond2000!
; http://www.masmforum.com/board/index.php?PHPSESSID=8d46cd4ecb1688be429ab49694ec53e6&topic=6507.0;wap2
; AnsitoUTF8Masm
; Converts a zero-terminated ANSI (CP_ACP) string to a newly allocated,
; zero-terminated UTF-8 string, using WideCharToMultiByte for the encoding.
;
; Arguments:
;   @pszAscii  (in)  pointer to the zero-terminated input string.
;   @pszUTF8   (out) pointer to a dword variable that receives the pointer to
;                    the allocated UTF-8 buffer. It is set to 0 on failure and
;                    for an empty input string.
;   @BomFlag   (in)  &TRUE to prefix the output with the UTF-8 BOM (EF BB BF).
;
; Returns (eax): number of UTF-8 bytes stored, including the 3 BOM bytes when
;                requested and excluding the terminator; 0 on failure.
;
; Relies on MemAlloc returning the new block in eax (0 on failure) and also
; storing it at D$pOutput, exactly as the rest of this file uses it.
Proc AnsitoUTF8Masm:
    Arguments @pszAscii, @pszUTF8, @BomFlag
    Local @lenUCS2, @lenUTF8, @pUnicode, @LenCharString
    Uses ecx, edi, edx

    xor eax eax
    On D@pszAscii = 0, ExitP
    On D@pszUTF8 = 0, ExitP

    mov edi D@pszUTF8
    mov D$edi 0                         ; caller sees a null pointer until we succeed

    call StrLenProc D@pszAscii
    On eax = 0, ExitP                   ; empty input: nothing to convert, return 0
    mov D@LenCharString eax

    ; Scratch wide-char buffer: at most one wide char per input byte, plus one spare.
    inc eax | shl eax 1                 ; (LenCharString + 1) * 2 bytes
    mov D@pUnicode 0
    lea ecx D@pUnicode                  ; keep the scratch pointer out of the caller's variable
    call MemAlloc ecx, eax
    On eax = 0, ExitP
    mov D@pUnicode eax

    mov ecx D@LenCharString | inc ecx   ; destination capacity, in wide chars
    call 'KERNEL32.MultiByteToWideChar' &CP_ACP, &MB_PRECOMPOSED, D@pszAscii, D@LenCharString, D@pUnicode, ecx
    On eax = 0, jmp L1>>                ; conversion failed
    mov D@lenUCS2 eax                   ; wide chars produced (can be < LenCharString on DBCS pages)

    ; Ask the API how many UTF-8 bytes those wide chars need (cbMultiByte = 0).
    call 'KERNEL32.WideCharToMultiByte' &CP_UTF8, 0, D@pUnicode, D@lenUCS2, 0, 0, 0, 0
    On eax = 0, jmp L1>>
    mov D@lenUTF8 eax                   ; payload bytes only (no BOM, no terminator)

    ; Output buffer = payload + terminator (+ 3 BOM bytes when requested).
    mov ecx eax | inc ecx
    If D@BomFlag = &TRUE
        add ecx 3
    End_If
    call MemAlloc D@pszUTF8, ecx        ; stores the buffer pointer in the caller's variable
    On eax = 0, jmp L1>>

    If D@BomFlag = &TRUE
        mov D$eax 0BFBBEF               ; bytes EF BB BF 00; the 4th byte is overwritten below
        add eax 3
    End_If

    ; The source length is the wide-char count, not the ANSI byte count.
    call 'KERNEL32.WideCharToMultiByte' &CP_UTF8, 0, D@pUnicode, D@lenUCS2, eax, D@lenUTF8, 0, 0
    If eax = 0
        ; Unexpected failure: drop the output buffer too, then take the common exit.
        mov ecx D@pszUTF8 | call MemFree D$ecx
        jmp L1>>
    End_If
    mov D@lenUTF8 eax                   ; bytes actually written
    call MemFree D@pUnicode

    mov eax D@lenUTF8
    If D@BomFlag = &TRUE
        add eax 3                       ; returned size includes the BOM
    End_If
    mov ecx D@pszUTF8 | mov ecx D$ecx
    mov B$ecx+eax 0                     ; terminator right after the last data byte
    ExitP

L1:
    ; Failure after the scratch buffer was allocated: release it, report nothing.
    call MemFree D@pUnicode
    mov ecx D@pszUTF8 | mov D$ecx 0
    xor eax eax
EndP
; Errata: Fixed UTF8Length. The return value was wrong; the correct instruction is "mov eax D@len".