Plain Masm32 (see MSDN (https://docs.microsoft.com/en-us/windows/desktop/api/stringapiset/nf-stringapiset-widechartomultibyte)):
include \masm32\include\masm32rt.inc

; ----------------------------------------------------------------------
; Demo: convert a UTF-16 string to an ANSI code page with
; WideCharToMultiByte and print each result. The first two conversions
; start at the text itself; the remaining three start two bytes earlier,
; on the BOM bytes stored directly in front of the text.
; Syntax: MASM (Intel), 32-bit, stdcall via the MASM32 runtime macros.
; ----------------------------------------------------------------------

TARGET_CP   = 1252                                   ; Windows default
CONV_FLAGS  = WC_DEFAULTCHAR or WC_COMPOSITECHECK
DEST_BYTES  = 50                                     ; cbMultiByte handed to the API

; Print msgText followed by a tab, convert the wide string at esi into
; the buffer at edi using wcFlags, then print the converted buffer.
ShowConversion MACRO msgText:REQ, wcFlags:REQ
    print msgText, 9
    invoke WideCharToMultiByte, TARGET_CP, wcFlags, esi, -1, edi, DEST_BYTES, 0, 0
    print edi
ENDM

.data
HelloWBom   db 0FFh, 0FEh                            ; Unicode BOM, must stay directly before HelloW
HelloW      dw "H", "e", "l", "l", "o", " "
            dw "W", "o", "r", "l", "d"
            dw 13, 10, 0, 0
buffer      db 100 dup(?)

.code
start:
    invoke SetConsoleOutputCP, TARGET_CP

    mov edi, offset buffer                           ; edi = destination for every conversion
    mov esi, offset HelloW                           ; esi = source, starting at the text

    ShowConversion "HelloW, flag: ", CONV_FLAGS
    ShowConversion "HelloW, 0: ", 0

    mov esi, offset HelloWBom                        ; esi = source, now starting on the BOM

    ShowConversion "HelloW BOM, flag:", CONV_FLAGS
    ShowConversion "HelloW BOM, 0: ", 0
    ShowConversion "HelloW BOM, flag:", CONV_FLAGS

    inkey chr$(13, 10, "weird, isn't it?")
    exit
end start
This is the output; the leading question marks shouldn't be there:
HelloW, flag: Hello World
HelloW, 0: Hello World
HelloW BOM, flag: ?Hello World
HelloW BOM, 0: ?Hello World
HelloW BOM, flag: ?Hello World
So Microsoft's WideCharToMultiByte doesn't honour Microsoft's UTF16 BOM. Same problem on Win7-64 and WinXP. Am I overlooking something stupid...?
I'm guessing that it's something you check for first (to get the byte order) and then read the rest as normal (depending on the endianness detected, if a BOM is there); otherwise, with no BOM, you just read the entire stream.
I have never gone for it, as I have never seen Unicode written any way other than left to right. Maybe on a Motorola Mac, but not on any x86-based hardware.
Quote from: fearless on August 14, 2018, 03:44:14 AM
I'm guessing that it's something you check for first (to get the byte order) and then read the rest as normal (depending on the endianness detected, if a BOM is there); otherwise, with no BOM, you just read the entire stream.
Just checked in old code that I've not touched for years:
; Excerpt from an older routine: the conversion source is taken two bytes past edi.
@@: lea edx, [edi+2] ; edx = edi+2: skip the 2-byte BOM so it is not handed to the conversion
invoke WideCharToMultiByte, CP_UTF8, 0, edx, esi, ebx, eax, 0, 0 ; edx is the source (past the BOM), esi is passed as the wide-char count; ebx is dest, eax is sizeof buffer
Wide Char - a 16-bit character in the Unicode code page
UTF - Unicode Transformation Format, used to save space in text files
BOM - Byte Order Mark, used to tell how the text in a text file is encoded