MASM32 Downloads
/// Decodes a UTF-8 encoded byte string into a wide string.
///
/// One-, two-, three- and four-byte sequences (the forms permitted by
/// RFC 3629) are decoded. Where wchar_t is 16 bits wide, code points above
/// U+FFFF are written as a UTF-16 surrogate pair; otherwise every code point
/// occupies a single wchar_t.
///
/// Malformed input never stalls the loop and never reads past the end of
/// `s`: each offending byte (stray continuation byte, truncated or overlong
/// sequence, encoded surrogate, value above U+10FFFF, or one of the obsolete
/// 5/6-byte lead bytes) produces U+FFFD and decoding resumes at the next byte.
///
/// @param s  UTF-8 encoded input; may be empty.
/// @return   The decoded text.
std::wstring UTF8toUnicode(const std::string& s) {
    const unsigned long kReplacement = 0xFFFDUL;
    const std::size_t size = s.size();

    std::wstring ws;
    ws.reserve(size);  // the output never has more units than the input has bytes

    std::size_t i = 0;
    while (i < size) {
        const unsigned char lead = static_cast<unsigned char>(s[i]);
        unsigned long cp = kReplacement;  // decoded code point
        unsigned long lowest = 0;         // smallest value this length may encode
        std::size_t trail = 0;            // continuation bytes expected

        if (lead < 0x80) {
            cp = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            trail = 1;
            cp = lead & 0x1F;
            lowest = 0x80UL;
        } else if ((lead & 0xF0) == 0xE0) {
            trail = 2;
            cp = lead & 0x0F;
            lowest = 0x800UL;
        } else if ((lead & 0xF8) == 0xF0) {
            trail = 3;
            cp = lead & 0x07;
            lowest = 0x10000UL;
        }
        // Any other lead byte is invalid: cp stays U+FFFD and one byte is consumed.

        std::size_t consumed = 1;
        if (trail != 0) {
            bool ok = (size - i) > trail;  // the whole sequence must lie inside s
            for (std::size_t k = 1; ok && k <= trail; ++k) {
                const unsigned char cont = static_cast<unsigned char>(s[i + k]);
                if ((cont & 0xC0) == 0x80) {
                    cp = (cp << 6) | (cont & 0x3F);
                } else {
                    ok = false;
                }
            }
            // Reject overlong forms, UTF-16 surrogates and out-of-range values.
            ok = ok && cp >= lowest && cp <= 0x10FFFFUL && (cp < 0xD800UL || cp > 0xDFFFUL);
            if (ok) {
                consumed = trail + 1;
            } else {
                cp = kReplacement;
            }
        }
        i += consumed;

        if (sizeof(wchar_t) == 2 && cp > 0xFFFFUL) {
            // 16-bit wchar_t cannot hold the value directly: split it into a surrogate pair.
            const unsigned long v = cp - 0x10000UL;
            ws += static_cast<wchar_t>(0xD800UL + (v >> 10));
            ws += static_cast<wchar_t>(0xDC00UL + (v & 0x3FFUL));
        } else {
            ws += static_cast<wchar_t>(cp);
        }
    }
    return ws;
}
#include <Windows.h>

#include <cstring>
#include <iostream>
#include <string>

// Sample text in several scripts, passed to the decoder as UTF-8.
// NOTE(review): this is a plain narrow literal, so the bytes stored depend on
// the compiler's source/execution character set — confirm the build treats it as UTF-8.
std::string myutf8String = "Russian: советских\nJapanese: 私は学生です\nChinese: 你好\nTamil: ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು\nClassical Greek: ὕαλον ϕαγεῖν\nCzech: Mohu jíst sklo\nArabic:أنا قادر على أكل الزجاج و هذا لا يؤلمني.";

// Receives the converted text shown in the message box.
std::wstring myutf16;

std::wstring UTF8toUnicode(const std::string& s);

// Converts the sample text and displays it with the wide-character message box.
int main() {
    myutf16 = UTF8toUnicode(myutf8String);
    MessageBoxW(NULL, myutf16.c_str(), L"UTF-16", MB_OK);
    return 0;
}

// Kept from the original listing in case later code relies on it.
using namespace std;

/// Decodes a UTF-8 encoded byte string into a wide string.
///
/// One- to four-byte sequences are decoded. Where wchar_t is 16 bits wide,
/// code points above U+FFFF are written as a UTF-16 surrogate pair.
///
/// Malformed input never stalls the loop and never reads past the end of
/// `s`: each offending byte (stray continuation byte, truncated or overlong
/// sequence, encoded surrogate, value above U+10FFFF, or one of the obsolete
/// 5/6-byte lead bytes) produces U+FFFD and decoding resumes at the next byte.
///
/// @param s  UTF-8 encoded input; may be empty.
/// @return   The decoded text.
wstring UTF8toUnicode(const string& s) {
    const unsigned long kReplacement = 0xFFFDUL;
    const std::size_t size = s.size();

    wstring ws;
    ws.reserve(size);  // the output never has more units than the input has bytes

    std::size_t i = 0;
    while (i < size) {
        const unsigned char lead = static_cast<unsigned char>(s[i]);
        unsigned long cp = kReplacement;  // decoded code point
        unsigned long lowest = 0;         // smallest value this length may encode
        std::size_t trail = 0;            // continuation bytes expected

        if (lead < 0x80) {
            cp = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            trail = 1;
            cp = lead & 0x1F;
            lowest = 0x80UL;
        } else if ((lead & 0xF0) == 0xE0) {
            trail = 2;
            cp = lead & 0x0F;
            lowest = 0x800UL;
        } else if ((lead & 0xF8) == 0xF0) {
            trail = 3;
            cp = lead & 0x07;
            lowest = 0x10000UL;
        }
        // Any other lead byte is invalid: cp stays U+FFFD and one byte is consumed.

        std::size_t consumed = 1;
        if (trail != 0) {
            bool ok = (size - i) > trail;  // the whole sequence must lie inside s
            for (std::size_t k = 1; ok && k <= trail; ++k) {
                const unsigned char cont = static_cast<unsigned char>(s[i + k]);
                if ((cont & 0xC0) == 0x80) {
                    cp = (cp << 6) | (cont & 0x3F);
                } else {
                    ok = false;
                }
            }
            // Reject overlong forms, UTF-16 surrogates and out-of-range values.
            ok = ok && cp >= lowest && cp <= 0x10FFFFUL && (cp < 0xD800UL || cp > 0xDFFFUL);
            if (ok) {
                consumed = trail + 1;
            } else {
                cp = kReplacement;
            }
        }
        i += consumed;

        if (sizeof(wchar_t) == 2 && cp > 0xFFFFUL) {
            // 16-bit wchar_t cannot hold the value directly: split it into a surrogate pair.
            const unsigned long v = cp - 0x10000UL;
            ws += static_cast<wchar_t>(0xD800UL + (v >> 10));
            ws += static_cast<wchar_t>(0xDC00UL + (v & 0x3FFUL));
        } else {
            ws += static_cast<wchar_t>(cp);
        }
    }
    return ws;
}
It was implemented some time ago; look it up in String.c from line 99. It works flawlessly 8)
; Test program: defines two word-sized (dw) strings from quoted literals and
; passes them to MessageBox. The listing records the assembler's complaint
; about the first dw line.
include \masm32\include\masm32rt.inc

.data
txTitle dw "Does it work?", 0 ; Error A2055: Initializer value too large
txHelloW dw "Привет, Мир!", 0

.code
start:
    ; NOTE(review): the generic MessageBox name is used here although the data
    ; is declared with dw — confirm which entry point it resolves to.
    invoke MessageBox, 0, offset txHelloW, offset txTitle, MB_OK
    exit
end start
; Test program: defines two word-sized (dw) strings from quoted literals, with
; OPTION LITERALS:ON set, and shows them through MessageBoxW.
include \masm32\include\masm32rt.inc

; NOTE(review): presumably the switch that lets dw accept the quoted strings
; below — confirm against the assembler's manual.
OPTION LITERALS:ON

.data
txTitle dw "Does it work?", 0
txHelloW dw "Привет, Мир!", 0

.code
start:
    ; Text first, caption second, MB_OK as the style.
    invoke MessageBoxW, 0, offset txHelloW, offset txTitle, MB_OK
    exit
end start
#define WIN32_LEAN_AND_MEAN#include <windows.h>#pragma comment(lib, "user32.lib")wchar_t *UTF8toUnicode(char *s, wchar_t *ws){ wchar_t wc; char *c = s; do { if ((*c & 0x80) == 0) { wc = *c++; } else if ((*c & 0xE0) == 0xC0) { wc = ( *c++ & 0x1F) << 6; wc |= (*c++ & 0x3F); } else if ((*c & 0xF0) == 0xE0) { wc = (*c++ & 0xF) << 12; wc |= (*c++ & 0x3F) << 6; wc |= (*c++ & 0x3F); } else if ((*c & 0xF8) == 0xF0) { wc = (*c & 0x7) << 18; wc |= (*c++ & 0x3F) << 12; wc |= (*c++ & 0x3F) << 6; wc |= (*c++ & 0x3F); } else if ((*c & 0xFC) == 0xF8) { wc = (*c++ & 0x3) << 24; wc |= (*c++ & 0x3F) << 18; wc |= (*c++ & 0x3F) << 12; wc |= (*c++ & 0x3F) << 6; wc |= (*c++ & 0x3F); } else if ((*c & 0xFE) == 0xFC) { wc = (*c++ & 0x1) << 30; wc |= (*c++ & 0x3F) << 24; wc |= (*c++ & 0x3F) << 18; wc |= (*c++ & 0x3F) << 12; wc |= (*c++ & 0x3F) << 6; wc |= (*c++ & 0x3F); } *ws++ = wc; } while (*c); return ws;}void __cdecl mainCRTStartup(void){ char *myutf8String = u8"Russian: советских\nJapanese: 私は学生です\nChinese: 你好\nTamil: ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು\nClassical Greek: ὕαλον ϕαγεῖν\nCzech: Mohu jíst sklo\nArabic:أنا قادر على أكل الزجاج و هذا لا يؤلمني."; wchar_t myutf16[200]; UTF8toUnicode(myutf8String, myutf16); MessageBoxW(0, myutf16, L"UTF-16", 0);}
#pragma execution_character_set("utf-8")
I don't see any way of making the macro operators turn string characters into bytes (for numerical evaluation and comparison purposes) at assemble time. It may be easy, despite people having struggled with that for decades, but I believe it is impossible (at assemble time only, of course).
Perhaps it's not possible with the elementary macros we build, and you are right in that sense. But I don't think it is impossible with advanced macros, just boring.
A Saída dos Operários da Fábrica Lumière
#define WIN32_LEAN_AND_MEAN#include <windows.h>#pragma comment(lib, "user32.lib")char *UTF8toANSI(char *s, char *as){ char ch; char *c = s; do { if ((*c & 0x80) == 0) { ch = *c++; } else if ((*c & 0xE0) == 0xC0) { ch = ( *c++ & 0x1F) << 6; ch |= (*c++ & 0x3F); } else if ((*c & 0xF0) == 0xE0) { ch = (*c++ & 0xF) << 12; ch |= (*c++ & 0x3F) << 6; ch |= (*c++ & 0x3F); } else if ((*c & 0xF8) == 0xF0) { ch = (*c & 0x7) << 18; ch |= (*c++ & 0x3F) << 12; ch |= (*c++ & 0x3F) << 6; ch |= (*c++ & 0x3F); } else if ((*c & 0xFC) == 0xF8) { ch = (*c++ & 0x3) << 24; ch |= (*c++ & 0x3F) << 18; ch |= (*c++ & 0x3F) << 12; ch |= (*c++ & 0x3F) << 6; ch |= (*c++ & 0x3F); } else if ((*c & 0xFE) == 0xFC) { ch = (*c++ & 0x1) << 30; ch |= (*c++ & 0x3F) << 24; ch |= (*c++ & 0x3F) << 18; ch |= (*c++ & 0x3F) << 12; ch |= (*c++ & 0x3F) << 6; ch |= (*c++ & 0x3F); } *as++ = ch; } while (*c); return as;}void __cdecl mainCRTStartup(void){ //char *myutf8String = u8"A Saída dos Operários da Fábrica Lumière"; char *myutf8String = "A SaÃda dos Operários da Fábrica Lumière"; char myANSI[200]; UTF8toANSI(myutf8String, myANSI); MessageBoxA(0, myANSI, "ANSI", 0);}