News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

UTF-8 to UTF-16

Started by aw27, January 24, 2019, 09:06:53 PM

Previous topic - Next topic

aw27

Since the last topic was about macros...
In my opinion, a really revolutionary internal MACRO that UASM should have would be a UTF-8 to UTF-16 macro.
As we know, it is impossible to make a regular UTF-8 to UTF-16 macro that changes the data section at assemble time. What people have done so far is write macros that convert ASCII characters to words and call them Unicode conversion macros, but they are not.  :eusa_naughty:
The general algorithm in C++ is pretty simple, but again, impossible to convert to an assemble-time ASM macro. I copied it from here: https://gist.github.com/rechardchen/3321830


// Converts a UTF-8 encoded byte string into a wide string.
//
// Parameters:
//   s - input bytes, expected to be UTF-8 (embedded NUL bytes are copied as-is).
// Returns:
//   The decoded text. When wchar_t is 16 bits wide (e.g. on Windows) code
//   points above U+FFFF are emitted as UTF-16 surrogate pairs; when wchar_t
//   is wider, each code point is stored in a single element.
//
// Malformed input never causes an out-of-range read or an endless loop:
// every invalid, truncated, overlong or surrogate-encoding sequence is
// replaced by U+FFFD (REPLACEMENT CHARACTER). The obsolete 5- and 6-byte
// forms are not valid UTF-8 and are treated as invalid bytes.
std::wstring UTF8toUnicode(const std::string& s)
{
    const wchar_t kReplacement = static_cast<wchar_t>(0xFFFD);
    const std::string::size_type len = s.length();

    std::wstring ws;
    ws.reserve(len);  // the output never has more elements than input bytes

    std::string::size_type i = 0;
    while (i < len)
    {
        // Work on unsigned values so that bytes >= 0x80 are not sign-extended.
        const unsigned char lead = static_cast<unsigned char>(s[i]);

        if (lead < 0x80)
        {
            // Plain ASCII: one byte, one code unit.
            ws += static_cast<wchar_t>(lead);
            ++i;
            continue;
        }

        unsigned long cp = 0;                 // code point being assembled
        unsigned long min_cp = 0;             // smallest value legal for this length
        std::string::size_type trail = 0;     // continuation bytes expected

        if ((lead & 0xE0) == 0xC0)
        {
            cp = lead & 0x1F;
            trail = 1;
            min_cp = 0x80;
        }
        else if ((lead & 0xF0) == 0xE0)
        {
            cp = lead & 0x0F;
            trail = 2;
            min_cp = 0x800;
        }
        else if ((lead & 0xF8) == 0xF0)
        {
            cp = lead & 0x07;
            trail = 3;
            min_cp = 0x10000;
        }
        else
        {
            // Stray continuation byte or an invalid lead byte (0xF8..0xFF):
            // skip exactly one byte so the loop always makes progress.
            ws += kReplacement;
            ++i;
            continue;
        }

        // Collect continuation bytes, stopping at the end of the input or at
        // the first byte that is not of the form 10xxxxxx.
        std::string::size_type used = 0;
        while (used < trail && i + 1 + used < len)
        {
            const unsigned char next = static_cast<unsigned char>(s[i + 1 + used]);
            if ((next & 0xC0) != 0x80)
            {
                break;
            }
            cp = (cp << 6) | (next & 0x3F);
            ++used;
        }
        i += 1 + used;

        const bool truncated = used < trail;
        const bool overlong = cp < min_cp;
        const bool surrogate = cp >= 0xD800 && cp <= 0xDFFF;
        if (truncated || overlong || surrogate || cp > 0x10FFFF)
        {
            ws += kReplacement;
            continue;
        }

        if (sizeof(wchar_t) == 2 && cp > 0xFFFF)
        {
            // UTF-16: split the supplementary code point into a surrogate pair.
            const unsigned long v = cp - 0x10000;
            ws += static_cast<wchar_t>(0xD800 + (v >> 10));
            ws += static_cast<wchar_t>(0xDC00 + (v & 0x3FF));
        }
        else
        {
            ws += static_cast<wchar_t>(cp);
        }
    }
    return ws;
}



Let's see whether it works:



So, the complete program to produce the above image:


#include <Windows.h>
#include <cstring>
#include <iostream>


// Sample text in several scripts. NOTE(review): the bytes are UTF-8 only if
// this source file is saved as UTF-8 and the compiler keeps narrow literals
// in that encoding - confirm for the toolchain in use.
std::string myutf8String = "Russian: советских\nJapanese: 私は学生です\nChinese: 你好\nTamil: ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು\nClassical Greek: ὕαλον ϕαγεῖν\nCzech: Mohu jíst sklo\nArabic:أنا قادر على أكل الزجاج و هذا لا يؤلمني.";
// Receives the converted text; assigned in main().
std::wstring myutf16;
// Forward declaration; the definition follows main().
std::wstring UTF8toUnicode(const std::string& s);

int main()
{
myutf16=UTF8toUnicode(myutf8String);
MessageBoxW(0, myutf16.c_str(), L"UTF-16", 0);

return 0;
}

using namespace std;

// Converts a UTF-8 encoded byte string into a wide string.
//
// Parameters:
//   s - input bytes, expected to be UTF-8 (embedded NUL bytes are copied as-is).
// Returns:
//   The decoded text. When wchar_t is 16 bits wide (e.g. on Windows) code
//   points above U+FFFF are emitted as UTF-16 surrogate pairs; when wchar_t
//   is wider, each code point is stored in a single element.
//
// Malformed input never causes an out-of-range read or an endless loop:
// every invalid, truncated, overlong or surrogate-encoding sequence is
// replaced by U+FFFD (REPLACEMENT CHARACTER). The obsolete 5- and 6-byte
// forms are not valid UTF-8 and are treated as invalid bytes.
std::wstring UTF8toUnicode(const std::string& s)
{
    const wchar_t kReplacement = static_cast<wchar_t>(0xFFFD);
    const std::string::size_type len = s.length();

    std::wstring ws;
    ws.reserve(len);  // the output never has more elements than input bytes

    std::string::size_type i = 0;
    while (i < len)
    {
        // Work on unsigned values so that bytes >= 0x80 are not sign-extended.
        const unsigned char lead = static_cast<unsigned char>(s[i]);

        if (lead < 0x80)
        {
            // Plain ASCII: one byte, one code unit.
            ws += static_cast<wchar_t>(lead);
            ++i;
            continue;
        }

        unsigned long cp = 0;                 // code point being assembled
        unsigned long min_cp = 0;             // smallest value legal for this length
        std::string::size_type trail = 0;     // continuation bytes expected

        if ((lead & 0xE0) == 0xC0)
        {
            cp = lead & 0x1F;
            trail = 1;
            min_cp = 0x80;
        }
        else if ((lead & 0xF0) == 0xE0)
        {
            cp = lead & 0x0F;
            trail = 2;
            min_cp = 0x800;
        }
        else if ((lead & 0xF8) == 0xF0)
        {
            cp = lead & 0x07;
            trail = 3;
            min_cp = 0x10000;
        }
        else
        {
            // Stray continuation byte or an invalid lead byte (0xF8..0xFF):
            // skip exactly one byte so the loop always makes progress.
            ws += kReplacement;
            ++i;
            continue;
        }

        // Collect continuation bytes, stopping at the end of the input or at
        // the first byte that is not of the form 10xxxxxx.
        std::string::size_type used = 0;
        while (used < trail && i + 1 + used < len)
        {
            const unsigned char next = static_cast<unsigned char>(s[i + 1 + used]);
            if ((next & 0xC0) != 0x80)
            {
                break;
            }
            cp = (cp << 6) | (next & 0x3F);
            ++used;
        }
        i += 1 + used;

        const bool truncated = used < trail;
        const bool overlong = cp < min_cp;
        const bool surrogate = cp >= 0xD800 && cp <= 0xDFFF;
        if (truncated || overlong || surrogate || cp > 0x10FFFF)
        {
            ws += kReplacement;
            continue;
        }

        if (sizeof(wchar_t) == 2 && cp > 0xFFFF)
        {
            // UTF-16: split the supplementary code point into a surrogate pair.
            const unsigned long v = cp - 0x10000;
            ws += static_cast<wchar_t>(0xD800 + (v >> 10));
            ws += static_cast<wchar_t>(0xDC00 + (v & 0x3FF));
        }
        else
        {
            ws += static_cast<wchar_t>(cp);
        }
    }
    return ws;
}



habran

#1
It has been implemented some time ago, look it up in String.c from line 99
It works flawlessly 8)
Cod-Father

HSE

Hi Atelier!

What problem do you see with doing it as a macro?
Equations in Assembly: SmplMath

jj2007

Quote from: habran on January 24, 2019, 09:57:09 PM
It has been implemented some time ago, look it up in String.c from line 99
It works flawlessly 8)

What's the trick then?

; Test case from the thread: try to declare wide (16-bit) strings with dw
; without any special assembler option, then show them in a message box.
; This version does NOT assemble - see the error noted on txTitle.
include \masm32\include\masm32rt.inc

.data
; dw defines 16-bit words; a multi-character string literal does not fit.
txTitle dw "Does it work?", 0 ; Error A2055: Initializer value too large
txHelloW dw "Привет, Мир!", 0

.code
start:
; Text = txHelloW, caption = txTitle.
; NOTE(review): MessageBox presumably resolves to the ANSI variant here,
; which would not match wide-string data - confirm against masm32rt.inc.
invoke MessageBox, 0, offset txHelloW, offset txTitle, MB_OK
exit

end start


Btw we had the discussion already in Summer 2017

habran

Declaring wide string data with dw will only happen with OPTION LITERALS:ON and using
command line switches –Zm or –Zne will disable this.
Cod-Father

jj2007

Yep it works, good to know :t

; Working variant of the wide-string test: with OPTION LITERALS:ON the
; string literals in the dw lines below are accepted as wide strings.
include \masm32\include\masm32rt.inc
; Per the thread, the -Zm or -Zne command line switches disable this option.
OPTION LITERALS:ON

.data
; Zero-terminated wide strings used as caption and message text.
txTitle dw "Does it work?", 0
txHelloW dw "Привет, Мир!", 0

.code
start:
; Wide-character message box: text = txHelloW, caption = txTitle.
invoke MessageBoxW, 0, offset txHelloW, offset txTitle, MB_OK
exit

end start

aw27

@habran
I was thinking of it in the form of an internal macro called on demand as needed  without the need for the OPTION LITERALS:ON. It would look more familiar to MASM users. But doing it through OPTION LITERALS:ON is already great.  :t

@HSE
I don't see any way of making the macro operators turn string characters into bytes (for numerical evaluation and comparison purposes) at assemble time. It may be easy, despite people having struggled with it for decades, but I believe it is impossible (at assemble time only, of course).

TimoVJL

#7
C++ example converted to C:
#define WIN32_LEAN_AND_MEAN
#include <windows.h>

#pragma comment(lib, "user32.lib")

/*
 * Converts a NUL-terminated UTF-8 string into a NUL-terminated wide string.
 *
 * s  - input, NUL-terminated, expected to be UTF-8.
 * ws - output buffer. The function has no size parameter, so the caller must
 *      provide room for one wchar_t per input byte plus the terminator
 *      (the output never needs more than that).
 *
 * Returns a pointer to the terminating NUL written into ws, i.e. one past
 * the last converted character.
 *
 * When wchar_t is 16 bits wide (Windows) code points above U+FFFF are written
 * as UTF-16 surrogate pairs. Invalid, truncated, overlong or
 * surrogate-encoding sequences - including the obsolete 5- and 6-byte forms -
 * are replaced by U+FFFD, so malformed input can neither hang the loop nor
 * make it read past the terminator.
 */
wchar_t *UTF8toUnicode(char *s, wchar_t *ws)
{
    /* Unsigned view of the input so bytes >= 0x80 are not sign-extended. */
    const unsigned char *p = (const unsigned char *)s;

    while (*p)
    {
        unsigned long cp = 0;      /* code point being assembled */
        unsigned long min_cp = 0;  /* smallest value legal for this length */
        int trail = 0;             /* continuation bytes expected */
        int got;

        if (*p < 0x80)
        {
            /* Plain ASCII. */
            *ws++ = (wchar_t)*p++;
            continue;
        }
        else if ((*p & 0xE0) == 0xC0)
        {
            cp = *p & 0x1F;
            trail = 1;
            min_cp = 0x80;
        }
        else if ((*p & 0xF0) == 0xE0)
        {
            cp = *p & 0x0F;
            trail = 2;
            min_cp = 0x800;
        }
        else if ((*p & 0xF8) == 0xF0)
        {
            cp = *p & 0x07;
            trail = 3;
            min_cp = 0x10000;
        }
        else
        {
            /* Stray continuation byte or invalid lead byte: skip one byte. */
            *ws++ = (wchar_t)0xFFFD;
            p++;
            continue;
        }
        p++; /* consume the lead byte */

        /* The terminating NUL is not a continuation byte, so this loop
           cannot run past the end of the input. */
        for (got = 0; got < trail && (*p & 0xC0) == 0x80; got++)
        {
            cp = (cp << 6) | (unsigned long)(*p++ & 0x3F);
        }

        if (got < trail || cp < min_cp || cp > 0x10FFFF ||
            (cp >= 0xD800 && cp <= 0xDFFF))
        {
            *ws++ = (wchar_t)0xFFFD;
            continue;
        }

        if (sizeof(wchar_t) == 2 && cp > 0xFFFF)
        {
            /* UTF-16 surrogate pair for supplementary code points. */
            unsigned long v = cp - 0x10000;
            *ws++ = (wchar_t)(0xD800 + (v >> 10));
            *ws++ = (wchar_t)(0xDC00 + (v & 0x3FF));
        }
        else
        {
            *ws++ = (wchar_t)cp;
        }
    }

    /* Terminate the output so it can be passed straight to wide-string APIs. */
    *ws = 0;
    return ws;
}

/*
 * CRT-less entry point: converts a multi-script UTF-8 sample string and shows
 * the result in a wide-character message box titled "UTF-16".
 */
void __cdecl mainCRTStartup(void)
{
    wchar_t wideText[200]; /* fixed-size destination for the converted text */
    char *utf8Text = u8"Russian: советских\nJapanese: 私は学生です\nChinese: 你好\nTamil: ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು\nClassical Greek: ὕαλον ϕαγεῖν\nCzech: Mohu jíst sklo\nArabic:أنا قادر على أكل الزجاج و هذا لا يؤلمني.";

    UTF8toUnicode(utf8Text, wideText);

    /* No owner window; MB_OK is the plain single-button style (value 0). */
    MessageBoxW(NULL, wideText, L"UTF-16", MB_OK);
}
msvc 2010 - 2013:
#pragma execution_character_set("utf-8")
May the source be with you

Abdel Hamid

the sentence in arabic : أنا قادر على أكل الزجاج و هذا لا يؤلمني
says : I am able to eat glass and this doesn't hurt me
it's a little bit funny  :bgrin:

HSE

Quote from: AW on January 25, 2019, 01:46:17 AM
I don't see any way of making the macro operators turn string characters into bytes (for numerical evaluation and comparison purposes) at assemble time. It may be easy, despite people having struggled with it for decades, but I believe it is impossible (at assemble time only, of course).
Perhaps it's not possible with the elementary macros we build, and you are right in that sense. But I don't think it is impossible with advanced macros, just boring.
Equations in Assembly: SmplMath

aw27

Quote from: HSE on January 25, 2019, 09:39:54 PM
Perhaps it's not possible with the elementary macros we build, and you are right in that sense. But I don't think it is impossible with advanced macros, just boring.
It is not a question of being boring; the people who produced a number of the macros I have seen in a few places in the masm32 SDK are vaccinated against boredom.

guga

Great work.

Does someone have a working example of UTF-8 to ANSI conversion, to convert things like:

A SaÃ­da dos OperÃ¡rios da FÃ¡brica LumiÃ¨re

to

A Saída dos Operários da Fábrica Lumière

?
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

jj2007

; MasmBasic example from the thread: prints a UTF-8 string (shown here as its
; raw bytes read in an 8-bit code page) as properly decoded text.
; The thread reports the output as: A Saída dos Operários da Fábrica Lumière
include \masm32\MasmBasic\MasmBasic.inc         ; download
  Init
  ; esi receives the UTF-8 encoded source string
  Let esi="A SaÃ­da dos OperÃ¡rios da FÃ¡brica LumiÃ¨re"
  ; NOTE(review): wRec$ presumably converts UTF-8 to UTF-16 and wPrint prints
  ; a wide string - confirm against the MasmBasic documentation.
  wPrint wRec$(esi)
EndOfCode


Output: A Saída dos Operários da Fábrica Lumière

guga

Tks, JJ.

Do you have a source example in MASM showing how I can implement it?
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

TimoVJL

If you don't care about code pages or the WinAPI string functions:
#define WIN32_LEAN_AND_MEAN
#include <windows.h>

#pragma comment(lib, "user32.lib")

/*
 * Converts a NUL-terminated UTF-8 string into a NUL-terminated single-byte
 * string without using any code-page API.
 *
 * s  - input, NUL-terminated, expected to be UTF-8.
 * as - output buffer. The function has no size parameter; the output is never
 *      longer than the input, so strlen(s) + 1 bytes are always enough.
 *
 * Returns a pointer to the terminating NUL written into as, i.e. one past
 * the last converted character.
 *
 * Each code point U+0000..U+00FF is stored as the byte of the same value
 * (ISO-8859-1 / Latin-1). Note that the Windows "ANSI" code page 1252 only
 * matches Latin-1 outside the 0x80..0x9F range. Code points above U+00FF
 * cannot be represented and are written as '?', as are invalid, truncated or
 * overlong sequences - so malformed input can neither hang the loop nor make
 * it read past the terminator.
 */
char *UTF8toANSI(char *s, char *as)
{
    /* Unsigned view of the input so bytes >= 0x80 are not sign-extended. */
    const unsigned char *p = (const unsigned char *)s;

    while (*p)
    {
        unsigned long cp = 0;      /* code point being assembled */
        unsigned long min_cp = 0;  /* smallest value legal for this length */
        int trail = 0;             /* continuation bytes expected */
        int got;

        if (*p < 0x80)
        {
            /* Plain ASCII. */
            *as++ = (char)*p++;
            continue;
        }
        else if ((*p & 0xE0) == 0xC0)
        {
            cp = *p & 0x1F;
            trail = 1;
            min_cp = 0x80;
        }
        else if ((*p & 0xF0) == 0xE0)
        {
            cp = *p & 0x0F;
            trail = 2;
            min_cp = 0x800;
        }
        else if ((*p & 0xF8) == 0xF0)
        {
            cp = *p & 0x07;
            trail = 3;
            min_cp = 0x10000;
        }
        else
        {
            /* Stray continuation byte or invalid lead byte: skip one byte. */
            *as++ = '?';
            p++;
            continue;
        }
        p++; /* consume the lead byte */

        /* The terminating NUL is not a continuation byte, so this loop
           cannot run past the end of the input. */
        for (got = 0; got < trail && (*p & 0xC0) == 0x80; got++)
        {
            cp = (cp << 6) | (unsigned long)(*p++ & 0x3F);
        }

        if (got < trail || cp < min_cp || cp > 0xFF)
        {
            /* Malformed, or not representable in a single Latin-1 byte. */
            *as++ = '?';
        }
        else
        {
            *as++ = (char)(unsigned char)cp;
        }
    }

    /* Terminate the output so it can be passed straight to string APIs. */
    *as = '\0';
    return as;
}

/*
 * CRT-less entry point: converts a UTF-8 sample string to single-byte text
 * and shows it in an ANSI message box titled "ANSI".
 */
void __cdecl mainCRTStartup(void)
{
    char ansiText[200]; /* fixed-size destination for the converted text */
    /* Alternative kept from the original post, using an explicit UTF-8 literal:
       char *myutf8String = u8"A Saída dos Operários da Fábrica Lumière"; */
    char *utf8Text = "A Saída dos Operários da Fábrica Lumière";

    UTF8toANSI(utf8Text, ansiText);

    /* No owner window; MB_OK is the plain single-button style (value 0). */
    MessageBoxA(NULL, ansiText, "ANSI", MB_OK);
}

Otherwise, use MultiByteToWideChar() with CP_UTF8 and convert back to ANSI with WideCharToMultiByte().
May the source be with you