Since the last topic was about macros...
In my opinion, a really revolutionary internal MACRO that UASM should have would be a UTF-8 to UTF-16 macro.
As we know, it is impossible to make a regular UTF-8 to UTF-16 macro that changes the data section at assemble time. What people have done so far are macros that convert ASCII characters to words and call these Unicode conversion macros, but they are not. :eusa_naughty:
The general algorithm in C++ is pretty simple, but again, impossible to convert to an assemble-time ASM macro. I copied it from here: https://gist.github.com/rechardchen/3321830
/// Decode a UTF-8 byte string into a wide string.
///
/// Where wchar_t is 16 bits wide (e.g. Windows) code points above U+FFFF are
/// written as UTF-16 surrogate pairs; where wchar_t is wider, every code point
/// occupies a single element.
///
/// Malformed input never stalls the loop and never reads past the end of `s`:
/// each byte that cannot start or complete a valid sequence (stray
/// continuation byte, truncated sequence, overlong form, encoded surrogate,
/// value above U+10FFFF, or one of the obsolete lead bytes 0xF8..0xFF)
/// yields one U+FFFD REPLACEMENT CHARACTER and decoding resumes at the next
/// byte.
///
/// @param s  UTF-8 encoded bytes (embedded NUL bytes are passed through).
/// @return   The decoded wide string.
std::wstring UTF8toUnicode(const std::string& s)
{
    const wchar_t kReplacement = static_cast<wchar_t>(0xFFFD);
    const std::string::size_type len = s.length();

    std::wstring ws;
    ws.reserve(len);  // the output never has more elements than the input has bytes

    std::string::size_type i = 0;
    while (i < len)
    {
        const unsigned char lead = static_cast<unsigned char>(s[i]);
        unsigned long cp = 0;               // code point being assembled
        unsigned long min_cp = 0;           // smallest value legal for this length
        std::string::size_type extra = 0;   // continuation bytes that must follow
        bool ok = true;

        if (lead < 0x80)
        {
            cp = lead;
        }
        else if ((lead & 0xE0) == 0xC0)
        {
            cp = lead & 0x1Fu; extra = 1; min_cp = 0x80;
        }
        else if ((lead & 0xF0) == 0xE0)
        {
            cp = lead & 0x0Fu; extra = 2; min_cp = 0x800;
        }
        else if ((lead & 0xF8) == 0xF0)
        {
            cp = lead & 0x07u; extra = 3; min_cp = 0x10000UL;
        }
        else
        {
            // Continuation byte in lead position, or 0xF8..0xFF (not valid UTF-8).
            ok = false;
        }

        // The whole sequence must lie inside the string.
        if (ok && len - i <= extra)
            ok = false;

        for (std::string::size_type k = 1; ok && k <= extra; ++k)
        {
            const unsigned char cont = static_cast<unsigned char>(s[i + k]);
            if ((cont & 0xC0) != 0x80)
                ok = false;
            else
                cp = (cp << 6) | (cont & 0x3Fu);
        }

        // Reject overlong encodings, UTF-16 surrogates and out-of-range values.
        if (ok && (cp < min_cp || cp > 0x10FFFFUL || (cp >= 0xD800 && cp <= 0xDFFF)))
            ok = false;

        if (!ok)
        {
            ws += kReplacement;
            ++i;  // resynchronise on the very next byte
            continue;
        }

        i += extra + 1;

        if (cp > 0xFFFF && sizeof(wchar_t) == 2)
        {
            // 16-bit wchar_t: split into a UTF-16 surrogate pair.
            cp -= 0x10000UL;
            ws += static_cast<wchar_t>(0xD800 + (cp >> 10));
            ws += static_cast<wchar_t>(0xDC00 + (cp & 0x3FF));
        }
        else
        {
            ws += static_cast<wchar_t>(cp);
        }
    }
    return ws;
}
Let's see whether it works:
(https://www.dropbox.com/s/rhv72pip8ok111z/utf16.png?dl=1)
So, the complete program to produce the above image:
#include <Windows.h>
#include <cstring>
#include <iostream>
// Sample text for the demo, one line per script.
// NOTE(review): this is a narrow string literal, so the bytes handed to
// UTF8toUnicode are UTF-8 only if the compiler's execution character set is
// UTF-8 (source saved as UTF-8 and built accordingly) — confirm build settings.
// NOTE(review): the sample labelled "Tamil" appears to be in Kannada script — verify.
std::string myutf8String = "Russian: советских\nJapanese: 私は学生です\nChinese: 你好\nTamil: ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು\nClassical Greek: ὕαλον ϕαγεῖν\nCzech: Mohu jíst sklo\nArabic:أنا قادر على أكل الزجاج و هذا لا يؤلمني.";
// Receives the converted wide text; assigned in main().
std::wstring myutf16;
// Forward declaration; the definition follows main().
std::wstring UTF8toUnicode(const std::string& s);
int main()
{
myutf16=UTF8toUnicode(myutf8String);
MessageBoxW(0, myutf16.c_str(), L"UTF-16", 0);
return 0;
}
using namespace std;
/// Decode a UTF-8 byte string into a wide string.
///
/// Where wchar_t is 16 bits wide (e.g. Windows) code points above U+FFFF are
/// written as UTF-16 surrogate pairs; where wchar_t is wider, every code point
/// occupies a single element.
///
/// Malformed input never stalls the loop and never reads past the end of `s`:
/// each byte that cannot start or complete a valid sequence (stray
/// continuation byte, truncated sequence, overlong form, encoded surrogate,
/// value above U+10FFFF, or one of the obsolete lead bytes 0xF8..0xFF)
/// yields one U+FFFD REPLACEMENT CHARACTER and decoding resumes at the next
/// byte.
///
/// @param s  UTF-8 encoded bytes (embedded NUL bytes are passed through).
/// @return   The decoded wide string.
std::wstring UTF8toUnicode(const std::string& s)
{
    const wchar_t kReplacement = static_cast<wchar_t>(0xFFFD);
    const std::string::size_type len = s.length();

    std::wstring ws;
    ws.reserve(len);  // the output never has more elements than the input has bytes

    std::string::size_type i = 0;
    while (i < len)
    {
        const unsigned char lead = static_cast<unsigned char>(s[i]);
        unsigned long cp = 0;               // code point being assembled
        unsigned long min_cp = 0;           // smallest value legal for this length
        std::string::size_type extra = 0;   // continuation bytes that must follow
        bool ok = true;

        if (lead < 0x80)
        {
            cp = lead;
        }
        else if ((lead & 0xE0) == 0xC0)
        {
            cp = lead & 0x1Fu; extra = 1; min_cp = 0x80;
        }
        else if ((lead & 0xF0) == 0xE0)
        {
            cp = lead & 0x0Fu; extra = 2; min_cp = 0x800;
        }
        else if ((lead & 0xF8) == 0xF0)
        {
            cp = lead & 0x07u; extra = 3; min_cp = 0x10000UL;
        }
        else
        {
            // Continuation byte in lead position, or 0xF8..0xFF (not valid UTF-8).
            ok = false;
        }

        // The whole sequence must lie inside the string.
        if (ok && len - i <= extra)
            ok = false;

        for (std::string::size_type k = 1; ok && k <= extra; ++k)
        {
            const unsigned char cont = static_cast<unsigned char>(s[i + k]);
            if ((cont & 0xC0) != 0x80)
                ok = false;
            else
                cp = (cp << 6) | (cont & 0x3Fu);
        }

        // Reject overlong encodings, UTF-16 surrogates and out-of-range values.
        if (ok && (cp < min_cp || cp > 0x10FFFFUL || (cp >= 0xD800 && cp <= 0xDFFF)))
            ok = false;

        if (!ok)
        {
            ws += kReplacement;
            ++i;  // resynchronise on the very next byte
            continue;
        }

        i += extra + 1;

        if (cp > 0xFFFF && sizeof(wchar_t) == 2)
        {
            // 16-bit wchar_t: split into a UTF-16 surrogate pair.
            cp -= 0x10000UL;
            ws += static_cast<wchar_t>(0xD800 + (cp >> 10));
            ws += static_cast<wchar_t>(0xDC00 + (cp & 0x3FF));
        }
        else
        {
            ws += static_cast<wchar_t>(cp);
        }
    }
    return ws;
}
It has been implemented some time ago, look it up in String.c from line 99
It works flawlessly 8)
Hi Atelier!
What problem for a macro you see?
Quote from: habran on January 24, 2019, 09:57:09 PM
It has been implemented some time ago, look it up in String.c from line 99
It works flawlessly 8)
What's the trick then?
; Demonstration of the problem: a plain `dw "text"` initializer is rejected
; by the assembler (see the error noted on txTitle).
include \masm32\include\masm32rt.inc
.data
txTitle dw "Does it work?", 0 ; Error A2055: Initializer value too large
txHelloW dw "Привет, Мир!", 0
.code
start:
; NOTE(review): this invokes MessageBox rather than MessageBoxW although the
; strings are declared with dw — presumably the ANSI variant; confirm.
invoke MessageBox, 0, offset txHelloW, offset txTitle, MB_OK
exit
end start
Btw we had the discussion already in Summer 2017 (http://masm32.com/board/index.php?topic=6435.0)
Declaring wide string data with dw will only happen with OPTION LITERALS:ON and using
command line switches –Zm or –Zne will disable this.
Yep it works, good to know :t
; Working variant: with OPTION LITERALS:ON the `dw "text"` declarations below
; are accepted as word-sized (wide) string data.
include \masm32\include\masm32rt.inc
OPTION LITERALS:ON
.data
txTitle dw "Does it work?", 0
txHelloW dw "Привет, Мир!", 0
.code
start:
; Wide-character API, to match the dw-declared strings above.
invoke MessageBoxW, 0, offset txHelloW, offset txTitle, MB_OK
exit
end start
@habran
I was thinking of it in the form of an internal macro called on demand as needed without the need for the OPTION LITERALS:ON. It would look more familiar to MASM users. But doing it through OPTION LITERALS:ON is already great. :t
@HSE
I don't see any way of making the macro operators turn string characters into bytes (for numerical evaluation and comparison purposes) at assemble time. It may be easy, despite people having struggled with that for decades, but I believe it is impossible (at assemble time only, of course).
C++ example converted to C:
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#pragma comment(lib, "user32.lib")
/*
 * Decode the NUL-terminated UTF-8 string `s` into the wide buffer `ws`.
 *
 * Where wchar_t is 16 bits wide (e.g. Windows) code points above U+FFFF are
 * written as UTF-16 surrogate pairs; otherwise one element per code point.
 * Any byte that cannot start or complete a valid sequence (stray continuation
 * byte, truncated sequence, overlong form, encoded surrogate, value above
 * U+10FFFF, lead bytes 0xF8..0xFF) produces one U+FFFD and decoding resumes
 * at the next byte, so malformed input can neither stall the loop nor make it
 * read beyond the terminator of `s`.
 *
 * The output is always NUL-terminated. `ws` must have room for one element
 * per input byte plus the terminator (the output never needs more).
 *
 * Parameters:
 *   s   NUL-terminated UTF-8 input (not modified).
 *   ws  destination buffer.
 * Returns a pointer to the terminating NUL written to the destination, i.e.
 * one past the last converted element. If `s` or `ws` is a null pointer,
 * nothing is written and `ws` is returned unchanged.
 */
wchar_t *UTF8toUnicode(char *s, wchar_t *ws)
{
    const unsigned char *p = (const unsigned char *)s;

    if (s == 0 || ws == 0)
        return ws;

    while (*p)
    {
        unsigned long cp = 0;      /* code point being assembled */
        unsigned long min_cp = 0;  /* smallest value legal for this length */
        int extra = 0;             /* continuation bytes that must follow */
        int ok = 1;
        int k;

        if (*p < 0x80)
        {
            cp = *p;
        }
        else if ((*p & 0xE0) == 0xC0)
        {
            cp = *p & 0x1F; extra = 1; min_cp = 0x80;
        }
        else if ((*p & 0xF0) == 0xE0)
        {
            cp = *p & 0x0F; extra = 2; min_cp = 0x800;
        }
        else if ((*p & 0xF8) == 0xF0)
        {
            cp = *p & 0x07; extra = 3; min_cp = 0x10000UL;
        }
        else
        {
            /* continuation byte in lead position, or 0xF8..0xFF */
            ok = 0;
        }

        /* The terminating NUL is not a continuation byte, so this check also
           stops the scan at the end of a truncated string. */
        for (k = 1; ok && k <= extra; ++k)
        {
            if ((p[k] & 0xC0) != 0x80)
                ok = 0;
            else
                cp = (cp << 6) | (unsigned long)(p[k] & 0x3F);
        }

        /* Reject overlong encodings, surrogates and out-of-range values. */
        if (ok && (cp < min_cp || cp > 0x10FFFFUL || (cp >= 0xD800 && cp <= 0xDFFF)))
            ok = 0;

        if (!ok)
        {
            *ws++ = (wchar_t)0xFFFD;
            ++p;  /* resynchronise on the very next byte */
            continue;
        }

        p += extra + 1;

        if (cp > 0xFFFF && sizeof(wchar_t) == 2)
        {
            /* 16-bit wchar_t: split into a UTF-16 surrogate pair */
            cp -= 0x10000UL;
            *ws++ = (wchar_t)(0xD800 + (cp >> 10));
            *ws++ = (wchar_t)(0xDC00 + (cp & 0x3FF));
        }
        else
        {
            *ws++ = (wchar_t)cp;
        }
    }

    *ws = 0;
    return ws;
}
/*
 * Entry point: converts the UTF-8 sample text to wide characters and shows it
 * in a message box.
 *
 * NOTE(review): the destination has a fixed capacity of 200 elements; the
 * sample text must decode to fewer than that (terminator included).
 */
void __cdecl mainCRTStartup(void)
{
    char *myutf8String = u8"Russian: советских\nJapanese: 私は学生です\nChinese: 你好\nTamil: ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು\nClassical Greek: ὕαλον ϕαγεῖν\nCzech: Mohu jíst sklo\nArabic:أنا قادر على أكل الزجاج و هذا لا يؤلمني.";
    wchar_t myutf16[200];
    wchar_t *end;

    end = UTF8toUnicode(myutf8String, myutf16);
    /* The buffer is uninitialised stack memory and MessageBoxW expects a
       NUL-terminated string, so terminate explicitly at the position returned
       by the converter (one past the last converted element). */
    *end = 0;
    MessageBoxW(0, myutf16, L"UTF-16", 0);
}
msvc 2010 - 2013:
#pragma execution_character_set("utf-8")
the sentence in arabic : أنا قادر على أكل الزجاج و هذا لا يؤلمني
says : I am able to eat glass and this doesn't hurt me
it's a little bit funny :bgrin:
Quote from: AW on January 25, 2019, 01:46:17 AM
I don't see any way of making the macro operators turn string characters into bytes (for numerical evaluation and comparison purposes) at assemble time. It may be easy, despite people being struggling with that for decades, but I believe it is impossible (at assemble time only, of course).
Perhaps it's not possible with elemental macros we build, and you are right in that sense. But I don't think is impossible with advanced macros, just boring.
Quote from: HSE on January 25, 2019, 09:39:54 PM
Perhaps it's not possible with elemental macros we build, and you are right in that sense. But I don't think is impossible with advanced macros, just boring.
It is not a question of being boring; the people who produced a number of the macros I have seen in a few places in the masm32 SDK are vaccinated against boredom.
Great work.
Does someone have a working example of UTF-8 to ANSI conversion, to convert things like:
A SaÃda dos Operários da Fábrica Lumière
to
A Saída dos Operários da Fábrica Lumière
?
; MasmBasic one-off: prints the UTF-8 encoded sample held in esi.
include \masm32\MasmBasic\MasmBasic.inc ; download (http://masm32.com/board/index.php?topic=94.0)
Init
; The literal holds the UTF-8 bytes as they appear when viewed in a single-byte code page.
Let esi="A SaÃda dos Operários da Fábrica Lumière"
; NOTE(review): wRec$ presumably converts the UTF-8 bytes to UTF-16 for wPrint — confirm in the MasmBasic docs.
wPrint wRec$(esi)
EndOfCode
Output: A Saída dos Operários da Fábrica Lumière
Tks, JJ.
Do you have a source example in masm how can i implement it ?
If you don't care about the codepage or the WinAPI string functions:
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#pragma comment(lib, "user32.lib")
/*
 * Convert the NUL-terminated UTF-8 string `s` to single-byte text in `as`.
 *
 * No code page is consulted: a code point in the range U+0000..U+00FF is
 * stored as the byte with the same value (i.e. Latin-1), and everything else
 * is stored as '?'. Malformed input (stray continuation byte, truncated
 * sequence, overlong form, encoded surrogate, value above U+10FFFF, lead
 * bytes 0xF8..0xFF) also produces one '?' per offending byte, so bad input
 * can neither stall the loop nor make it read beyond the terminator of `s`.
 * For a real ANSI code-page conversion use MultiByteToWideChar(CP_UTF8)
 * followed by WideCharToMultiByte(CP_ACP).
 *
 * The output is always NUL-terminated. `as` must have room for one byte per
 * input byte plus the terminator (the output never needs more).
 *
 * Parameters:
 *   s   NUL-terminated UTF-8 input (not modified).
 *   as  destination buffer.
 * Returns a pointer to the terminating NUL written to the destination, i.e.
 * one past the last converted byte. If `s` or `as` is a null pointer, nothing
 * is written and `as` is returned unchanged.
 */
char *UTF8toANSI(char *s, char *as)
{
    const unsigned char *p = (const unsigned char *)s;

    if (s == 0 || as == 0)
        return as;

    while (*p)
    {
        unsigned long cp = 0;      /* code point being assembled */
        unsigned long min_cp = 0;  /* smallest value legal for this length */
        int extra = 0;             /* continuation bytes that must follow */
        int ok = 1;
        int k;

        if (*p < 0x80)
        {
            cp = *p;
        }
        else if ((*p & 0xE0) == 0xC0)
        {
            cp = *p & 0x1F; extra = 1; min_cp = 0x80;
        }
        else if ((*p & 0xF0) == 0xE0)
        {
            cp = *p & 0x0F; extra = 2; min_cp = 0x800;
        }
        else if ((*p & 0xF8) == 0xF0)
        {
            cp = *p & 0x07; extra = 3; min_cp = 0x10000UL;
        }
        else
        {
            /* continuation byte in lead position, or 0xF8..0xFF */
            ok = 0;
        }

        /* The terminating NUL is not a continuation byte, so this check also
           stops the scan at the end of a truncated string. */
        for (k = 1; ok && k <= extra; ++k)
        {
            if ((p[k] & 0xC0) != 0x80)
                ok = 0;
            else
                cp = (cp << 6) | (unsigned long)(p[k] & 0x3F);
        }

        /* Reject overlong encodings, surrogates and out-of-range values. */
        if (ok && (cp < min_cp || cp > 0x10FFFFUL || (cp >= 0xD800 && cp <= 0xDFFF)))
            ok = 0;

        if (!ok)
        {
            *as++ = '?';
            ++p;  /* resynchronise on the very next byte */
            continue;
        }

        p += extra + 1;

        /* Only U+0000..U+00FF fit in one byte; anything else is unmappable. */
        *as++ = (cp <= 0xFF) ? (char)(unsigned char)cp : '?';
    }

    *as = 0;
    return as;
}
/*
 * Entry point: converts the UTF-8 sample text to single-byte text and shows
 * it in an ANSI message box.
 *
 * NOTE(review): the destination has a fixed capacity of 200 bytes; the sample
 * text must convert to fewer than that (terminator included).
 */
void __cdecl mainCRTStartup(void)
{
    //char *myutf8String = u8"A Saída dos Operários da Fábrica Lumière";
    /* UTF-8 bytes of the text above, spelled as explicit escapes. The previous
       literal only contained these bytes when the source file was saved in a
       single-byte code page (and it relied on an invisible soft-hyphen
       character, 0xAD, surviving every editor and copy/paste). */
    char *myutf8String = "A Sa\xC3\xAD" "da dos Oper\xC3\xA1rios da F\xC3\xA1" "brica Lumi\xC3\xA8re";
    char myANSI[200];
    char *end;

    end = UTF8toANSI(myutf8String, myANSI);
    /* The buffer is uninitialised stack memory and MessageBoxA expects a
       NUL-terminated string, so terminate explicitly at the position returned
       by the converter (one past the last converted byte). */
    *end = 0;
    MessageBoxA(0, myANSI, "ANSI", 0);
}
otherwise MultiByteToWideChar() using CP_UTF8 and back to ANSI with WideCharToMultiByte()
Better would be using MultiByteToWideChar since the text contains chars not used in latin/portuguese
Do you have some example of it using MultiByteToWideChar and WideCharToMultiByte
I built one years ago for AnsitoUTF8, but I never did the reverse operation and don't know how to do it:
The AnsitoUTF8, i ported was like this (RosAsm syntax):
; AnsitoUTF8Masm: converts an ANSI string to UTF-8 by way of UTF-16
; (MultiByteToWideChar with CP_ACP, then WideCharToMultiByte with CP_UTF8).
;   @pszAscii  source string; its length is measured with StrLenProc
;   @pszUTF8   destination argument; it is handed to RosMem.VMemAlloc and later
;              dereferenced, so it presumably receives the address of the
;              allocated output buffer - TODO confirm VMemAlloc's contract
;   @BomFlag   when &TRUE, 3 extra bytes are reserved and a UTF-8 BOM is written first
; Returns eax = 0 on the early exits for a null argument, an empty input or a
; failed first allocation; otherwise eax = @lenUTF8Result (the UTF-8 length,
; plus 3 when a BOM was requested).
Proc AnsitoUTF8Masm:
Arguments @pszAscii, @pszUTF8, @BomFlag
Local @lenASCII, @lenUCS2, @lenUTF8, @pszUCS2, @pUnicode, @LenCharString, @lenUTF8Result
Uses ecx, edi, edx
; eax = 0 is the return value for the two argument checks below
xor eax eax
On D@pszAscii = 0, ExitP
On D@pszUTF8 = 0, ExitP
mov edi D@pszUTF8
; length of pszUTF8 must be enough; its maximum is (lenASCII*3 + 1)
call StrLenProc D@pszAscii
If eax = 0
mov B$edi 0 | ExitP
End_If
; eax = len*2 + len + 1 = len*3 + 1, the size requested for the temporary buffer
mov ecx eax
shl eax 1
add eax ecx
inc eax
; NOTE(review): the first argument here is edi (= @pszUTF8), the same value
; passed to the second allocation further down - confirm this is intended.
call 'RosMem.VMemAlloc' edi, eax
If eax = 0
mov B$edi 0 | ExitP
End_If
mov D@pUnicode eax
mov D@LenCharString ecx
; ecx = source length + 1, passed as the capacity of the wide buffer
inc ecx
call 'KERNEL32.MultiByteToWideChar' &CP_ACP, &MB_PRECOMPOSED, D@pszASCII, D@LenCharString, D@pUnicode, ecx
mov D@lenUCS2 eax
; UTF8Length is defined elsewhere; its result is used as the UTF-8 byte count
call UTF8Length D@pUnicode, eax
mov D@lenUTF8 eax
If D@BomFlag = &TRUE
add eax 3
End_If
mov D@lenUTF8Result eax
mov edi D@pszUTF8
call 'RosMem.VMemAlloc' D@pszUTF8, eax
If eax = 0
mov B$edi 0
call 'RosMem.VMemFree' D@pUnicode | ExitP
End_If
If D@BomFlag = &TRUE
; stores the bytes EF BB BF (UTF-8 BOM) followed by a zero byte, then skips the 3 BOM bytes
mov D$eax 0BFBBEF
add eax 3
End_If
;length of pszUTF8 must be >= (lenUTF8 + 1)
; NOTE(review): the wide-character count passed here is @LenCharString (the
; ANSI length), not @lenUCS2 returned by MultiByteToWideChar - verify.
call 'KERNEL32.WideCharToMultiByte' &CP_UTF8, 0, D@pUnicode, D@LenCharString, eax, D@lenUTF8, 0, 0
call 'RosMem.VMemFree' D@pUnicode
mov eax D@lenUTF8Result
; loads the dword stored at [@pszUTF8] (presumably the buffer address written
; by VMemAlloc - confirm) and writes the terminating zero at offset eax
mov ecx D$edi
add ecx eax | mov B$ecx 0
EndP
Ok, guys... I guess I succeeded in porting it. I don't know if it is the proper way though :icon_rolleyes: :icon_rolleyes:
It should use the BOM flag (containing 0BFBBEF at the beginning), but I didn't find an example that implements it. So, here the @BomFlag argument is unused.
; UTF8toAnsi: converts a UTF-8 string to the ANSI code page by way of UTF-16
; (MultiByteToWideChar with CP_UTF8, then WideCharToMultiByte with CP_ACP).
;   @pszUTF8   source string; its length is measured with StrLenProc
;   @pszAscii  caller-supplied destination buffer
;   @BomFlag   not referenced anywhere in this procedure
; Returns eax = 0 on the early exits (null argument, empty input, failed
; allocation); otherwise eax = the byte count returned by WideCharToMultiByte.
Proc UTF8toAnsi:
Arguments @pszUTF8, @pszAscii, @BomFlag
Local @lenUTF8, @pUnicode, @lenUnicode, @pTempUnicode
Uses ecx, edx, edi, ebx
; eax = 0 is the return value for the two argument checks below
xor eax eax
On D@pszAscii = 0, ExitP
On D@pszUTF8 = 0, ExitP
mov edi D@pszAscii
; pszAscii must be large enough: the WideCharToMultiByte call below is given a
; fixed capacity of 256 bytes and a terminating zero is written after the result
call StrLenProc D@pszUTF8
If eax = 0
mov B$edi 0 | ExitP
End_If
mov ecx eax
mov D@lenUTF8 ecx
; first pass with an output size of 0: the call returns the required number of
; wide characters (@pUnicode is still unset at this point)
call 'KERNEL32.MultiByteToWideChar' &CP_UTF8, 0, D@pszUTF8, D@lenUTF8, D@pUnicode, 0
mov D@lenUnicode eax
mov ecx eax
; bytes to allocate = wide characters * 2
shl eax 1
; edi now points at the local @pTempUnicode, which is cleared and passed to the allocator
lea edi D@pTempUnicode | mov D$edi 0
call 'RosMem.VMemAlloc' edi, eax
If eax = 0
; NOTE(review): edi points at @pTempUnicode here, not at @pszAscii, so the
; output buffer is left untouched on this path - verify intent.
mov B$edi 0 | ExitP
End_If
mov D@pUnicode eax
call 'KERNEL32.MultiByteToWideChar' &CP_UTF8, 0, D@pszUTF8, D@lenUTF8, D@pUnicode, D@lenUnicode
; NOTE(review): the output capacity is hard-coded to 256 bytes.
call 'KERNEL32.WideCharToMultiByte' &CP_ACP, 0, D@pUnicode, D@lenUnicode, D@pszAscii, 256, 0, 0
; keep the converted length in ebx across the free call
mov ebx eax
call 'RosMem.VMemFree' D@pUnicode
; terminate the output at pszAscii + converted length
mov edi D@pszAscii
add edi ebx | mov B$edi 0
mov eax ebx
EndP
Example of usage:
; 260-byte zero-filled output buffer (UTF8toAnsi caps its output at 256 bytes and adds a terminator)
[GugaBuffer: B$ 0 #260]
; arguments: inline source string, output buffer, BomFlag = 0
call UTF8toAnsi {B$ "A SaÃda dos Operários da Fábrica Lumière", 0}, GugaBuffer, 0
eax will return the length of the converted string
and
"GugaBuffer" holds the converted string
I adapted the code from here:
http://www.masmforum.com/board/index.php?PHPSESSID=8d46cd4ecb1688be429ab49694ec53e6&topic=6507.0;wap2
But..not sure if it is the proper way :(
Quote from: guga on April 14, 2019, 11:56:43 PM
Tks, JJ.
Do you have a source example in masm how can i implement it ?
As Timo wrote, MultiByteToWideChar() using CP_UTF8 and back to ANSI with WideCharToMultiByte()
My routine uses wPrint, which uses UTF-16 under the hood. As you can see, there are several ways to do it 8)
A bigger problem is that not all editors save their files as UTF8 or UTF16, and some editors need a BOM to recognise UTF8/UTF16, others don't need one, and others choke if they see a BOM. It's a mess :P
Tks JJ
QuoteA bigger problem is that not all editors save their files as UTF8 or UTF16, and some editors need a BOM to recognise UTF8/UTF16, others don't need one, and others choke if they see a BOM. It's a mess :P
Yeah, that's a true mess. I tested Timo's routine and ported the ones from the link to RosAsm. It worked, but it was hell to identify which codepage was actually being used. I'm trying to export huge amounts of text to ANSI so I can translate them to Portuguese later, but the encodings they use are a mess. Sometimes they use UTF-8, and other times I have no idea what kind of encoding they use :icon_mrgreen: :icon_mrgreen: :icon_mrgreen:
I succeeded to convert all of them using notepad, though. I opened them on NotePad and simply exportd it to UTF8. After that, i checked the text on the routine i ported, and so far, it was ok. But, a huge headache porting and analyzing almost 500 Mb of plain text. Ouch ! :bgrin: :bgrin: :bgrin:
Btw...if someone knows a free and good translator from english to portuguese, please let me know. It can be opensource etc. All is needed is open a huge document (in english) and translate it to portuguese (brazilian portuguese).
I´m searching on google but all the apps i found so far uses google translator rather then a offline software, and it will take ages to translate all this monster documents if i have to depend on online translations using google
Post a handful of your docs, maybe I can automate the process. Which codepage are they normally? Mostly Portuguese, sometimes Utf8?
Re Google: try DeepL (https://www.deepl.com/translator), it's better than Google Translate. Free trial for the offline version, it seems.
P.S., for the conversion:
; Batch conversion sketch: each *.asm file found is read, passed through
; ConvertCp$ and written back next to the original with a ".utf8" suffix.
include \masm32\MasmBasic\MasmBasic.inc ; download (http://masm32.com/board/index.php?topic=94.0)
Init
GfNoRecurse= 1 ; if you want subfolders, comment out this line
GetFiles *.asm
deb 4, "files found", eax
; Min(9, eax-1) limits the run to at most the first 10 files (indices 0..9)
; NOTE(review): assumes eax still holds the file count from GetFiles here — confirm.
For_ ebx=0 To Min(9, eax-1)
; NOTE(review): presumably converts from code page 860 to UTF-8 — confirm the
; argument order against the MasmBasic documentation.
Let esi=ConvertCp$(FileRead$(Files$(ebx)), 860, CP_UTF8)
FileWrite Cat$(Files$(ebx)+".utf8"), esi
Next
EndOfCode
Source & exe attached. Note the Min(9, eax).
Tks a lot, JJ
I'll give it a try (in your app and on DeepL)
I uploaded the file if you want to give it a test
https://we.tl/t-LQjjMocOvR
The format seems to be UTF-8 (if Notepad converted it properly).
Quote from: guga on April 16, 2019, 05:52:35 AMThe formart seems to be in UTF8 (If notepad converted it properly).
The format seems to be plain English, and it is indeed codepage UTF8.
You wrote you had plenty of small files with unknown codepages. This is the opposite, one big file with a known codepage. What do you want to achieve?
Hi JJ.
I need to translate a huge file to portuguese. I managed to fix that utf8 problem simply opening and saving the file to notepad. Now i have a text file with 45 Mb that i need to translate from english to portuguese at once.
The deepl site has a tiny limit and after one translation i´m not allowed to do again :(
I could, however try creating a small app to translate directly from google translator (considering thee limit of 5000 chars per translation), but it probably would take a long time to translate the text, since it is huge :(
What about the free trial? https://www.deepl.com/pro.html#pricing
Couldn't do it. I tried and it stopped working after translating only 1 MB
"Data Confidentiality
Your texts are deleted immediately after you've received the translation
Use the Web Translator without limits
Translate as much text as you like with your single-user license
5 document translations per month
Faster document translation via the Web Translator; fully-editable files"
Since the file is about 45 MB (text file), it won't help that much :(
I'm succeeding in making it translate through the Google Translate API in a routine I made, but I'm afraid it will take an endless time to finish :(