The MASM Forum

Projects => Rarely Used Projects => Archival JWASM postings => Topic started by: habran on April 27, 2014, 11:57:41 AM

Title: JWasm212pre reloaded
Post by: habran on April 27, 2014, 11:57:41 AM
Hi everyone :biggrin:

I have here my version of JWasm212pre 8)
what is the difference? ::)
  1. built in .for - .endfor hll
  2. sophisticated RSP stack base for 64 bit   
  3. more compact stack usage
  4. first variable is always aligned to 16 bytes
  5. if there is no invoke it will not allocate shadow space
  6. if there are no locals and no invoke there is no need to allocate the stack
 

to use 64 bit with this version you need to use 'option win64 : 11'  (eleven)

option casemap : none
option win64 : 11
option frame : auto
option stackbase : rsp


here are some examples to speak for themselves:


test1 PROC FRAME param : QWORD        ;this function has no locals but uses param
000000013FCE1060 48 89 4C 24 08       mov         qword ptr [param],rcx  ;param is saved 
  .if (param && rax == rcx)                            ;but no need to alocate the stack
000000013FCE1065 48 83 7C 24 08 00    cmp         qword ptr [param],0 
000000013FCE106B 74 0E                je          test1+1Bh (013FCE107Bh) 
000000013FCE106D 48 3B C1             cmp         rax,rcx 
000000013FCE1070 75 09                jne         test1+1Bh (013FCE107Bh) 
    mov rdx,1
000000013FCE1072 48 C7 C2 01 00 00 00 mov         rdx,1 
  .else
000000013FCE1079 EB 05                jmp         test1+20h (013FCE1080h) 
    mov rax,param
000000013FCE107B 48 8B 44 24 08       mov         rax,qword ptr [param] 
  .endif
ret
000000013FCE1080 C3                   ret 
test1 ENDP
;---------------------------------------------------------------------------------
test1 PROC FRAME param : QWORD       ;this function has no locals but uses rcx instead of param
.if (rcx && rax == rcx)              ;no need to save rcx in the shadow space
000000013FBC1060 48 23 C9             and         rcx,rcx 
000000013FBC1063 74 0E                je          test1+13h (013FBC1073h) 
000000013FBC1065 48 3B C1             cmp         rax,rcx 
000000013FBC1068 75 09                jne         test1+13h (013FBC1073h) 
    mov rdx,1
000000013FBC106A 48 C7 C2 01 00 00 00 mov         rdx,1 
.else
000000013FBC1071 EB 03                jmp         test1+16h (013FBC1076h) 
    mov rax,rcx
000000013FBC1073 48 8B C1             mov         rax,rcx 
.endif
ret
000000013FBC1076 C3                   ret 
test1 ENDP


this version has invoke in it:

test PROC  FRAME USES rbx rbp rdi   a : QWORD   
000000013FD81128 48 89 4C 24 08       mov         qword ptr [rsp+8],rcx   ;a param is used so it's saved
000000013FD8112D 48 89 5C 24 10       mov         qword ptr [rsp+10h],rbx ;store rbx in unused space
000000013FD81132 48 89 6C 24 18       mov         qword ptr [rsp+18h],rbp ;store rbp in unused space
000000013FD81137 57                   push        rdi ;even there is still free space push it for alignment
000000013FD81138 48 83 EC 60          sub         rsp,60h  ;40h for vars + 20h for shadows
LOCAL var    : XMMWORD                            ;16 byte vars at the beginning off locals
LOCAL var1   : XMMWORD                            ;are aligned to 16 byte
LOCAL other  : DWORD 
LOCAL other1 : QWORD     
LOCAL other2 : QWORD
LOCAL var2   : WORD   
LOCAL var3   : BYTE
    mov rax,-1
000000013FD8113C 48 C7 C0 FF FF FF FF mov         rax,0FFFFFFFFFFFFFFFFh 
    mov a,rax
000000013FD81143 48 89 44 24 70       mov         qword ptr [a],rax 
    invoke function,5                 
000000013FD81148 48 C7 C1 05 00 00 00 mov         rcx,5 
000000013FD8114F E8 0C FF FF FF       call        function (013FD81060h) 
    movaps var, xmm1             
000000013FD81154 0F 29 4C 24 20       movaps      xmmword ptr [var],xmm1 
    movaps var1, xmm2       
000000013FD81159 0F 29 54 24 30       movaps      xmmword ptr [var1],xmm2 
    mov other1, -1   
000000013FD8115E 48 C7 44 24 44 FF FF FF FF mov         qword ptr [other1],0FFFFFFFFFFFFFFFFh 
    mov other2, -2   
000000013FD81167 48 C7 44 24 50 FE FF FF FF mov         qword ptr [other2],0FFFFFFFFFFFFFFFEh 
    inc rax
000000013FD81170 48 FF C0             inc         rax 
    mov other2, rax
000000013FD81173 48 89 44 24 50       mov         qword ptr [other2],rax 
    inc rax 
000000013FD81178 48 FF C0             inc         rax 
    mov other2, rax
000000013FD8117B 48 89 44 24 50       mov         qword ptr [other2],rax 
    mov rax, 1111111h
000000013FD81180 48 C7 C0 11 11 11 01 mov         rax,1111111h 
    movd xmm1, rax
000000013FD81187 66 48 0F 6E C8       movd        xmm1,rax 
    shl rax, 1 
000000013FD8118C 48 D1 E0             shl         rax,1 
    .for (rax=rcx¦r8¦rcx++, r8--)
000000013FD8118F 48 8B C1             mov         rax,rcx 
000000013FD81192 4D 23 C0             and         r8,r8 
000000013FD81195 74 0A                je          test+79h (013FD811A1h) 
      mov[rcx], dl
000000013FD81197 88 11                mov         byte ptr [rcx],dl 
    .endfor
000000013FD81199 48 FF C1             inc         rcx  ;note how it uses INC here instead of ADD RCX,1
    .endfor
000000013FD8119C 49 FF C8             dec         r8  ;note how it uses DEC here instead of SUB r8,1
000000013FD8119F EB F1                jmp         test+6Ah (013FD81192h) 
    ret   
000000013FD811A1 48 83 C4 60          add         rsp,60h 
000000013FD811A5 5F                   pop         rdi 
000000013FD811A6 48 8B 5C 24 10       mov         rbx,qword ptr [rsp+10h] 
000000013FD811AB 48 8B 6C 24 18       mov         rbp,qword ptr [rsp+18h] 
000000013FD811B0 C3                   ret 
test ENDP


and the version without invoke:

test PROC  FRAME USES rbx rbp rdi   a : QWORD
000000013F4C1128 48 89 4C 24 08       mov         qword ptr [rsp+8],rcx 
000000013F4C112D 48 89 5C 24 10       mov         qword ptr [var1],rbx 
000000013F4C1132 48 89 6C 24 18       mov         qword ptr [rsp+18h],rbp 
000000013F4C1137 57                   push        rdi 
000000013F4C1138 48 83 EC 40          sub         rsp,40h  ;only space for locals alocated
LOCAL var    : XMMWORD                                     ;because no invoke calls
LOCAL var1   : XMMWORD
LOCAL other  : DWORD
LOCAL other1 : QWORD
LOCAL other2 : QWORD
LOCAL var2   : WORD
LOCAL var3   : BYTE
mov rax, -1
000000013F4C113C 48 C7 C0 FF FF FF FF mov         rax,0FFFFFFFFFFFFFFFFh 
mov a, rax
000000013F4C1143 48 89 44 24 50       mov         qword ptr [a],rax 
   ; invoke test1, 5
    movaps var, xmm1             
000000013F4C1148 0F 29 0C 24          movaps      xmmword ptr [rsp],xmm1 
    movaps var1, xmm2       
000000013F4C114C 0F 29 54 24 10       movaps      xmmword ptr [var1],xmm2 
    mov other1, -1   
000000013F4C1151 48 C7 44 24 24 FF FF FF FF mov         qword ptr [other1],0FFFFFFFFFFFFFFFFh 
    mov other2, -2   
000000013F4C115A 48 C7 44 24 30 FE FF FF FF mov         qword ptr [other2],0FFFFFFFFFFFFFFFEh 
    inc rax
000000013F4C1163 48 FF C0             inc         rax 
    mov other2, rax
000000013F4C1166 48 89 44 24 30       mov         qword ptr [other2],rax 
    inc rax 
000000013F4C116B 48 FF C0             inc         rax 
    mov other2, rax
000000013F4C116E 48 89 44 24 30       mov         qword ptr [other2],rax 
    mov rax, 1111111h
000000013F4C1173 48 C7 C0 11 11 11 01 mov         rax,1111111h 
    movd xmm1, rax
000000013F4C117A 66 48 0F 6E C8       movd        xmm1,rax 
    shl rax, 1 
000000013F4C117F 48 D1 E0             shl         rax,1 
    .for (rax=rcx¦r8¦rcx++, r8--)
000000013F4C1182 48 8B C1             mov         rax,rcx 
000000013F4C1185 4D 23 C0             and         r8,r8 
000000013F4C1188 74 0A                je          test+6Ch (013F4C1194h) 
      mov[rcx], dl
000000013F4C118A 88 11                mov         byte ptr [rcx],dl 
    .endfor
000000013F4C118C 48 FF C1             inc         rcx 
000000013F4C118F 49 FF C8             dec         r8 
    .endfor
000000013F4C1192 EB F1                jmp         test+5Dh (013F4C1185h) 
    ret   
000000013F4C1194 48 83 C4 40          add         rsp,40h 
000000013F4C1198 5F                   pop         rdi 
000000013F4C1199 48 8B 5C 24 10       mov         rbx,qword ptr [var1] 
000000013F4C119E 48 8B 6C 24 18       mov         rbp,qword ptr [rsp+18h] 
000000013F4C11A3 C3                   ret 
test ENDP


the folder contains JWasm binaries and an MSVC13 Express solution
it also contains the changed sources
because of the forum's upload limit I couldn't upload the complete sources
so if someone wants to build it, they must download the sources from Japheth's site and then add the missing source files
headers are all there so you don't need to touch the H folder

here is the folder:

Title: Re: JWasm212pre reloaded
Post by: habran on April 27, 2014, 07:34:29 PM
for people who are not familiar with .for - .endfor hll
can check here (http://masm32.com/board/index.php?topic=402.0) :biggrin:
Title: Re: JWasm212pre reloaded
Post by: habran on April 27, 2014, 11:07:34 PM
To be able to build it with the VC13 solution you'll have to make a small change to your include path
From the menu choose Project->JWasm Property Pages
C/C++->General->Additional Include Directories
and change the path from C:\Users\Hn\Desktop\JWasm2014 - OK\H
to your H folder ;)
Title: Re: JWasm212pre reloaded
Post by: Adamanteus on April 30, 2014, 08:33:37 PM
Such line in macro, with REG stack variable gives result without and operation :
Code (asm) Select

if OPATTR (&REG) and 00000001b
Title: Re: JWasm212pre reloaded
Post by: habran on May 01, 2014, 05:51:39 AM
Hi Adamanteus
you should probably use:
  if OPATTR (REG)  :biggrin:
Title: Re: JWasm212pre reloaded
Post by: Adamanteus on May 01, 2014, 06:26:48 AM
No - REG that's macro argument and need refference on what passed, than check if it constant by and operation.
Title: Re: JWasm212pre reloaded
Post by: habran on May 01, 2014, 09:06:58 AM
the best thing is to try to assemble it with ml.exe
and see what will happen ;)
Title: Re: JWasm212pre reloaded
Post by: habran on May 06, 2014, 06:24:43 PM
I have uploaded above a new version of JWasm with some fixes :biggrin:
the problem was in the flow of .for-.endfor hll
it was working fine if no .CONTINUE was used
because .CONTINUE is a jump to the start of the loop it would skip altering the counters
I have redesigned it so that now increasing counters happens straight after the start
here is a simple example:

;-----------------------------------------------------------------------
; xstrlenA - length of a zero-terminated byte string.
; In:  rcx = wpString (the code reads the argument straight from rcx,
;            it never touches the wpString stack slot); may be 0
; Out: rax = number of bytes before the terminating zero, 0 if rcx is 0
;-----------------------------------------------------------------------
xstrlenA PROC FRAME wpString:PTR CHAR
xor rax,rax                                ;default result: 0 for a null pointer
  .if (rcx)
    .for (rax=rcx¦BYTE PTR[rax]¦rax++)     ;rax walks forward from the start until it points at the zero byte
    .endfor                                ;empty body: all the work is done by the .for header
    sub rax,rcx                            ;length = address of the terminator - start address
  .endif
  ret
xstrlenA ENDP
;;//----------------------------------------------------------------------
xstrlenA PROC FRAME wpString:PTR CHAR
xor rax,rax
000000013F884A30 48 33 C0             xor         rax,rax 
  .if (rcx)
000000013F884A33 48 23 C9             and         rcx,rcx 
000000013F884A36 74 12                je          xstrlenA+1Ah (013F884A4Ah) 
    .for (rax=rcx¦BYTE PTR[rax]¦rax++)
000000013F884A38 48 8B C1             mov         rax,rcx 
000000013F884A3B EB 03                jmp         xstrlenA+10h (013F884A40h) 
000000013F884A3D 48 FF C0             inc         rax 
000000013F884A40 80 38 00             cmp         byte ptr [rax],0 
000000013F884A43 74 02                je          xstrlenA+17h (013F884A47h) 
    .endfor
000000013F884A45 EB F6                jmp         xstrlenA+0Dh (013F884A3Dh) 
    sub rax,rcx
000000013F884A47 48 2B C1             sub         rax,rcx 
  .endif
  ret
000000013F884A4A C3                   ret
xstrlenA ENDP 
000000013F884A4B
Title: Re: JWasm212pre reloaded
Post by: habran on May 14, 2014, 02:26:36 PM
I have built a 64-bit JWasm.exe with CodeBlocks 13.12 using the GCC compiler
and was surprised how easy it imports MSVC solutions :icon_eek:
The CodeBlocks 13.12 actually works like a charm :t
only thing I had to do is to remove JWasm resource from the project
here it is:
Title: Re: JWasm212pre reloaded
Post by: KeepingRealBusy on July 14, 2014, 05:36:09 AM
Quote from: habran on May 06, 2014, 06:24:43 PM
because .CONTINUE is a jump to the start off the loop it would skip altering counters
I have redesigned it so that now increasing counters happens straight after the start


Habran,

Won't that cause a different problem because the counters will represent the iteration to be executed next instead of the current iteration? If the counters are referenced inside of the "for" loop, then they would have the wrong value. The way to do this, IMHO, is to put the counter increment just in front of the loop start and jump to that for either a endfor or for a continue, and during initialization, jump around this increment code to go to the start point (or initialize the counters by decrementing them during initialization, and then just fall through to the increment code).

Dave.

Title: Re: JWasm212pre reloaded
Post by: habran on July 14, 2014, 06:21:38 AM
You are right Dave, that's what I have done but I expressed myself wrongly
I meant to say above the START label
thank you to point that out, that means that you have really given a thought about it

that also means that you can understand how advanced is this version of JWasm

it was a hard work to bring it to this level
however, I confess that I enjoyed it immensely :biggrin: 
Title: Re: JWasm212pre reloaded
Post by: KeepingRealBusy on July 14, 2014, 06:34:45 AM
Habran,

If I had looked at the code itself, I would have seen this. I was just commenting on the explanation (I have never made a mistake  anything like that myself  :lol:).

Dave.
Title: Re: JWasm212pre reloaded
Post by: habran on July 14, 2014, 07:06:34 AM
I forgive you Dave :P
Title: Re: JWasm212pre reloaded
Post by: habran on August 27, 2014, 09:26:49 PM
I have uploaded a new file at the top with some improvements in  .for - .endfor hll
and a new feature in invoke macro
what I actually added is a change from 'mov reg, 0' to 'xor reg,reg'

file: invoke.c
line:497

        else{
          /* v2.12 added by habran : if parametar  is zero use 'xor reg,reg' instead of 'mov reg,0' */
          if ((!strcmpi(paramvalue, "0") || (!strcmpi(paramvalue, "NULL")) || (!strcmpi(paramvalue, "FALSE"))))  {
            if (ms64_regs[index + base] > T_R9D) index -= 4;
            AddLineQueueX(" xor %r, %r", ms64_regs[index + base], ms64_regs[index + base]);
            return(1);
          }
          else
            AddLineQueueX(" mov %r, %s", ms64_regs[index + base], paramvalue);
        }
        *regs_used |= ( 1 << ( index + RPAR_START ) );
        DebugMsg1(("ms64_param(%s, param=%u): size=%u flags=%X\n", proc->sym.name, index, size, *regs_used ));
    }
    return( 1 );
}

as you can see I skipped checking if register is used so that we can use it again in the same call
otherwise the assembler would throw an error that the register is already used
EG:
invoke testproc5, rdi,FALSE,NULL, 0,rdx, rdx

I would appreciate  a feedback

if there are no zeros among the first four params you can use the ZERO MACRO posted in this forum
Title: Re: JWasm212pre reloaded
Post by: Gunther on August 28, 2014, 02:20:37 AM
Hi habran,

Quote from: habran on August 27, 2014, 09:26:49 PM
I have uploaded a new file at the top with some improvements in  .for - .endfor hll
and a new feature in invoke macro
what I actually added is a change from 'move reg, 0' to 'xor reg,reg'

is it the current file under the first post?

Gunther
Title: Re: JWasm212pre reloaded
Post by: habran on August 28, 2014, 05:49:40 AM
Hi Gunther,
Yes, and I also changed the one compiled with CodeBlocks
The JWasm under the first post was compiled with MSVC13 Express, which is free for commercial use
so feel free to work with it
Title: Re: JWasm212pre reloaded
Post by: Gunther on August 29, 2014, 02:03:55 AM
Hi habran,

Quote from: habran on August 28, 2014, 05:49:40 AM
Yes, and I also changed the one compiled with CodeBlocks
The JWasm under the first post was compiled with MSVC13 Expres , which is free for commercial use
so fill free to work with it

good to know, thank you.  :t

Another point. In that thread (http://masm32.com/board/index.php?topic=3227.msg35416#msg35416) qWord did note that jWasm's current AVX implementation is buggy. Did you fix that?

Gunther
Title: Re: JWasm212pre reloaded
Post by: habran on August 29, 2014, 06:13:11 AM
Thanks Gunther :biggrin:

I didn't fix AVX yet, I was hoping that Japhet will return (my laptop doesn't support AVX2)

I will look if I can fix it and if I can, I'll upload it here

Title: Re: JWasm212pre reloaded
Post by: Gunther on August 29, 2014, 05:51:06 PM
Hi habran,

Quote from: habran on August 29, 2014, 06:13:11 AM
I didn't fix AVX yet, I was hoping that Japhet will return (my laptop doesn't support AVX2)

I will look if I can fix it and if I can, I'll upload it here

that would be great. Are you in contact with Andreas (Japhet)? We miss him.

Gunther
Title: Re: JWasm212pre reloaded
Post by: habran on August 30, 2014, 05:52:33 AM
Unfortunately I am not in contact with Japhet
As usual bad news never comes alone:
AFAIK JWasm doesn't support AVX2 yet :(
Title: Re: JWasm212pre reloaded
Post by: Gunther on August 31, 2014, 09:11:35 AM
Hi habran,

Quote from: habran on August 30, 2014, 05:52:33 AM
Unfortunately I am not in contact with Japhet
As usual bad news never comes alone:
AFAIK JWasm doesn't support AVX2 yet :(

not so bad, because it is only supported by Haswell, and AVX512 doesn't exist in hardware at the moment.

Gunther
Title: Re: JWasm212pre reloaded
Post by: habran on August 31, 2014, 10:33:29 AM
If I knew that Japhet will not return soon to JWasm I would try to add AVX2
however, I would need to change whole concept in Jwasm, because JWasm doesn't support 4 parameters yet
I would probably be able to do that with a lot of sweat because I would have to study how to implement it,
while for Japhet it would be a chickenshit :biggrin:
Even if I succeed to do that Japhet could refuse to accept it, you know him, he is very fussy ;)
Title: Re: JWasm212pre reloaded
Post by: Gunther on August 31, 2014, 07:39:46 PM
Hi habran,

Quote from: habran on August 31, 2014, 10:33:29 AM
Even if I succeed to do that Japhet could refuse to accept it, you know him, he is very fussy ;)

yes he is.  :lol:

Gunther
Title: Re: JWasm212pre reloaded
Post by: habran on September 23, 2014, 10:54:06 PM
I have uploaded a new version of JWasm at the top of this thread
The reason is because I redesigned .for loop
Now it works faster and it is better than ever and also better than MSVC version

here you can see the difference:

this is from before, which is the same design as MSVC
    .for (rax=rcx¦BYTE PTR[rax]¦rax++)
000000013F884A38 48 8B C1             mov         rax,rcx 
000000013F884A3B EB 03                jmp         LSKIP (013F884A40h) 
000000013F884A3D 48 FF C0     LSTART: inc         rax 
000000013F884A40 80 38 00     LSKIP:  cmp         byte ptr [rax],0 
000000013F884A43 74 02                je          LEXIT (013F884A47h) 
    .endfor
000000013F884A45 EB F6                jmp         LSTART (013F884A3Dh) 
000000013F884A47              LEXIT:

this is the new redesigned version:
    .for (rax=rcx¦BYTE PTR[rax]¦rax++)
000000013F03119E 48 8B C1             mov         rax,rcx 
000000013F0311A1 EB 03                jmp         LSKIP (013F0311A6h) 
    .endfor
000000013F0311A3 48 FF C0    LSTART:  inc         rax 
000000013F0311A6 80 38 00    LSKIP:   cmp         byte ptr [rax],0 
000000013F0311A9 75 F8                jne         xstrlenA+0Dh (013F0311A3h) 
000000013F0311AB 



the .CONTINUE has got additional label to jump to

here is an example:

  .for (¦WORD PTR[rcx] >= 48 && WORD PTR[rcx] <= 57¦rcx+=2)
000000013F1111A5 EB 23                jmp         LSKIP    ;(013F1111CAh) 
  movzx eax, WPTR [rcx]
000000013F1111A7 0F B7 01    LSTART:  movzx       eax,word ptr [rcx] 
    sub eax, 48
000000013F1111AA 83 E8 30             sub         eax,30h 
    .if (WORD PTR[rcx] == 43)
000000013F1111AD 66 83 39 2B          cmp         word ptr [rcx],2Bh 
000000013F1111B1 75 08                jne         main+25h (013F1111BBh) 
    mov r10d,TRUE
000000013F1111B3 41 BA 01 00 00 00    mov         r10d,1 
      .continue
000000013F1111B9 EB 0B                jmp         LCONT       ;(013F1111C6h) 
    .endif
    movsxd rdx, eax
000000013F1111BB 48 63 D0             movsxd      rdx,eax 
    lea rax, QWORD PTR [r8+r8*4]
000000013F1111BE 4B 8D 04 80          lea         rax,[r8+r8*4] 
    lea r8, QWORD PTR [rdx+rax*2]
000000013F1111C2 4C 8D 04 42          lea         r8,[rdx+rax*2] 
  .endfor
000000013F1111C6 48 83 C1 02  LCONT:  add         rcx,2 
000000013F1111CA 66 83 39 30  LSKIP:  cmp         word ptr [rcx],30h 
000000013F1111CE 72 06                jb          main+40h (013F1111D6h) 
000000013F1111D0 66 83 39 39          cmp         word ptr [rcx],39h 
000000013F1111D4 76 D1                jbe         main+11h (013F1111A7h) 
000000013F1111D6


and here is one function to see a complete work of this version of JWasm:

;-----------------------------------------------------------------------
; UTF16toUTF32 - convert UTF-16 code units to UTF-32 code points.
; ABI:  Microsoft x64 (first four arguments in rcx, rdx, r8, r9; the
;       fifth, nTargetMax, is read from the stack)
; In:   rcx = pSource      source buffer of WORDs
;       rdx = nSourceLen   number of WORDs in the source
;       r8  = nSourceDone  optional (may be 0): receives the number of
;                          source WORDs consumed
;       r9  = szTarget     optional (may be 0): destination DWORD buffer
;       nTargetMax         capacity of szTarget in DWORDs; when both
;                          szTarget and nTargetMax are 0 the code points
;                          are only counted
; Out:  rax = number of DWORDs written (or counted when szTarget is 0)
; Regs: rsi = read cursor, r13 = end of source,
;       rdi = write cursor, r12 = end of destination,
;       ebx = current code point
;
; NOTE: this revision fixes three defects of the version whose generated
;       listing is shown further down in this thread (that listing still
;       reflects the old code):
;         * the store used to happen AFTER advancing rdi, so szTarget[0]
;           was never written and the last store went one DWORD past
;           the limit checked by the loop condition;
;         * count-only mode advanced rdi by 2 although the result is
;           divided by 4, which halved the returned count;
;         * a high surrogate not followed by a low surrogate also
;           swallowed the following (possibly valid) code unit.
;-----------------------------------------------------------------------
UTF16toUTF32 PROC FRAME USES rbx rbp rdi rsi r12 r13 pSource:PTR WORD,nSourceLen:UINT_PTR,nSourceDone:PTR UINT_PTR,szTarget:PTR DWORD,nTargetMax:UINT_PTR

  mov rsi,rcx            ;rsi = pSource (read cursor)
  lea r13,[rsi+rdx*2]    ;r13 = pSource + nSourceLen * 2 (end of source)
  mov rbp,nTargetMax     ;rbp = nTargetMax (stack argument)
  .if (!r9 && !rbp)      ;!szTarget && !nTargetMax -> count only
    xor edi,edi          ;rdi = 0, so rdi - r9 below is a plain byte count
    mov r12,MAXUINT_PTR  ;no destination limit (0ffffffffffffffffh on 64 bit)
  .else
    mov rdi,r9           ;write cursor = szTarget
    lea r12,[rdi+rbp*4]  ;end of destination = szTarget + nTargetMax * 4
  .endif
  .for (¦rsi < r13 && rdi < r12¦rsi+=2)
    movzx ebx,WORD PTR[rsi]
    ;//Surrogate pair. High surrogate.
    .if (ebx >= 0d800h && ebx <= 0dbffh)
      add rsi,2
      .if (rsi >= r13)
        sub rsi,2        ;truncated pair: leave the high surrogate unconsumed
        .break
      .endif
      ;//Low surrogate
      .if (WORD PTR[rsi] >= 0dc00h && WORD PTR[rsi] <= 0dfffh)
        sub ebx,0d800h
        shl ebx,10
        movzx rax,WORD PTR[rsi]
        ;2400h = 10000h - 0dc00h: pair base plus low-surrogate bias in one step
        lea ebx, DWORD PTR [rbx+rax+2400h]
      .else
        ;unpaired high surrogate: drop only that unit. Step back so the
        ;loop increment lands on the unit that is not a low surrogate
        ;and it gets decoded on the next iteration.
        sub rsi,2
        .continue
      .endif
    .endif
    .if (r9)               ;szTarget
      mov [rdi],ebx        ;store first, then advance
    .endif
    add rdi,4              ;one DWORD per code point in both modes
  .endfor
  .if (r8)                 ;nSourceDone requested
    mov rax,rsi
    sub rax,rcx            ;bytes consumed (rcx still holds pSource)
    sar rax,1              ;bytes -> WORDs
    mov [r8],rax
  .endif
  mov rax,rdi
  sub rax,r9               ;bytes produced (r9 is 0 in count-only mode)
  sar rax,2                ;bytes -> DWORDs
  ret
UTF16toUTF32 ENDP

produces this:

UTF16toUTF32 PROC FRAME USES rbx rbp rdi rsi r12 r13 pSource:PTR WORD,nSourceLen:UINT_PTR,nSourceDone:PTR UINT_PTR,szTarget:PTR DWORD,nTargetMax:UINT_PTR
000000013F62B41E 48 89 5C 24 10       mov         qword ptr [rsp+10h],rbx 
000000013F62B423 48 89 6C 24 18       mov         qword ptr [rsp+18h],rbp 
000000013F62B428 48 89 7C 24 20       mov         qword ptr [pSource],rdi 
000000013F62B42D 56                   push        rsi 
000000013F62B42E 41 54                push        r12 
000000013F62B430 41 55                push        r13 
  ;//rsi=pSource
  ;//r13 =PTR to WORD pSource + nSourceLen * 2 (end of source)
  ;//rdi = pointer to DWORD destinu
  ;//r12 = end of DWORD destinu
  ;//ebx = DWORD cahr
 
  mov rsi,rcx            ;rsi=pSource
000000013F62B432 48 8B F1             mov         rsi,rcx 
  lea r13,[rsi+rdx*2]    ;r13=pSource + nSourceLen * 2
000000013F62B435 4C 8D 2C 56          lea         r13,[rsi+rdx*2] 
  mov rbp,nTargetMax     ;nTargetMax in rbp
000000013F62B439 48 8B 6C 24 40       mov         rbp,qword ptr [nTargetMax] 
  .if (!r9 && !rbp)      ;!szTarget && !nTargetMax
000000013F62B43E 4D 85 C9             test        r9,r9 
000000013F62B441 75 10                jne         UTF16toUTF32+35h (013F62B453h) 
000000013F62B443 48 85 ED             test        rbp,rbp 
000000013F62B446 75 0B                jne         UTF16toUTF32+35h (013F62B453h) 
    xor edi,edi          ;clear rdi
000000013F62B448 33 FF                xor         edi,edi 
    mov r12,MAXUINT_PTR  ;0ffffffffh
000000013F62B44A 49 C7 C4 FF FF FF FF mov         r12,0FFFFFFFFFFFFFFFFh 
  .else
000000013F62B451 EB 07                jmp         UTF16toUTF32+3Ch (013F62B45Ah) 
    mov rdi,r9           ;destinu = szTarget
000000013F62B453 49 8B F9             mov         rdi,r9 
    lea r12,[rdi+rbp*4]  ;end of DWORD destinu
000000013F62B456 4C 8D 24 AF          lea         r12,[rdi+rbp*4] 
  .endif
  .for (¦rsi < r13 && rdi < r12¦rsi+=2)
000000013F62B45A EB 5D                jmp         UTF16toUTF32+9Bh (013F62B4B9h) 
    movzx ebx,WORD PTR[rsi]
000000013F62B45C 0F B7 1E             movzx       ebx,word ptr [rsi] 
    ;//Surrogate pair. High surrogate.
    .if (ebx >= 0d800h && ebx <= 0dbffh)
000000013F62B45F 81 FB 00 D8 00 00    cmp         ebx,0D800h 
000000013F62B465 72 3D                jb          UTF16toUTF32+86h (013F62B4A4h) 
000000013F62B467 81 FB FF DB 00 00    cmp         ebx,0DBFFh 
000000013F62B46D 77 35                ja          UTF16toUTF32+86h (013F62B4A4h) 
      add rsi,2
000000013F62B46F 48 83 C6 02          add         rsi,2 
      .if (rsi >= r13)
000000013F62B473 49 3B F5             cmp         rsi,r13 
000000013F62B476 72 06                jb          UTF16toUTF32+60h (013F62B47Eh) 
        sub rsi,2
000000013F62B478 48 83 EE 02          sub         rsi,2 
        .break
000000013F62B47C EB 45                jmp         UTF16toUTF32+0A5h (013F62B4C3h) 
      .endif
      ;//Low surrogate
      .if (WORD PTR[rsi] >= 0dc00h && WORD PTR[rsi] <= 0dfffh)
000000013F62B47E 66 81 3E 00 DC       cmp         word ptr [rsi],0DC00h 
000000013F62B483 72 1D                jb          UTF16toUTF32+84h (013F62B4A2h) 
000000013F62B485 66 81 3E FF DF       cmp         word ptr [rsi],0DFFFh 
000000013F62B48A 77 16                ja          UTF16toUTF32+84h (013F62B4A2h) 
        sub ebx,0d800h
000000013F62B48C 81 EB 00 D8 00 00    sub         ebx,0D800h 
        shl ebx,10
000000013F62B492 C1 E3 0A             shl         ebx,0Ah 
        movzx rax,WORD PTR[rsi]
000000013F62B495 48 0F B7 06          movzx       rax,word ptr [rsi] 
        lea ebx, DWORD PTR [rbx+rax+2400h]
000000013F62B499 8D 9C 18 00 24 00 00 lea         ebx,[rax+rbx+2400h] 
      .else
000000013F62B4A0 EB 02                jmp         UTF16toUTF32+86h (013F62B4A4h) 
        .continue
000000013F62B4A2 EB 11                jmp         UTF16toUTF32+97h (013F62B4B5h) 
      .endif
    .endif
    .if (r9)               ;szTarget
000000013F62B4A4 4D 85 C9             test        r9,r9 
000000013F62B4A7 74 08                je          UTF16toUTF32+93h (013F62B4B1h) 
      add rdi,4
000000013F62B4A9 48 83 C7 04          add         rdi,4 
      mov [rdi],ebx
000000013F62B4AD 89 1F                mov         dword ptr [rdi],ebx 
    .else
000000013F62B4AF EB 04                jmp         UTF16toUTF32+97h (013F62B4B5h) 
      add rdi,2
000000013F62B4B1 48 83 C7 02          add         rdi,2 
    .endif
  .endfor
000000013F62B4B5 48 83 C6 02          add         rsi,2 
000000013F62B4B9 49 3B F5             cmp         rsi,r13 
000000013F62B4BC 73 05                jae         UTF16toUTF32+0A5h (013F62B4C3h) 
000000013F62B4BE 49 3B FC             cmp         rdi,r12 
000000013F62B4C1 72 99                jb          UTF16toUTF32+3Eh (013F62B45Ch) 
  .if (r8)
000000013F62B4C3 4D 85 C0             test        r8,r8 
000000013F62B4C6 74 0C                je          UTF16toUTF32+0B6h (013F62B4D4h) 
    mov rax,rsi
000000013F62B4C8 48 8B C6             mov         rax,rsi 
    sub rax,rcx
000000013F62B4CB 48 2B C1             sub         rax,rcx 
    sar rax,1
000000013F62B4CE 48 D1 F8             sar         rax,1 
    mov [r8],rax
000000013F62B4D1 49 89 00             mov         qword ptr [r8],rax 
  .endif
  mov rax,rdi
000000013F62B4D4 48 8B C7             mov         rax,rdi 
  sub rax,r9
000000013F62B4D7 49 2B C1             sub         rax,r9 
  sar rax,2
000000013F62B4DA 48 C1 F8 02          sar         rax,2 
  ret
000000013F62B4DE 41 5D                pop         r13 
000000013F62B4E0 41 5C                pop         r12 
000000013F62B4E2 5E                   pop         rsi 
000000013F62B4E3 48 8B 5C 24 10       mov         rbx,qword ptr [rsp+10h] 
000000013F62B4E8 48 8B 6C 24 18       mov         rbp,qword ptr [rsp+18h] 
000000013F62B4ED 48 8B 7C 24 20       mov         rdi,qword ptr [pSource] 
000000013F62B4F2 C3                   ret 
UTF16toUTF32 ENDP