Author Topic: Asmc source and binaries  (Read 13152 times)

nidud

  • Member
  • *****
  • Posts: 1565
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #75 on: May 27, 2018, 07:26:06 PM »
Microsoft sample using __ImageBase.
Code: [Select]
    page    ,132
    title   memset - set sections of memory all to one byte
;***
;memset.asm - set a section of memory to all one byte
;
;   Copyright (c) Microsoft Corporation. All rights reserved.
;
;Purpose:
;   contains the memset() routine
;
;*******************************************************************************

include ksamd64.inc
        subttl  "memset"
;***
;char *memset(dst, value, count) - sets "count" bytes at "dst" to "value"
;
;Purpose:
;   Sets the first "count" bytes of the memory starting
;   at "dst" to the character value "value".
;
;   Algorithm:
;   char *
;   memset (dst, value, count)
;       char *dst;
;       char value;
;       size_t count;
;       {
;       char *start = dst;
;
;       while (count--)
;           *dst++ = value;
;       return(start);
;       }
;
;Entry:
;   char *dst - pointer to memory to fill with value
;   char value - value to put in dst bytes
;   size_t count - number of bytes of dst to fill
;
;Exit:
;   returns dst, with filled bytes
;
;Uses:
;
;Exceptions:
;
;*******************************************************************************
    extrn   __favor:dword
    extrn   __ImageBase:byte

__FAVOR_ENFSTRG equ 1                   ; __favor bit: favor enhanced rep string ops
__FAVOR_SMSTRG  equ 2                   ; __favor bit: favor string ops for small sets

        LEAF_ENTRY_ARG3 memset, _TEXT, buf:ptr byte, value:byte, count:dword

        OPTION PROLOGUE:NONE, EPILOGUE:NONE

        mov     r11, rcx                        ; save destination address
        movzx   edx, dl                         ; set fill pattern
        mov     r9, 0101010101010101h           ; replicate fill over 8 bytes
        imul    r9, rdx
        cmp     r8, 16
        jbe     SetBytes16                      ; if count <= 16 use GPR stores to set
        movd    xmm0, r9                        ; bytes to store in bits [0:63]
        punpcklbw xmm0, xmm0                    ; dup bytes to [127:64]

;
; Check if strings should be used
;
        cmp     r8, 128                         ; is this a small set, size <= 128?
        ja      XmmSet                          ; if large set, use XMM set
        bt      __favor, __FAVOR_SMSTRG         ; check if string set should be used
        jnc     XmmSetSmall                     ; otherwise, use a 16-byte block set

StringSet:
        mov     eax, edx                        ; set byte to move
        mov     rdx, rdi                        ; save rdi
        mov     rdi, rcx                        ; set destination
        mov     rcx, r8                         ; set count
        rep     stosb                           ; store the bytes
        mov     rdi, rdx                        ; restore rdi
        mov     rax, r11                        ; return original destination
        ret                                     ; return

;
; Fill using SSE instructions - size must be 16 or more.
;
        ; xmm0 has the byte to store replicated to all byte positions
        ; rcx has the destination, can be overwritten
        ; r11 has the destination, must be preserved for return value
        ; r8  has the count
        align   16
XmmSet:
        bt      __favor, __FAVOR_ENFSTRG        ; check if string set should be used
        jc      StringSet

        ; Aligned stores are much faster on AMD hardware. We need to do an unaligned
        ; store of (16 - (dest mod 16)) bytes, but it's faster to just store 16 bytes
        ; and then start the aligned loop as usual at ((dest + 16) - (dest mod 16)).
        ; This results in (dest mod 16) bytes being stored twice. This is a lot faster
        ; than a bunch of code to store maybe 8 then maybe 4 then maybe 2 then maybe 1
        ; byte to achieve alignment. It could cause data breakpoints to trigger twice,
        ; but they will hit here first and hopefully you will read this comment.
        ; The fastest way to subtract (16 - (dest mod 16)) from the length is to add
        ; (original dest - aligned dest). This avoids having to calculate the value.

        movups  [rcx], xmm0                     ; store 16 unaligned from start
        add     r8, rcx                         ; r8 = dest + length
        add     rcx, 16                         ; dest = (dest + 16)
        and     rcx, -16                        ; dest = (dest + 16) - (dest mod 16)
        sub     r8, rcx                         ; r8 = remaining length

; Attempt to set 128-byte blocks.
;
XmmSetLarge:
        mov     r9, r8                          ; copy count of bytes remaining
        shr     r9, 7                           ; compute number of 128-byte blocks
        jz      XmmSetSmall                     ; if z, no 128-byte blocks to fill
;
; Set 128-byte blocks
        align   16
XmmSetLargeLoop:
        movaps  0[rcx], xmm0
        movaps  16[rcx], xmm0
        add     rcx, 128                        ; advance destination address early
        movaps  (32 - 128)[rcx], xmm0
        movaps  (48 - 128)[rcx], xmm0
        dec     r9                              ; dec block counter (set cc for jnz)
        movaps  (64 - 128)[rcx], xmm0
        movaps  (80 - 128)[rcx], xmm0
        movaps  (96 - 128)[rcx], xmm0
        movapd  (112 - 128)[rcx], xmm0
        ; to avoid generating a one-byte NOP for the 'align 16' below the previous
        ; instruction is movapd instead of movaps which is one byte longer but
        ; performs exactly the same operation.
        jnz     XmmSetLargeLoop                 ; loop if more blocks

        and     r8, 127                         ; compute remaining byte count
;
; Attempt to set 16-byte blocks
        align   16
XmmSetSmall:
        mov     r9, r8                          ; copy count of bytes remaining
        shr     r9, 4                           ; compute number of 16-byte blocks
        jz      short XmmSetTrailing

        ; This generates an 8-byte nop, which we execute once. This will change only if
        ; any of the code from XmmSetSmallLoop down is modified. The following loop thus
        ; is completely contained within one instruction decode buffer on AMD hardware.
        align   16

;
; Set 16-byte blocks
;
XmmSetSmallLoop:
        movups  [rcx], xmm0
        add     rcx, 16
        dec     r9
        jnz     short XmmSetSmallLoop

XmmSetTrailing:
        and     r8, 15                          ; compute remaining length
        jz      XmmSetReturn                    ; skip over movups if done, we could just do it anyway

        ; As at the start, we are going to do an unaligned store of 16 bytes which will overwrite
        ; some bytes already stored. The math is easier, rcx+r8 is one byte past the end, just
        ; back up 16 from there and store 16.

        movups  [rcx+r8-16], xmm0               ; write remainder, overwriting 16-r8 bytes we already wrote

XmmSetReturn:
        mov     rax, r11                        ; must return original dest that we saved in r11
        ret

;
; Jump table for fills of 16 bytes or fewer. Entries are image-relative
; (IMAGEREL) dwords so the table needs no 64-bit relocations.
;
        align 4
MsetTab dd  IMAGEREL msetTab00
        dd  IMAGEREL msetTab01
        dd  IMAGEREL msetTab02
        dd  IMAGEREL msetTab03
        dd  IMAGEREL msetTab04
        dd  IMAGEREL msetTab05
        dd  IMAGEREL msetTab06
        dd  IMAGEREL msetTab07
        dd  IMAGEREL msetTab08
        dd  IMAGEREL msetTab09
        dd  IMAGEREL msetTab10
        dd  IMAGEREL msetTab11
        dd  IMAGEREL msetTab12
        dd  IMAGEREL msetTab13
        dd  IMAGEREL msetTab14
        dd  IMAGEREL msetTab15
        dd  IMAGEREL msetTab16

        ; Set blocks that are up to 16 bytes long.
        ; Preconditions:
        ; rdx has the byte to fill, zero extended; r9 has it replicated to all 8 bytes
        ; rcx has dest
        ; r8 has len, r8 <= 16
        ; r11 has the dest
        align   16
SetBytes16:
        mov     rdx, r9                         ; rdx = fill byte replicated over 8 bytes
        lea     r9, OFFSET __ImageBase          ; r9 = module base for IMAGEREL fixups
        mov     eax, [(IMAGEREL  MsetTab) + r9 +r8*4] ; eax = image-relative handler address
        add     r9, rax                         ; r9 = absolute handler address
        add     rcx, r8                         ; rcx is now 1 past last byte to set
        mov     rax, r11                        ; set return value
        jmp     r9                              ; dispatch on count (0..16)


        align   16

        ; Code for setting various sized blocks up to 16 bytes long.
        ; preconditions:
        ; rcx points 1 byte beyond end of bytes to set
        ; rax has the correct return value (the original dest)
        ; each byte of the rdx reg is set to the byte to store
msetTab15:
        mov     (-15)[rcx], rdx
        ; fallthrough to 7
msetTab07:
        mov     (-7)[rcx], edx
        ; fallthrough to 3
msetTab03:
        mov     (-3)[rcx], dx
        ; fallthrough to 1
msetTab01:
        mov     (-1)[rcx], dl
msetTab00:
        ret

        align   16
msetTab12:
        mov     (-12)[rcx], rdx
        ; fallthrough to 4
msetTab04:
        mov     (-4)[rcx], edx
        ret

msetTab09:
        mov     (-9)[rcx], rdx
        mov     (-1)[rcx], dl
        ret

        align   16
msetTab13:
        mov     (-13)[rcx], rdx
        ; fallthrough to 5
msetTab05:
        mov     (-5)[rcx], edx
        mov     (-1)[rcx], dl
        ret

        align   16
msetTab14:
        mov     (-14)[rcx], rdx
        ; fallthrough to 6
msetTab06:
        mov     (-6)[rcx], edx
        ; fallthrough to 2
msetTab02:
        mov     (-2)[rcx], dx
        ret

msetTab08:
        mov     [rax], rdx
        ret

msetTab11:
        mov     [rax], rdx
        mov     (8)[rax], dx
        mov     (10)[rax], dl
        ret

        align   16
msetTab10:
        mov     [rax], rdx
        mov     (8)[rax], dx
        ret

msetTab16:
        mov     [rax], rdx
        mov     (8)[rax], rdx
        ret

        LEAF_END memset, _TEXT

    end

jj2007

  • Member
  • *****
  • Posts: 8600
  • Assembler is fun ;-)
    • MasmBasic
Re: Asmc source and binaries
« Reply #76 on: May 27, 2018, 09:18:23 PM »
Let me spare some time -and embarrassment- to the newcomers of Asmc. The help file is located at:

asmc-master\source\asmc\asmc.chm

Is there another version? https://github.com/nidud/asmc/raw/master/source/asmc/asmc.chm looks a bit empty :(

nidud

  • Member
  • *****
  • Posts: 1565
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #77 on: May 27, 2018, 10:17:51 PM »
Think the OS puts a block on the file if you download it directly. Try right-click->property->General and see if there is a "Remove Block" option there.

jj2007

  • Member
  • *****
  • Posts: 8600
  • Assembler is fun ;-)
    • MasmBasic
Re: Asmc source and binaries
« Reply #78 on: May 27, 2018, 10:32:08 PM »
It's not that empty, actually. The table of contents is there, but no content. Do you have a proper installer for AsmC which puts inc and lib files into their proper folders etc?

P.S.: This is odd. I just re-opened asmc-master\source\asmc\asmc.chm, and now it shows content, too ::)
No mentioning of the includes and libraries, though - is there a separate help file? Examples?

nidud

  • Member
  • *****
  • Posts: 1565
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #79 on: May 27, 2018, 10:45:02 PM »
No.

nidud

  • Member
  • *****
  • Posts: 1565
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #80 on: May 27, 2018, 11:50:54 PM »
No mentioning of the includes and libraries, though - is there a separate help file? Examples?

Assuming you have downloaded and unzipped the archive you should have something like this:

.\asmc-master
  bin
  include
  lib
  source
  dz.exe


- the include files are in the include directory
- the lib files are in the lib directory
- the source files are in the source directory

The simplest way to build the import libraries is to run the shell (dz.exe). This will setup the environment for the base directory wherever that may be.

From the shell, enter into the lib directory and select the makefile(s) and hit enter. To build the static libraries browse into the source directory and repeat the above. The examples are in the ./source/test directory.

nidud

  • Member
  • *****
  • Posts: 1565
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #81 on: June 13, 2018, 11:16:23 PM »
Added a few updates in an attempt to build a 64-bit version of Asmc using the 64-bit library. This uses CL.EXE version 18 (Visual Studio 12) by adding the __ImageBase logic as explained above.

I converted a few of the bottleneck-function from the Asmc source, mostly ASCII to binary (numbers) and HASH algos.

The benchmark for the C/ASM implementation:

19532 ClockTicks: Asmc  32-bit asmc-lib 342K
21871 ClockTicks: Asmc  64-bit asmc-lib 448K
28392 ClockTicks: JWasm 32-bit watcom   345K
30327 ClockTicks: Uasm  64-bit windows  844K
40466 ClockTicks: Uasm  32-bit windows  723K
49375 ClockTicks: Asmc  64-bit windows  531K

Uasm uses a newer version of VS so it optimizes better than the C version of Asmc. I made a fatal mistake when installing Windows on the new (lunch) box: I opted for the C/D partitioning with the default size and ended up with a rather small C drive. This in turn prohibits the installation of the newest version of Visual Studio.

Asmc does not use any registers in a .switch unless the option REGAX is added. Some changes are added for the 64-bit version, where R11 is used instead of RAX.

EDIT: REGAX is now set as default if -win64 used and will use R10 and R11 to render the switch.

A direct jump without testing (NOTEST) is not really possible given the code is unknown at the top so the switch will always start with a jump to the test label at the end of the switch. However, it is possible to use the list file to get the name of the label for the jump-table and use this directly:

EDIT: The NOTEST option will now issue a direct jump if used.

Code: [Select]
    .code

; void *memcpy(void *dst [rcx], const void *src [rdx], size_t count [r8])
; Win64 ABI; returns the original dst in rax.
; NOTE(review): the 32-byte head and tail are loaded before any store, and the
; >64-byte loop copies backward when dst > src, so the routine appears safe for
; overlapping regions (hence the shared memmove entry) -- confirm.
memcpy::
memmove::

    mov rax,rcx                 ; rax = dst, the return value

    .if r8 <= 32                ; small copy: dispatch on the exact size

        option switch:table, switch:notest, switch:regax

        .switch r8

          .case 0               ; nothing to copy
            ret

          .case 1               ; single byte
            mov cl,[rdx]
            mov [rax],cl
            ret

          .case 2,3,4           ; first and last WORD; overlap covers the middle
            mov cx,[rdx]
            mov dx,[rdx+r8-2]
            mov [rax+r8-2],dx
            mov [rax],cx
            ret

          .case 5,6,7,8         ; first and last DWORD
            mov ecx,[rdx]
            mov edx,[rdx+r8-4]
            mov [rax+r8-4],edx
            mov [rax],ecx
            ret

          .case 9,10,11,12,13,14,15,16  ; first and last QWORD
            mov rcx,[rdx]
            mov rdx,[rdx+r8-8]
            mov [rax],rcx
            mov [rax+r8-8],rdx
            ret

          .case 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32
            movdqu xmm0,[rdx]           ; first and last 16 bytes
            movdqu xmm1,[rdx+r8-16]
            movups [rax],xmm0
            movups [rax+r8-16],xmm1
            ret
        .endsw
    .endif

    movups xmm2,[rdx]           ; save the 32-byte head ...
    movdqu xmm3,[rdx+16]
    movdqu xmm4,[rdx+r8-16]     ; ... and the 32-byte tail before any store
    movdqu xmm5,[rdx+r8-32]
    .if r8 > 64                 ; more than 64 bytes: aligned 32-byte block loop

        mov ecx,eax             ; ecx = low bits of dst
        neg ecx
        and ecx,32-1            ; ecx = bytes up to the next 32-byte boundary
        add rdx,rcx             ; advance src past the unaligned head
        mov r9,r8
        sub r9,rcx              ; r9 = remaining byte count
        add rcx,rax             ; rcx = 32-byte aligned dst
        and r9b,-32             ; round r9 down to a 32-byte multiple (low 5 bits)

        .if rcx > rdx           ; dst above src: copy backward for overlap safety

            .repeat
                sub r9,32
                movups xmm0,[rdx+r9]
                movups xmm1,[rdx+r9+16]
                movaps [rcx+r9],xmm0        ; aligned 32-byte stores
                movaps [rcx+r9+16],xmm1
            .untilz
            movups [rax],xmm2               ; replay saved head and tail to
            movups [rax+16],xmm3            ; cover the unaligned remainders
            movups [rax+r8-16],xmm4
            movups [rax+r8-32],xmm5
            ret
            db 5 dup(0x90)                  ; NOP padding before the next loop
        .endif

        lea rcx,[rcx+r9]        ; point one past the block region, then
        lea rdx,[rdx+r9]        ; walk a negative index up toward zero
        neg r9
        .repeat
            movups xmm0,[rdx+r9]
            movups xmm1,[rdx+r9+16]
            movaps [rcx+r9],xmm0            ; aligned 32-byte stores
            movaps [rcx+r9+16],xmm1
            add r9,32
        .untilz
    .endif
    movups [rax],xmm2           ; store saved head and tail last; they
    movups [rax+16],xmm3        ; cover any unaligned remainder bytes
    movups [rax+r8-16],xmm4
    movups [rax+r8-32],xmm5
    ret

    end
« Last Edit: June 23, 2018, 08:15:42 AM by nidud »

nidud

  • Member
  • *****
  • Posts: 1565
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #82 on: June 21, 2018, 09:16:32 PM »
Two more switches:
- added command line option /stackalign
- added command line option /autostack

Note: Only the first 4 characters are tested for these including /homeparams.

-home -auto -stac

I also implemented defined() as an inline directive, suggested by Biterider. This will only work if defined is not, well, defined.

Test case:
Code: [Select]
    .486
    .model flat
    .code

; Test cases for the defined() operator in conditional-assembly expressions.

__SYM__ equ 3

if defined(__SYM__) and __SYM__ ge 3    ; defined, value 3: true branch expected
    mov eax,__SYM__
else
    .err <error>
endif

undef __SYM__

if defined(__SYM__) and __SYM__ ge 3    ; undefined: else branch expected
    .err <error>
else
    nop
endif

__SYM__ equ 2
if defined(__SYM__) and __SYM__ eq 2    ; redefined with value 2: true branch
    mov eax,__SYM__
else
    .err <error>
endif

undef __SYM__
__SYM__ equ <>
if defined(__SYM__)                     ; empty text macro still counts as defined
    nop
else
    .err <error>
endif

if defined(__SYM2__) or defined(__SYM__)        ; or: second operand is defined
    nop
else
    .err <error>
endif

if not defined(__SYM2__) and defined(__SYM__)   ; not/and combination: true
    nop
else
    .err <error>
endif

if defined(__SYM2__) and not defined(__SYM__)   ; both operands false
    .err <error>
else
    nop
endif

    end

Source code for the implementation.

nidud

  • Member
  • *****
  • Posts: 1565
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #83 on: June 23, 2018, 08:53:15 AM »
Made some changes to the .switch implementation.

The direct jump logic using NOTEST as explained above is now implemented. The actual jump table in 64-bit is reduced to 32-bit and there is no image relative addressing involved so the code produced may be relocated and still work.

The table now holds the size measured from the .CASE label to the .ENDSW label, as opposed to a QWORD address table resolved by the linker. Reducing it further to 16-bit would limit the total code size of the switch to 64K, which may be more than enough in most cases, but as for now the DWORD size is used.

The minimum code in 32-bit is just one line:

    jmp table[reg*4-(min*4)]


The 64-bit version uses R10 and R11:

    lea r11,l_exit
    mov r10d,[reg*4+r11-(min*4)+(table-l_exit)]
    sub r11,r10
    jmp r11


nidud

  • Member
  • *****
  • Posts: 1565
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #84 on: June 23, 2018, 09:35:43 AM »
Simple test case where the binary of the memcpy routine is loaded into the code segment and executed.

total [1 .. 64], 1++
  1690700 cycles 1.asm: switch
  3493186 cycles 0.asm: msvcrt.dll
hit any key to continue...


The benchmark result also begs the question of whether Microsoft actually uses ml64 at all. I've long suspected they never did.

Caché GB

  • Member
  • **
  • Posts: 58
  • MASM IS HOT
Re: Asmc source and binaries
« Reply #85 on: June 23, 2018, 09:54:38 PM »
Hello nidud.

If it helps, there is a  memcpy routine using SSE by MS.

Here - :\Program Files (x86)\Microsoft Visual Studio 14.0\VC\crt\src\amd64\memcpy.asm
Caché GB's 1 and 0-nly language:MASM

nidud

  • Member
  • *****
  • Posts: 1565
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #86 on: June 23, 2018, 11:42:36 PM »
We have been testing many of these algos for various platforms and I just posted one of them in this thread, but the include files seem to be missing.

However, I don't think you will find any of these algos included in msvcrt.dll.

Vortex

  • Member
  • *****
  • Posts: 1821
Re: Asmc source and binaries
« Reply #87 on: June 24, 2018, 12:49:24 AM »
However, I don't think you will find any of these algos included in msvcrt.dll.

Hi nidud,

Sorry if I am missing something but msvcrt.dll exports memcpy and a lot of traditional C functions :
Code: [Select]
\PellesC\bin\podump.exe /exports C:\WINDOWS\system32\msvcrt.dll
.
.
            ordinal  hint  address   name

            2E5   2E4  77BBD020  malloc
            2E6   2E5  77BAD656  mblen
            2E7   2E6  77BAD7EA  mbstowcs
            2E8   2E7  77BAD8E4  mbtowc
            2E9   2E8  77BD7C70  memchr
            2EA   2E9  77BD7D30  memcmp
            2EB   2EA  77BD7DF0  memcpy
            2EC   2EB  77BD8140  memmove
            2ED   2EC  77BD8490  memset
            2EE   2ED  77BDACF7  mktime
            2EF   2EE  77BE1AA0  modf
            2F0   2EF  77BC7E96  perror
            2F1   2F0  77BDE1C0  pow
            2F2   2F1  77BD27C2  printf
            2F3   2F2  77BD1DB4  putc
            2F4   2F3  77BCFED9  putchar
            2F5   2F4  77BD282D  puts
            2F6   2F5  77BD1FB7  putwc
            2F7   2F6  77BCFF04  putwchar
            2F8   2F7  77BC7FF0  qsort
            2F9   2F8  77BC612E  raise
            2FA   2F9  77BC8273  rand
            2FB   2FA  77BBD0C0  realloc
            2FC   2FB  77BB0A35  remove
            2FD   2FC  77BB1B2C  rename
            2FE   2FD  77BD28DB  rewind
            2FF   2FE  77BD2969  scanf

nidud

  • Member
  • *****
  • Posts: 1565
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #88 on: June 28, 2018, 04:02:51 AM »
Fixed some bugs in name-mangling for vectorcall. The mangler used the global calling convention so mixing languages failed, vectorcall and fastcall in this case. This may still be a problem in 32-bit with regards to fastcall.

Added some test cases using vectorcall for linking with MSVC version 12. The DirectXMath library default to vectorcall so this was used for testing the mangler.

The _mm_ macros should be portable and work on most platforms (IE. 32/64-bit C/std/sys/fast/vector call) when fully implemented but for now only XMM are used in the test cases. The DirectXMath headers are now split into two files, one for function declarations and one for inline macros. The latter (DirectXMath.inl) will have inl_<name> macros corresponded to the function names in DirectXMath.inc.

The mangler tests:

Using VC:
Code: [Select]
#include <stdio.h>

void __vectorcall XMScalarSinCos(float* pSin, float* pCos, float  Value);

/*
 * Mangling test driver: call the vectorcall XMScalarSinCos with 0.5f and
 * print the resulting sine and cosine (widened to double for printf's %f).
 */
int main(void)
{
    float sinOut, cosOut;
    const float angle = 0.5f;

    XMScalarSinCos(&sinOut, &cosOut, angle);

    printf("Sin: %f\nCos: %f\n", (double)sinOut, (double)cosOut);

    return 0;
}

Using Asmc:
Code: [Select]
include stdio.inc
include directxmath.inc

.code

; Mangling test: call the vectorcall XMScalarSinCos from Asmc, then widen the
; two float results to double (printf's %f takes doubles) and print them.
main proc

  local pSin:float, pCos:float, Value:float
  local Sin:double, Cos:double

    mov Value,0.5

    XMScalarSinCos(&pSin, &pCos, Value)

    ; Sin = (double)pSin; Cos = (double)pCos
    _mm_store_sd(Sin, _mm_cvtss_sd(_mm_load_ss(xmm0, pSin)))
    _mm_store_sd(Cos, _mm_cvtss_sd(_mm_load_ss(xmm1, pCos)))

    printf("Sin: %f\nCos: %f\n", Sin, Cos)

    xor eax,eax                 ; return 0
    ret

main endp

    end

Code: [Select]
include stdio.inc
include directxmath.inc

.code

; Mangling test: run XMConvertVectorIntToFloat over a vector of four 5's and
; print the first lane, widened to double for printf's %f conversion.
main proc

  local x:XMUINT4, Value:UINT, result:double

    mov Value,1
    mov x.x,5                   ; fill all four integer lanes with 5
    mov x.y,5
    mov x.z,5
    mov x.w,5

    XMConvertVectorIntToFloat(x, Value)

    movaps x,xmm0               ; store the converted vector back over x
    ; result = (double)x.x
    _mm_store_sd(result, _mm_cvtss_sd(_mm_load_ss(xmm0, x.x)))

    printf("result: %f\n", result)

    xor eax,eax                 ; return 0
    ret

main endp

    end

Using inline macros for building the library:
Code: [Select]
include DirectXMath.inc

    .code

    option win64:rsp nosave noauto

; Library wrapper built from the inl_ macro: under vectorcall the four float
; arguments arrive in xmm0-xmm3; otherwise rcx carries an XMMATRIX pointer
; (returned in rax) and the floats shift to xmm1-xmm4.
XMMatrixPerspectiveFovLH proc XM_CALLCONV XMTHISPTR, FovAngleY:float, AspectRatio:float, NearZ:float, FarZ:float
if _XM_VECTORCALL_
    inl_XMMatrixPerspectiveFovLH(xmm0,xmm1,xmm2,xmm3)
else
    assume rcx:ptr XMMATRIX
    inl_XMMatrixPerspectiveFovLH(xmm1,xmm2,xmm3,xmm4,[rcx])
    mov rax,rcx                 ; return the matrix pointer
endif
    ret

XMMatrixPerspectiveFovLH endp

    end

Code: [Select]
include DirectXMath.inc

    .code

    option win64:rsp nosave noauto

; Library wrapper built from the inl_ macro: under vectorcall the matrix is
; passed in registers; otherwise rcx and rdx are XMMATRIX pointers.
; NOTE(review): rcx appears to be the destination (it is returned in rax),
; rdx the source -- confirm against the inl_ macro definition.
XMMatrixTranspose proc XM_CALLCONV XMTHISPTR, AXMMATRIX
if _XM_VECTORCALL_
    inl_XMMatrixTranspose()
else
    assume rcx:ptr XMMATRIX
    assume rdx:ptr XMMATRIX
    inl_XMMatrixTranspose([rcx],[rdx])
    mov rax,rcx                 ; return the destination pointer
endif
    ret

XMMatrixTranspose endp

    end

nidud

  • Member
  • *****
  • Posts: 1565
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #89 on: July 04, 2018, 09:18:42 PM »
Added some extensions for return values from function calls.

Nested function calls adapts to argument size, now extended to XMM. Assigned values adapts to target size if possible, else default is used ([R|E]AX).

Code: [Select]
id1  label byte
id2  label word
id4  label dword
id8  label qword
id16 label oword

foo proc
    ret
foo endp

    add     al,   foo()
    add     ax,   foo()
    add     eax,  foo()
    add     rax,  foo()
    addps   xmm2, foo()

    mov     id1,  foo()
    mov     id2,  foo()
    mov     id4,  foo()
    mov     id8,  foo()
    movups  id16, foo()

bar proc x:real4, y:real16
    ret
bar endp

    bar(bar(xmm0, xmm1), xmm1)