Author Topic: Asmc source and binaries  (Read 11002 times)

nidud

  • Member
  • Posts: 1494
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #75 on: May 27, 2018, 07:26:06 PM »
Microsoft sample using __ImageBase.
Code:
    page    ,132
    title   memset - set sections of memory all to one byte
;***
;memset.asm - set a section of memory to all one byte
;
;   Copyright (c) Microsoft Corporation. All rights reserved.
;
;Purpose:
;   contains the memset() routine
;
;*******************************************************************************

include ksamd64.inc
        subttl  "memset"
;***
;char *memset(dst, value, count) - sets "count" bytes at "dst" to "value"
;
;Purpose:
;   Sets the first "count" bytes of the memory starting
;   at "dst" to the character value "value".
;
;   Algorithm:
;   char *
;   memset (dst, value, count)
;       char *dst;
;       char value;
;       size_t count;
;       {
;       char *start = dst;
;
;       while (count--)
;           *dst++ = value;
;       return(start);
;       }
;
;Entry:
;   char *dst - pointer to memory to fill with value
;   char value - value to put in dst bytes
;   size_t count - number of bytes of dst to fill
;
;Exit:
;   returns dst, with filled bytes
;
;Uses:
;
;Exceptions:
;
;*******************************************************************************
    extrn   __favor:dword
    extrn   __ImageBase:byte

__FAVOR_ENFSTRG equ 1
__FAVOR_SMSTRG  equ 2

        LEAF_ENTRY_ARG3 memset, _TEXT, buf:ptr byte, value:byte, count:dword

        OPTION PROLOGUE:NONE, EPILOGUE:NONE

        mov     r11, rcx                        ; save destination address
        movzx   edx, dl                         ; set fill pattern
        mov     r9, 0101010101010101h           ; replicate fill over 8 bytes
        imul    r9, rdx
        cmp     r8, 16
        jbe     SetBytes16                      ; if count <= 16, use GPR stores to set
        movd    xmm0, r9                        ; bytes to store in bits [0:63]
        punpcklbw xmm0, xmm0                    ; dup bytes to [127:64]

;
; Check if strings should be used
;
        cmp     r8, 128                         ; is this a small set, size <= 128?
        ja      XmmSet                          ; if large set, use XMM set
        bt      __favor, __FAVOR_SMSTRG         ; check if string set should be used
        jnc     XmmSetSmall                     ; otherwise, use a 16-byte block set

StringSet:
        mov     eax, edx                        ; set byte to move
        mov     rdx, rdi                        ; save rdi
        mov     rdi, rcx                        ; set destination
        mov     rcx, r8                         ; set count
        rep     stosb                           ; store the bytes
        mov     rdi, rdx                        ; restore rdi
        mov     rax, r11
        ret                                     ; return

;
; Fill using SSE instructions - size must be 16 or more.
;
        ; xmm0 has the byte to store replicated to all byte positions
        ; rcx has the destination, can be overwritten
        ; r11 has the destination, must be preserved for return value
        ; r8  has the count
        align   16
XmmSet:
        bt      __favor, __FAVOR_ENFSTRG        ; check if string set should be used
        jc      StringSet

        ; Aligned stores are much faster on AMD hardware. We need to do an unaligned
        ; store of (16 - (dest mod 16)) bytes, but it's faster to just store 16 bytes
        ; and then start the aligned loop as usual at ((dest + 16) - (dest mod 16)).
        ; This results in (dest mod 16) bytes being stored twice. This is a lot faster
        ; than a bunch of code to store maybe 8 then maybe 4 then maybe 2 then maybe 1
        ; byte to achieve alignment. It could cause data breakpoints to trigger twice,
        ; but they will hit here first and hopefully you will read this comment.
        ; The fastest way to subtract (16 - (dest mod 16)) from the length is to add
        ; (original dest - aligned dest). This avoids having to calculate the value.

        movups  [rcx], xmm0                     ; store 16 unaligned from start
        add     r8, rcx                         ; r8 = dest + length
        add     rcx, 16                         ; dest = (dest + 16)
        and     rcx, -16                        ; dest = (dest + 16) - (dest mod 16)
        sub     r8, rcx                         ; r8 = remaining length

; Attempt to set 128-byte blocks.
;
XmmSetLarge:
        mov     r9, r8                          ; copy count of bytes remaining
        shr     r9, 7                           ; compute number of 128-byte blocks
        jz      XmmSetSmall                     ; if z, no 128-byte blocks to fill
;
; Set 128-byte blocks
        align   16
XmmSetLargeLoop:
        movaps  0[rcx], xmm0
        movaps  16[rcx], xmm0
        add     rcx, 128                        ; advance destination address early
        movaps  (32 - 128)[rcx], xmm0
        movaps  (48 - 128)[rcx], xmm0
        dec     r9                              ; dec block counter (set cc for jnz)
        movaps  (64 - 128)[rcx], xmm0
        movaps  (80 - 128)[rcx], xmm0
        movaps  (96 - 128)[rcx], xmm0
        movapd  (112 - 128)[rcx], xmm0
        ; To avoid generating a one-byte NOP for the 'align 16' below, the previous
        ; instruction is movapd instead of movaps, which is one byte longer but
        ; performs exactly the same operation.
        jnz     XmmSetLargeLoop                 ; loop if more blocks

        and     r8, 127                         ; compute remaining byte count
;
; Attempt to set 16-byte blocks
        align   16
XmmSetSmall:
        mov     r9, r8                          ; copy count of bytes remaining
        shr     r9, 4                           ; compute number of 16-byte blocks
        jz      short XmmSetTrailing

        ; This generates an 8-byte nop, which we execute once. This will change only if
        ; any of the code above this point is modified. The following loop thus is
        ; completely contained within one instruction decode buffer on AMD hardware.
        align   16

;
; Set 16-byte blocks
;
XmmSetSmallLoop:
        movups  [rcx], xmm0
        add     rcx, 16
        dec     r9
        jnz     short XmmSetSmallLoop

XmmSetTrailing:
        and     r8, 15                          ; compute remaining length
        jz      XmmSetReturn                    ; skip over movups if done, we could just do it anyway

        ; As at the start, we are going to do an unaligned store of 16 bytes which will overwrite
        ; some bytes already stored. The math is easier, rcx+r8 is one byte past the end, just
        ; back up 16 from there and store 16.

        movups  [rcx+r8-16], xmm0               ; write remainder, overwriting 16-r8 bytes we already wrote

XmmSetReturn:
        mov     rax, r11                        ; must return original dest that we saved in r11
        ret

;
; Jump table for fills of 16 bytes or fewer
;
        align 4
MsetTab dd  IMAGEREL msetTab00
        dd  IMAGEREL msetTab01
        dd  IMAGEREL msetTab02
        dd  IMAGEREL msetTab03
        dd  IMAGEREL msetTab04
        dd  IMAGEREL msetTab05
        dd  IMAGEREL msetTab06
        dd  IMAGEREL msetTab07
        dd  IMAGEREL msetTab08
        dd  IMAGEREL msetTab09
        dd  IMAGEREL msetTab10
        dd  IMAGEREL msetTab11
        dd  IMAGEREL msetTab12
        dd  IMAGEREL msetTab13
        dd  IMAGEREL msetTab14
        dd  IMAGEREL msetTab15
        dd  IMAGEREL msetTab16

        ; Set blocks that are 16 bytes long or shorter.
        ; Preconditions:
        ; rdx has the byte to fill and has been zero extended
        ; rcx has dest
        ; r8 has len, r8 <= 16
        ; r11 has the dest
        align   16
SetBytes16:
        mov     rdx, r9
        lea     r9, OFFSET __ImageBase
        mov     eax, [(IMAGEREL  MsetTab) + r9 +r8*4]
        add     r9, rax
        add     rcx, r8                         ; rcx is now 1 past last byte to set
        mov     rax, r11                        ; set return value
        jmp     r9


        align   16

        ; Code for setting various sized blocks up to 16 bytes long.
        ; preconditions:
        ; rcx points 1 byte beyond end of bytes to set
        ; rax has the correct return value (the original dest)
        ; each byte of the rdx reg is set to the byte to store
msetTab15:
        mov     (-15)[rcx], rdx
        ; fallthrough to 7
msetTab07:
        mov     (-7)[rcx], edx
        ; fallthrough to 3
msetTab03:
        mov     (-3)[rcx], dx
        ; fallthrough to 1
msetTab01:
        mov     (-1)[rcx], dl
msetTab00:
        ret

        align   16
msetTab12:
        mov     (-12)[rcx], rdx
        ; fallthrough to 4
msetTab04:
        mov     (-4)[rcx], edx
        ret

msetTab09:
        mov     (-9)[rcx], rdx
        mov     (-1)[rcx], dl
        ret

        align   16
msetTab13:
        mov     (-13)[rcx], rdx
        ; fallthrough to 5
msetTab05:
        mov     (-5)[rcx], edx
        mov     (-1)[rcx], dl
        ret

        align   16
msetTab14:
        mov     (-14)[rcx], rdx
        ; fallthrough to 6
msetTab06:
        mov     (-6)[rcx], edx
        ; fallthrough to 2
msetTab02:
        mov     (-2)[rcx], dx
        ret

msetTab08:
        mov     [rax], rdx
        ret

msetTab11:
        mov     [rax], rdx
        mov     (8)[rax], dx
        mov     (10)[rax], dl
        ret

        align   16
msetTab10:
        mov     [rax], rdx
        mov     (8)[rax], dx
        ret

msetTab16:
        mov     [rax], rdx
        mov     (8)[rax], rdx
        ret

        LEAF_END memset, _TEXT

    end
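
For illustration, here is a minimal stand-alone sketch of the __ImageBase/IMAGEREL idiom the sample relies on (DispTab, dispatch and the handler labels are made up for this sketch, not part of the CRT source): the jump table stores 32-bit image-relative offsets (RVAs) rather than 64-bit absolute pointers, and the dispatch code rebuilds the absolute address at run time by adding the image base, so the table stays at 4 bytes per entry and needs no base relocations.

Code:
        extrn   __ImageBase:byte                ; pseudo symbol the linker places at the module base

        .code

        align   4
DispTab dd      IMAGEREL handler0               ; 32-bit RVAs instead of 8-byte pointers
        dd      IMAGEREL handler1

; dispatch(selector) - ecx = 0 or 1
dispatch proc
        mov     eax, ecx                        ; zero-extend the selector into rax
        lea     r9, OFFSET __ImageBase          ; r9 = load address of this module
        mov     eax, [(IMAGEREL DispTab) + r9 + rax*4] ; fetch the RVA of the selected handler
        add     r9, rax                         ; RVA + image base = absolute address
        jmp     r9
dispatch endp

handler0:
        xor     eax, eax
        ret

handler1:
        mov     eax, 1
        ret

        end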

jj2007

  • Member
  • Posts: 8424
  • Assembler is fun ;-)
    • MasmBasic
Re: Asmc source and binaries
« Reply #76 on: May 27, 2018, 09:18:23 PM »
Let me save the newcomers to Asmc some time - and some embarrassment. The help file is located at:

asmc-master\source\asmc\asmc.chm

Is there another version? https://github.com/nidud/asmc/raw/master/source/asmc/asmc.chm looks a bit empty :(

nidud

  • Member
  • Posts: 1494
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #77 on: May 27, 2018, 10:17:51 PM »
I think the OS puts a block on the file if you download it directly. Try right-click -> Properties -> General and see if there is an "Unblock" option there.

jj2007

  • Member
  • *****
  • Posts: 8424
  • Assembler is fun ;-)
    • MasmBasic
Re: Asmc source and binaries
« Reply #78 on: May 27, 2018, 10:32:08 PM »
It's not that empty, actually. The table of contents is there, but no content. Do you have a proper installer for AsmC which puts inc and lib files into their proper folders etc?

P.S.: This is odd. I just re-opened asmc-master\source\asmc\asmc.chm, and now it shows content, too ::)
No mention of the includes and libraries, though - is there a separate help file? Examples?

nidud

  • Member
  • Posts: 1494
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #79 on: May 27, 2018, 10:45:02 PM »
No.

nidud

  • Member
  • Posts: 1494
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #80 on: May 27, 2018, 11:50:54 PM »
No mention of the includes and libraries, though - is there a separate help file? Examples?

Assuming you have downloaded and unzipped the archive, you should have something like this:

.\asmc-master
  bin
  include
  lib
  source
  dz.exe


- the include files are in the include directory
- the lib files are in the lib directory
- the source files are in the source directory

The simplest way to build the import libraries is to run the shell (dz.exe). This will set up the environment for the base directory, wherever that may be.

From the shell, enter the lib directory, select the makefile(s), and hit Enter. To build the static libraries, browse into the source directory and repeat the above. The examples are in the ./source/test directory.

nidud

  • Member
  • Posts: 1494
    • https://github.com/nidud/asmc
Re: Asmc source and binaries
« Reply #81 on: June 13, 2018, 11:16:23 PM »
Added a few updates in an attempt to build a 64-bit version of Asmc using the 64-bit library. This uses CL.EXE version 18 (Visual Studio 12), with the __ImageBase logic added as explained above.

I converted a few of the bottleneck functions from the Asmc source, mostly the ASCII-to-binary (number) conversions and the hash algorithms.

The benchmark for the C/ASM implementations:

19532 ClockTicks: Asmc  32-bit asmc-lib 342K
21871 ClockTicks: Asmc  64-bit asmc-lib 448K
28392 ClockTicks: JWasm 32-bit watcom   345K
30327 ClockTicks: Uasm  64-bit windows  844K
40466 ClockTicks: Uasm  32-bit windows  723K
49375 ClockTicks: Asmc  64-bit windows  531K

Uasm is built with a newer version of VS, so it optimizes better than the C version of Asmc. I made a fatal mistake when installing Windows on the new (lunch) box: I opted for the default C/D partition sizes and ended up with a rather small C drive, which in turn prevents installing the newest version of Visual Studio.

Asmc does not use any registers in a .switch unless the REGAX option is added. Some changes were made for the 64-bit version, where R11 is used instead of RAX.

A direct jump without testing (NOTEST) is not really possible, given that the code is unknown at the top, so the switch will always start with a jump to the test label at the end of the switch. However, it is possible to use the list file to get the name of the jump-table label and use it directly:

Code:
    .code

memcpy::
memmove::

    mov rax,rcx

    .if r8 < 32

        option switch:table, switch:notest, switch:regax

        lea r11,@C0024
        jmp qword ptr [r8*8+r11]

        .switch r8

          .case 0
            ret

          .case 1
            mov cl,[rdx]
            mov [rax],cl
            ret

          .case 2,3
            mov cx,[rdx]
            mov dx,[rdx+r8-2]
            mov [rax+r8-2],dx
            mov [rax],cx
            ret

          .case 4,5,6,7
            mov ecx,[rdx]
            mov edx,[rdx+r8-4]
            mov [rax+r8-4],edx
            mov [rax],ecx
            ret

          .case 8,9,10,11,12,13,14,15
            movq xmm0,[rdx]
            movq xmm1,[rdx+r8-8]
            movq [rax],xmm0
            movq [rax+r8-8],xmm1
            ret

          .case 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
            movdqu xmm0,[rdx]
            movdqu xmm1,[rdx+r8-16]
            movups [rax],xmm0
            movups [rax+r8-16],xmm1
            ret
        .endsw
    .endif

    movdqu xmm2,[rdx]           ; load the first 32 and the last 32 bytes of the
    movdqu xmm3,[rdx+16]        ; source up front; they are stored to the
    movdqu xmm4,[rdx+r8-16]     ; destination last, so later stores may safely
    movdqu xmm5,[rdx+r8-32]     ; overlap them
    .if r8 < 64
        movups [rax],xmm2
        movups [rax+16],xmm3
        movups [rax+r8-16],xmm4
        movups [rax+r8-32],xmm5
        ret
    .endif

    mov ecx,eax                 ; eax = low 32 bits of dest
    neg ecx
    and ecx,32-1                ; ecx = bytes up to the next 32-byte boundary of dest
    add rdx,rcx                 ; advance the source by the same amount
    mov r9,r8
    sub r9,rcx                  ; r9 = count remaining after the head adjustment
    add rcx,rax                 ; rcx = dest rounded up to 32-byte alignment
    and r9b,-32                 ; round the remaining count down to a multiple of 32

    .if rcx > rdx               ; dest above source: copy the aligned block backwards

        .repeat
            sub r9,32
            movups xmm0,[rdx+r9]
            movups xmm1,[rdx+r9+16]
            movaps [rcx+r9],xmm0
            movaps [rcx+r9+16],xmm1
        .untilz
        movups [rax],xmm2
        movups [rax+16],xmm3
        movups [rax+r8-16],xmm4
        movups [rax+r8-32],xmm5
        ret
    .endif

    lea rcx,[rcx+r9]            ; otherwise copy forwards: point both pointers past
    lea rdx,[rdx+r9]            ; the aligned block and count up from -length
    neg r9
    .repeat
        movups xmm0,[rdx+r9]
        movups xmm1,[rdx+r9+16]
        movaps [rcx+r9],xmm0
        movaps [rcx+r9+16],xmm1
        add r9,32
    .untilz
    movups [rax],xmm2
    movups [rax+16],xmm3
    movups [rax+r8-16],xmm4
    movups [rax+r8-32],xmm5
    ret

    end