Recent Posts

Pages: [1] 2 3 ... 10
1
Romper Room / Re: How programming works.....
« Last post by AW on Today at 05:52:56 AM »
Saying hello from London, the most religion and race tolerant place in the World. Arghhh.  :icon_rolleyes:

2
The Laboratory / Re: Simple floating point macros.
« Last post by hutch-- on Today at 05:26:56 AM »
Ray,

The target market for 64 bit MASM is different to the 32 bit version; it is not recommended for beginners at all, but for folks who already know how to write 32 bit MASM code. The difference with macros is the reference material, and it's easy enough to specify a "fstp variable" when the data needs to be placed in a variable, but the more efficient form without redundant loads and stores is in the direction that many who use legacy code like this want.

I am just about clapped out and ready to sleep but I will have a look at your suggestion when I get up later today.
3
ASMC Development / Re: Asmc source and binaries
« Last post by nidud on Today at 05:25:10 AM »
Added an AVX implementation of memcpy() using a switch with overlapping moves. The overhead for small counts is basically removed by using this method.

The AVX 32 byte version:
Code: [Select]
    ; memcpy-style block copy, AVX variant with a 32-byte main loop.
    ;
    ; In (as used by the code below):
    ;   rcx = destination, rdx = source, r8 = byte count
    ; Out:
    ;   rax = destination (copied from rcx on entry, never modified after)
    ; Scratch: rcx, rdx, r9, xmm0/xmm1, ymm0-ymm2, flags
    ;
    ; Strategy: every size class reads the first and the last chunk of the
    ; source into registers BEFORE storing anything, so the two moves may
    ; overlap each other and no byte-granular tail loop is needed.
    ;
    ; NOTE(review): the ymm paths return without vzeroupper - confirm whether
    ; callers running legacy SSE code need one.
    .code

    mov rax,rcx                         ; return value = destination

    .if r8 <= 32

        ; presumably suppresses the range test on the switch value, which
        ; the .if above already bounds to 0..32 - confirm in the Asmc docs
        option switch:notest

        .switch r8

          .case 0
            ret

          .case 1
            mov cl,[rdx]
            mov [rax],cl
            ret

          ; 2..4 bytes: first word + last word (they coincide when r8 = 2)
          .case 2,3,4
            mov cx,[rdx]
            mov dx,[rdx+r8-2]           ; overwrites dx; source pointer no longer needed
            mov [rax+r8-2],dx
            mov [rax],cx
            ret

          ; 5..8 bytes: first dword + last dword
          .case 5,6,7,8
            mov ecx,[rdx]
            mov edx,[rdx+r8-4]
            mov [rax+r8-4],edx
            mov [rax],ecx
            ret

          ; 9..16 bytes: first qword + last qword
          .case 9,10,11,12,13,14,15,16
            mov rcx,[rdx]
            mov rdx,[rdx+r8-8]
            mov [rax],rcx
            mov [rax+r8-8],rdx
            ret

          ; 17..32 bytes: first 16 + last 16 (unaligned SSE moves)
          .case 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32
            movdqu xmm0,[rdx]
            movdqu xmm1,[rdx+r8-16]
            movups [rax],xmm0
            movups [rax+r8-16],xmm1
            ret
        .endsw
    .endif

    ; r8 > 32 from here on.
    ; Capture the first and last 32 source bytes up front; they are stored
    ; last and cover whatever the aligned loop below does not reach.
    vmovdqu ymm1,[rdx]
    vmovdqu ymm2,[rdx+r8-32]
    .if r8 > 64

        mov ecx,eax
        neg ecx
        and ecx,32-1                    ; rcx = bytes up to the next 32-byte boundary of dest (0..31)
        add rdx,rcx                     ; advance source by the same amount
        mov r9,r8
        sub r9,rcx
        add rcx,rax                     ; rcx = 32-byte aligned destination
        and r9b,-32                     ; r9 = remaining count rounded down to a multiple of 32

        ; Destination above source: copy from the top down so that an
        ; overlapping source is read before it is overwritten.
        .if rcx > rdx

            .repeat
                sub r9,32               ; sets ZF for .untilz; the vector moves leave flags alone
                vmovdqu ymm0,[rdx+r9]
                vmovdqa [rcx+r9],ymm0   ; aligned store: rcx and r9 are multiples of 32
            .untilz
            vmovdqu [rax],ymm1          ; unaligned head
            vmovdqu [rax+r8-32],ymm2    ; unaligned tail
            ret
        .endif

        ; Otherwise copy upward: point both registers at the end of the
        ; aligned region and run a negative index up to zero.
        lea rcx,[rcx+r9]
        lea rdx,[rdx+r9]
        neg r9
        .repeat
            vmovdqu ymm0,[rdx+r9]
            vmovdqa [rcx+r9],ymm0
            add r9,32                   ; ZF set when the index reaches 0
        .untilz
    .endif
    ; 33..64 bytes arrive here directly; the two 32-byte stores cover the
    ; whole range. For larger counts they fill in the unaligned head and tail.
    vmovdqu [rax],ymm1
    vmovdqu [rax+r8-32],ymm2
    ret

    end

The AVX 64 byte version:
Code: [Select]
    ; memcpy-style block copy, AVX variant with a 64-byte main loop
    ; (two ymm registers per iteration).
    ;
    ; In (as used by the code below):
    ;   rcx = destination, rdx = source, r8 = byte count
    ; Out:
    ;   rax = destination (copied from rcx on entry, never modified after)
    ; Scratch: rcx, rdx, r9, xmm0/xmm1, ymm0-ymm5, flags
    ;
    ; Strategy: every size class reads the first and the last chunk of the
    ; source into registers BEFORE storing anything, so the moves may
    ; overlap each other and no byte-granular tail loop is needed.
    ;
    ; NOTE(review): the ymm paths return without vzeroupper - confirm whether
    ; callers running legacy SSE code need one.
    .code

    mov rax,rcx                         ; return value = destination

    .if r8 <= 64

        ; presumably suppresses the range test on the switch value, which
        ; the .if above already bounds to 0..64 - confirm in the Asmc docs
        option switch:notest

        .switch r8

          .case 0
            ret

          .case 1
            mov cl,[rdx]
            mov [rax],cl
            ret

          ; 2..4 bytes: first word + last word (they coincide when r8 = 2)
          .case 2,3,4
            mov cx,[rdx]
            mov dx,[rdx+r8-2]           ; overwrites dx; source pointer no longer needed
            mov [rax+r8-2],dx
            mov [rax],cx
            ret

          ; 5..8 bytes: first dword + last dword
          .case 5,6,7,8
            mov ecx,[rdx]
            mov edx,[rdx+r8-4]
            mov [rax+r8-4],edx
            mov [rax],ecx
            ret

          ; 9..16 bytes: first qword + last qword
          .case 9,10,11,12,13,14,15,16
            mov rcx,[rdx]
            mov rdx,[rdx+r8-8]
            mov [rax],rcx
            mov [rax+r8-8],rdx
            ret

          ; 17..32 bytes: first 16 + last 16 (unaligned SSE moves)
          .case 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32
            movdqu xmm0,[rdx]
            movdqu xmm1,[rdx+r8-16]
            movups [rax],xmm0
            movups [rax+r8-16],xmm1
            ret

          ; 33..64 bytes: first 32 + last 32 (unaligned AVX moves)
          .case 33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,\
                49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
            vmovdqu ymm0,[rdx]
            vmovdqu ymm1,[rdx+r8-32]
            vmovups [rax],ymm0
            vmovups [rax+r8-32],ymm1
            ret
        .endsw
    .endif

    ; r8 > 64 from here on.
    ; Capture the first 64 and the last 64 source bytes up front; they are
    ; stored last and cover whatever the aligned loop below does not reach.
    vmovdqu ymm2,[rdx]
    vmovdqu ymm3,[rdx+32]
    vmovdqu ymm4,[rdx+r8-32]
    vmovdqu ymm5,[rdx+r8-64]

    .if r8 > 128

        mov ecx,eax
        neg ecx
        and ecx,64-1                    ; rcx = bytes up to the next 64-byte boundary of dest (0..63)
        add rdx,rcx                     ; advance source by the same amount
        mov r9,r8
        sub r9,rcx
        add rcx,rax                     ; rcx = 64-byte aligned destination
        and r9b,-64                     ; r9 = remaining count rounded down to a multiple of 64

        ; Destination above source: copy from the top down so that an
        ; overlapping source is read before it is overwritten.
        .if rcx > rdx

            .repeat
                sub r9,64               ; sets ZF for .untilz; the vector moves leave flags alone
                vmovdqu ymm0,[rdx+r9]
                vmovdqu ymm1,[rdx+r9+32]
                vmovdqa [rcx+r9],ymm0   ; both loads are done before either aligned store
                vmovdqa [rcx+r9+32],ymm1
            .untilz
            vmovdqu [rax],ymm2          ; unaligned head (64 bytes)
            vmovdqu [rax+32],ymm3
            vmovdqu [rax+r8-32],ymm4    ; unaligned tail (64 bytes)
            vmovdqu [rax+r8-64],ymm5
            ret
            ; 13 bytes of 0x90 (NOP) after the ret, never executed;
            ; presumably padding to align the loop below - confirm
            db 13 dup(0x90)
        .endif

        ; Otherwise copy upward: point both registers at the end of the
        ; aligned region and run a negative index up to zero.
        lea rcx,[rcx+r9]
        lea rdx,[rdx+r9]
        neg r9
        .repeat
            vmovdqu ymm0,[rdx+r9]
            vmovdqu ymm1,[rdx+r9+32]
            vmovdqa [rcx+r9],ymm0
            vmovdqa [rcx+r9+32],ymm1
            add r9,64                   ; ZF set when the index reaches 0
        .untilz
    .endif
    ; 65..128 bytes arrive here directly; the four 32-byte stores cover the
    ; whole range. For larger counts they fill in the unaligned head and tail.
    vmovdqu [rax],ymm2
    vmovdqu [rax+32],ymm3
    vmovdqu [rax+r8-32],ymm4
    vmovdqu [rax+r8-64],ymm5
    ret

    end


total [1 .. 4], 1++
    25764 cycles 2.asm: switch 32 AVX
    25788 cycles 1.asm: switch 32 SSE
    27684 cycles 3.asm: switch 64 AVX
    47541 cycles 0.asm: msvcrt.memcpy()

total [15 .. 17], 1++
    30200 cycles 2.asm: switch 32 AVX
    30364 cycles 3.asm: switch 64 AVX
    33621 cycles 1.asm: switch 32 SSE
    64903 cycles 0.asm: msvcrt.memcpy()

total [63 .. 65], 1++
    32243 cycles 3.asm: switch 64 AVX
    32869 cycles 2.asm: switch 32 AVX
    49630 cycles 1.asm: switch 32 SSE
    90890 cycles 0.asm: msvcrt.memcpy()

total [127 .. 129], 1++
    38979 cycles 3.asm: switch 64 AVX
    41102 cycles 2.asm: switch 32 AVX
    71012 cycles 1.asm: switch 32 SSE
   131579 cycles 0.asm: msvcrt.memcpy()

total [511 .. 513], 1++
    84769 cycles 3.asm: switch 64 AVX
    86763 cycles 2.asm: switch 32 AVX
   126420 cycles 1.asm: switch 32 SSE
   226737 cycles 0.asm: msvcrt.memcpy()

total [1023 .. 1025], 1++
   129894 cycles 3.asm: switch 64 AVX
   156393 cycles 2.asm: switch 32 AVX
   240375 cycles 1.asm: switch 32 SSE
   420802 cycles 0.asm: msvcrt.memcpy()
4
Wide Char - a 16-bit character in a Unicode code page
UTF - Unicode Transformation Format, used to save space in text files
BOM - Byte Order Mark, used to tell how the text in a text file is encoded
5
The Laboratory / Re: Simple floating point macros.
« Last post by HSE on Today at 04:49:14 AM »
Why do I get the impression that this last post was not all that serious ? 
You can change the names  :biggrin:

But there is a problem. I will change it.
6
The Laboratory / Re: Simple floating point macros.
« Last post by raymond on Today at 04:23:37 AM »
Quote
fld   number    ; push number: st(0) = number
fld   st(0)     ; push a copy: st(0) = st(1) = number
fmulp           ; st(1) = st(1)*st(0), then pop: st(0) = number^2

Even more simple, there's not even any need to make a second copy in another FPU register:

fld   number    ; push number: st(0) = number
fmul st,st        ; st(0) = st(0)*st(0) = number^2, no second FPU register used


I still don't like the idea of leaving data in FPU registers with macros. In my opinion, the risk of generating garbage is too high for unaware users. Results should be stored immediately in a memory variable defined by the user in an additional arg.
7
Romper Room / Re: How programming works.....
« Last post by Siekmanski on Today at 04:22:04 AM »
I like the morals and ethics from Starfleet ( Star Trek ) based on learning, logic and reasoning.  :biggrin:

We come in peace.
8
The Laboratory / Re: Simple floating point macros.
« Last post by hutch-- on Today at 04:18:36 AM »
 :biggrin:

Why do I get the impression that this last post was not all that serious ?  :P
9
The Laboratory / Re: Simple floating point macros.
« Last post by HSE on Today at 04:05:05 AM »
 :biggrin: What a nightmare!

 ; fpinit: reset the x87 unit, push 0.0 as a running accumulator in st(0),
 ; and set the assembly-time flag HutchsoniansFP that fpadd tests.
 fpinit MACRO                ;; initialise the x87 co-processor
      fninit
      fldz                   ;; st(0) = 0.0
      HutchsoniansFP = 1     ;; assembly-time flag: accumulator present
    ENDM

 ; fpadd arg1 [, arg2]: load arg1 (plus arg2 when given) and, while
 ; HutchsoniansFP is non-zero, fold the sum into the value already on the
 ; FPU stack.
 ; NOTE(review): HutchsoniansFP must already be defined (fpinit/fpclose set
 ; it) or the "if" below presumably fails to assemble - confirm.
 fpadd MACRO arg1, arg2   
      fld arg1
      IFNB <arg2>            ;; only when a second operand was supplied
        fld arg2
        faddp                ; this is arg1+arg2
      ENDIF
      if HutchsoniansFP    ; if there is zero in st(0) [ or something else]
         faddp               ; this (arg1[+arg2]) + original st(0) [now in st(1)] 
      endif
    ENDM

   ; fpclose: clear the assembly-time flag, so later fpadd expansions no
   ; longer emit the extra faddp into the accumulator.
   fpclose macro
      HutchsoniansFP =0
    endm
10
What happened to your old avatar ?
Pages: [1] 2 3 ... 10