Author Topic: x86 instruction generator - Checking Correctness of Code Generator

LiaoMi

  • Member
  • ****
  • Posts: 593
Hello,

here is a cool project in Haskell that lets you generate instruction sets for testing. For 64-bit code we would need to add parameters, which is very easy thanks to the Haskell language. I have not yet compared the opcode tables against this page - http://ref.x86asm.net/coder64.html - but that could be done for quality control.
 
A piece of the output:
Code: [Select]
; #########################################################################

      .486
      .model flat, stdcall
      option casemap :none   ; case sensitive

; #########################################################################

      include \masm32\include\windows.inc
      include \masm32\include\user32.inc
      include \masm32\include\kernel32.inc

      includelib \masm32\lib\user32.lib
      includelib \masm32\lib\kernel32.lib

; #########################################################################

    .data

    someDword dd 5
   
    .code

start:

jcxz start
jecxz start
loop start
loope start
loopne start
adc al, 12h
adc bl, 12h
adc cl, 12h
adc dl, 12h
adc ah, 12h
adc bh, 12h
adc ch, 12h
adc dh, 12h
adc ax, 12h
adc bx, 12h
adc cx, 12h
adc dx, 12h
adc si, 12h
adc di, 12h
adc sp, 12h
adc bp, 12h
adc ax, 512h
adc bx, 512h
adc cx, 512h
adc dx, 512h
adc si, 512h
adc di, 512h
adc sp, 512h
adc bp, 512h
adc eax, 12h
adc ebx, 12h
adc ecx, 12h
adc edx, 12h
adc esi, 12h
adc edi, 12h
adc esp, 12h
adc ebp, 12h
adc eax, 6237512h
adc ebx, 6237512h
adc ecx, 6237512h
adc edx, 6237512h
adc esi, 6237512h
adc edi, 6237512h
adc esp, 6237512h
adc ebp, 6237512h
adc byte ptr [someDword], 12h
adc byte ptr [eax], 12h
adc byte ptr [ebx], 12h
adc byte ptr [ecx], 12h
adc byte ptr [edx], 12h
adc byte ptr [esi], 12h
adc byte ptr [edi], 12h
adc byte ptr [esp], 12h
adc byte ptr [ebp], 12h
adc byte ptr [eax + 123456h], 12h
adc byte ptr [ebx + 123456h], 12h
adc byte ptr [ecx + 123456h], 12h
adc byte ptr [edx + 123456h], 12h
adc byte ptr [esi + 123456h], 12h
adc byte ptr [edi + 123456h], 12h
adc byte ptr [esp + 123456h], 12h
adc byte ptr [ebp + 123456h], 12h
adc byte ptr [4 * eax + 123456h], 12h
adc byte ptr [4 * ebx + 123456h], 12h
adc byte ptr [4 * ecx + 123456h], 12h
adc byte ptr [4 * edx + 123456h], 12h
adc byte ptr [4 * esi + 123456h], 12h
adc byte ptr [4 * edi + 123456h], 12h
adc byte ptr [4 * ebp + 123456h], 12h
adc byte ptr [eax + eax], 12h
adc byte ptr [eax + ebx], 12h
adc byte ptr [eax + ecx], 12h
adc byte ptr [eax + edx], 12h
adc byte ptr [eax + esi], 12h
adc byte ptr [eax + edi], 12h
adc byte ptr [eax + esp], 12h
adc byte ptr [eax + ebp], 12h
adc byte ptr [ebx + eax], 12h
adc byte ptr [ebx + ebx], 12h
adc byte ptr [ebx + ecx], 12h
adc byte ptr [ebx + edx], 12h
adc byte ptr [ebx + esi], 12h
adc byte ptr [ebx + edi], 12h
adc byte ptr [ebx + esp], 12h
adc byte ptr [ebx + ebp], 12h
adc byte ptr [ecx + eax], 12h
adc byte ptr [ecx + ebx], 12h
adc byte ptr [ecx + ecx], 12h
adc byte ptr [ecx + edx], 12h
adc byte ptr [ecx + esi], 12h
adc byte ptr [ecx + edi], 12h
adc byte ptr [ecx + esp], 12h
adc byte ptr [ecx + ebp], 12h
adc byte ptr [edx + eax], 12h
adc byte ptr [edx + ebx], 12h
adc byte ptr [edx + ecx], 12h
adc byte ptr [edx + edx], 12h
adc byte ptr [edx + esi], 12h
adc byte ptr [edx + edi], 12h
adc byte ptr [edx + esp], 12h
adc byte ptr [edx + ebp], 12h
adc byte ptr [esi + eax], 12h
adc byte ptr [esi + ebx], 12h
adc byte ptr [esi + ecx], 12h
adc byte ptr [esi + edx], 12h
adc byte ptr [esi + esi], 12h
adc byte ptr [esi + edi], 12h
adc byte ptr [esi + esp], 12h
adc byte ptr [esi + ebp], 12h
adc byte ptr [edi + eax], 12h
adc byte ptr [edi + ebx], 12h
adc byte ptr [edi + ecx], 12h
adc byte ptr [edi + edx], 12h
adc byte ptr [edi + esi], 12h
adc byte ptr [edi + edi], 12h
adc byte ptr [edi + esp], 12h
adc byte ptr [edi + ebp], 12h
adc byte ptr [esp + eax], 12h
adc byte ptr [esp + ebx], 12h
adc byte ptr [esp + ecx], 12h
adc byte ptr [esp + edx], 12h
adc byte ptr [esp + esi], 12h
adc byte ptr [esp + edi], 12h
adc byte ptr [esp + ebp], 12h
adc byte ptr [ebp + eax], 12h
adc byte ptr [ebp + ebx], 12h
adc byte ptr [ebp + ecx], 12h
adc byte ptr [ebp + edx], 12h
adc byte ptr [ebp + esi], 12h
adc byte ptr [ebp + edi], 12h
adc byte ptr [ebp + esp], 12h
adc byte ptr [ebp + ebp], 12h
adc byte ptr [4 * eax + eax], 12h
adc byte ptr [4 * eax + ebx], 12h
adc byte ptr [4 * eax + ecx], 12h
adc byte ptr [4 * eax + edx], 12h
adc byte ptr [4 * eax + esi], 12h
adc byte ptr [4 * eax + edi], 12h
adc byte ptr [4 * eax + esp], 12h
adc byte ptr [4 * eax + ebp], 12h
adc byte ptr [4 * ebx + eax], 12h
adc byte ptr [4 * ebx + ebx], 12h
adc byte ptr [4 * ebx + ecx], 12h
adc byte ptr [4 * ebx + edx], 12h
adc byte ptr [4 * ebx + esi], 12h
adc byte ptr [4 * ebx + edi], 12h
adc byte ptr [4 * ebx + esp], 12h
adc byte ptr [4 * ebx + ebp], 12h
adc byte ptr [4 * ecx + eax], 12h
adc byte ptr [4 * ecx + ebx], 12h
adc byte ptr [4 * ecx + ecx], 12h
adc byte ptr [4 * ecx + edx], 12h
adc byte ptr [4 * ecx + esi], 12h
adc byte ptr [4 * ecx + edi], 12h
adc byte ptr [4 * ecx + esp], 12h
adc byte ptr [4 * ecx + ebp], 12h
adc byte ptr [4 * edx + eax], 12h
adc byte ptr [4 * edx + ebx], 12h
adc byte ptr [4 * edx + ecx], 12h
adc byte ptr [4 * edx + edx], 12h
adc byte ptr [4 * edx + esi], 12h
adc byte ptr [4 * edx + edi], 12h
adc byte ptr [4 * edx + esp], 12h
adc byte ptr [4 * edx + ebp], 12h
adc byte ptr [4 * esi + eax], 12h
adc byte ptr [4 * esi + ebx], 12h
adc byte ptr [4 * esi + ecx], 12h
adc byte ptr [4 * esi + edx], 12h
adc byte ptr [4 * esi + esi], 12h
adc byte ptr [4 * esi + edi], 12h
adc byte ptr [4 * esi + esp], 12h
adc byte ptr [4 * esi + ebp], 12h
adc byte ptr [4 * edi + eax], 12h
adc byte ptr [4 * edi + ebx], 12h
adc byte ptr [4 * edi + ecx], 12h
adc byte ptr [4 * edi + edx], 12h
adc byte ptr [4 * edi + esi], 12h
adc byte ptr [4 * edi + edi], 12h
adc byte ptr [4 * edi + esp], 12h
adc byte ptr [4 * edi + ebp], 12h
adc byte ptr [4 * ebp + eax], 12h
adc byte ptr [4 * ebp + ebx], 12h
adc byte ptr [4 * ebp + ecx], 12h
adc byte ptr [4 * ebp + edx], 12h
adc byte ptr [4 * ebp + esi], 12h
adc byte ptr [4 * ebp + edi], 12h
adc byte ptr [4 * ebp + esp], 12h
adc byte ptr [4 * ebp + ebp], 12h
adc word ptr [someDword], 12h

This helper can detect errors like the one here (the mov byte ptr [rax] bug) - http://masm32.com/board/index.php?topic=7829.0

Haskell - An advanced, purely functional programming language - https://www.haskell.org/platform/windows.html
Documentation - https://haskell.org/definition/haskell2010.pdf
Leksah - Haskell Free IDE in Haskell - https://github.com/leksah/leksah/wiki/Leksah-0.15.2.0


AW

  • Member
  • *****
  • Posts: 2435
  • Let's Make ASM Great Again!
Re: x86 instruction generator - Checking Correctness of Code Generator
« Reply #1 on: May 08, 2019, 12:29:09 AM »
Interesting and potentially useful.
I see that you have included 16-bit 8086-only instructions, like pop ax, together with 32-bit instructions. However, no 64-bit instructions were generated.

LiaoMi

  • Member
  • ****
  • Posts: 593
Re: x86 instruction generator - Checking Correctness of Code Generator
« Reply #2 on: May 08, 2019, 05:11:58 AM »
x86 instruction generator

A piece of the old discussion can be found here - http://masm32.com/board/index.php?topic=8015.45

Changes
- Fixed generation of instructions with one operand (an erroneous array size was specified).
- Register configuration for AVX-512 has been added (ZMM0 - ZMM31, YMM0 - YMM31, XMM0 - XMM31). Details here - https://en.wikipedia.org/wiki/AVX-512

Encoding and features
The VEX prefix used by AVX and AVX2, while flexible, did not leave enough room for the features Intel wanted to add to AVX-512. This led Intel to define a new prefix called EVEX.

Compared to VEX, EVEX adds the following benefits:

- Expanded register encoding, allowing 32 512-bit registers.
- Support for up to 4 operands.
- Adds 8 new opmask registers (k0 - k7) for masking most AVX-512 instructions.
- Adds a new scalar memory mode that automatically performs a broadcast.
- Adds room for explicit rounding control in each instruction.
- Adds a new compressed displacement memory addressing mode.

The extended registers, SIMD width bit, and opmask registers of AVX-512 are mandatory and all require support from the OS.
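
As an illustration (not output from the generator), here is a minimal MASM-style sketch of the masking and embedded-broadcast features; the registers are arbitrary, and the bcst keyword is ml64 notation that other assemblers may spell differently:
Code: [Select]
; zeroing-masked add: lanes where k1 is 0 are written as 0.0 ({z}, the EVEX.z bit)
vaddpd zmm1{k1}{z}, zmm2, zmm3
; embedded broadcast: one 64-bit memory value replicated to all 8 lanes (m64bcst)
vaddpd zmm1, zmm2, real8 bcst [rax]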


- Formatting errors in the database have been fixed, and empty lines have been cleared.
- Checkbox handlers were implemented but are not yet active.
- Three-operand instructions are implemented.
- Five-operand instructions have been dropped; I did not find any instruction with five operands except VPERMIL2PS ymm1, ymm2, ymm3, ymm4/m256, imz2, and those instructions are no longer supported. This means it only remains to complete the data types, since all instruction lengths can now be generated.
- The imul instructions were added to the database; their table was not processed by the automatic parsing due to the presence of a footnote.



Leave your feedback! I am interested in your opinion  :wink2:

P.S. Please take a look at the document with the types for implementation; I'm not sure that every encoding is important for assembly.
« Last Edit: August 29, 2019, 05:41:00 AM by LiaoMi »

LiaoMi

  • Member
  • ****
  • Posts: 593
Re: x86 instruction generator - Checking Correctness of Code Generator
« Reply #3 on: August 29, 2019, 02:30:43 AM »
Hi,

the update can be downloaded above (http://masm32.com/board/index.php?topic=7833.msg85925#msg85925). Now some questions: how should these be implemented?

m64bcst - a 512-bit vector formed by broadcasting a 64-bit memory location.
Code: [Select]
.data
align 32
Ymmsk ymmword 000000000100000006000000070000000C0000000D0000000A0000000B000000h

.code
ymmword ptr [ Ymmsk ]
?!
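
If I understand m64bcst correctly, the memory operand is a single 64-bit value that the hardware broadcasts to every lane, so the data definition would be one real8 rather than a full ymmword/zmmword. A rough, untested sketch with a made-up label (the bcst keyword is ml64 syntax; other assemblers use their own notation, e.g. a {1to8} decorator):
Code: [Select]
.data
align 8
OneQword real8 1.5                        ; single 64-bit value to be broadcast

.code
vaddpd zmm1, zmm2, real8 bcst [OneQword]  ; hardware replicates the value to all 8 lanes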

vm32{x,y, z}
vm64{x,y, z}

I think there are no such registers on my laptop  :biggrin:
k1 — A mask register used as a regular operand (either destination or source). The 64-bit k registers are: k0 through k7.
bnd — A 128-bit bounds register. BND0 through BND3 (Intel MPX).
{k1}{z} A mask register used as instruction writemask. The 64-bit k registers are: k1 through k7.
{k1} A mask register used as instruction writemask. The 64-bit k registers are: k1 through k7.

VMAXPD xmm1{k1}{z},xmm2,xmm3/m128/m64bcst  - what will the assembler code look like?

Code: [Select]
.data
align 32
Xmmsk xmmword 7FFFFFFFFFFFFFFF7FFFFFFFFFFFFFFFh

.code
VMAXPD xmm1{k1}{z},xmm2,xmmword ptr [ Xmmsk ]

What to do with k1? We need to initialize k1 first ...
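
A minimal sketch of one way to initialize k1 before the masked instruction (the mask value here is arbitrary, and kmovw is AVX-512F):
Code: [Select]
mov eax, 3                                     ; arbitrary mask: select both double lanes
kmovw k1, eax                                  ; load the mask into opmask register k1
vmaxpd xmm1{k1}{z}, xmm2, xmmword ptr [Xmmsk]  ; masked VMAXPD, zeroing unselected lanes
Alternatively, a vcmpsd/vcmppd compare can write the mask directly, as the book examples further down show.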

Will Oracle VM VirtualBox be useful for testing purposes? The k registers and bnd registers I can implement; in general, that is not a problem. vm32/vm64 - I don't know what they are (my best guess is sketched below) ... m64bcst - I'm still not sure the broadcast sketch above is really how this memory access should look.
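
My best guess for vm32{x,y,z} / vm64{x,y,z}: they are not registers but VSIB memory operands used by the gather/scatter instructions, where the index is a vector register, so every lane addresses its own memory location. A rough, untested MASM-style sketch of a vm32y operand (registers arbitrary; the EVEX gather requires a non-zero opmask):
Code: [Select]
; vm32y: base register plus a YMM register holding eight 32-bit indices, scaled by 4
vpgatherdd ymm1{k1}, [rax + ymm2*4]   ; gather 8 dwords; k1 selects which lanes are loaded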
« Last Edit: August 29, 2019, 05:42:28 AM by LiaoMi »

LiaoMi

  • Member
  • ****
  • Posts: 593
Re: x86 instruction generator - Checking Correctness of Code Generator
« Reply #4 on: August 29, 2019, 09:19:44 PM »
As an example, I took the source code from the book Modern X86 Assembly Language Programming: Covers x86 64-bit, AVX, AVX2, and AVX-512 by Daniel Kusswurm.

Source code
https://github.com/Apress/modern-x86-assembly-language-programming-2e

From chapter 13  :tongue:

Code: [Select]
;-------------------------------------------------
;               Ch13_01.asm
;-------------------------------------------------

            include <cmpequ.asmh>
            .const
r8_three    real8 3.0
r8_four     real8 4.0

            extern g_PI:real8

; extern "C" bool Avx512CalcSphereAreaVol_(double* sa, double* v, double r, double error_val);
;
; Returns:  false = invalid radius, true = valid radius

        .code
Avx512CalcSphereAreaVol_ proc

; Test radius for value >= 0.0
        vmovsd xmm0,xmm0,xmm2               ;xmm0 = radius
        vxorpd xmm5,xmm5,xmm5               ;xmm5 = 0.0
        vmovsd xmm16,xmm16,xmm3             ;xmm16 = error_val
        vcmpsd k1,xmm0,xmm5,CMP_GE          ;k1[0] = 1 if radius >= 0.0

; Calculate surface area and volume using mask from compare
        vmulsd xmm1{k1},xmm0,xmm0           ;xmm1 = r * r
        vmulsd xmm2{k1},xmm1,[r8_four]      ;xmm2 = 4 * r * r
        vmulsd xmm3{k1},xmm2,[g_PI]         ;xmm3 = 4 * PI * r * r (sa)
        vmulsd xmm4{k1},xmm3,xmm0           ;xmm4 = 4 * PI * r * r * r
        vdivsd xmm5{k1},xmm4,[r8_three]     ;xmm5 = 4 * PI * r * r * r / 3 (vol)

; Set surface area and volume to error_val if radius < 0.0 is true
        knotw k2,k1                         ;k2[0] = 1 if radius < 0.0
        vmovsd xmm3{k2},xmm3,xmm16          ;xmm3 = error_val if radius < 0.0
        vmovsd xmm5{k2},xmm5,xmm16          ;xmm5 = error_val if radius < 0.0

; Save results
        vmovsd real8 ptr [rcx],xmm3         ;save surface area
        vmovsd real8 ptr [rdx],xmm5         ;save volume

        kmovw eax,k1                        ;eax = return code
        ret
Avx512CalcSphereAreaVol_ endp
        end

How the k1 register is initialized can be seen in the example  :thumbsup:


One more example, {k1}{z}: this means the generator must generate all variations of the k register - vsqrtsd xmm3{k1}{z}, vsqrtsd xmm3{k2}{z}, vsqrtsd xmm3{k3}{z}, etc. The {z} suffix selects zeroing masking, which is encoded with the EVEX.z bit.
Code: [Select]
;-------------------------------------------------
;               Ch13_02.asm
;-------------------------------------------------

        include <cmpequ.asmh>

; extern "C" bool Avx512CalcValues_(double* c, const double* a, const double* b, size_t n);

        .code
Avx512CalcValues_ proc

; Validate n and initialize array index i
        xor eax,eax                         ;set error return code (also i = 0)
        test r9,r9                          ;is n == 0?
        jz Done                             ;jump if n is zero

        vxorpd xmm5,xmm5,xmm5               ;xmm5 = 0.0

; Load next a[i] and b[i], calculate val
@@:     vmovsd xmm0,real8 ptr [rdx+rax*8]   ;xmm0 = a[i];
        vmovsd xmm1,real8 ptr [r8+rax*8]    ;xmm1 = b[i];
        vmulsd xmm2,xmm0,xmm1               ;val = a[i] * b[i]

; Calculate c[i] = (val >= 0.0) ? sqrt(val) : val * val
        vcmpsd k1,xmm2,xmm5,CMP_GE          ;k1[0] = 1 if val >= 0.0
        vsqrtsd xmm3{k1}{z},xmm3,xmm2       ;xmm3 = (val > 0.0) ? sqrt(val) : 0.0
        knotw k2,k1                         ;k2[0] = 1 if val < 0.0
        vmulsd xmm4{k2}{z},xmm2,xmm2        ;xmm4 = (val < 0.0) ? val * val : 0.0
        vorpd xmm0,xmm4,xmm3                ;xmm0 = (val >= 0.0) ? sqrt(val) : val * val
        vmovsd real8 ptr [rcx+rax*8],xmm0   ;save result to c[i]

; Update index i and repeat until done
        inc rax                             ;i += 1
        cmp rax,r9
        jl @B
        mov eax,1                          ;set success return code

Done:   ret
Avx512CalcValues_ endp
        end


vcmppd k1,zmm0,real8 bcst [r13],CMP_GE  ;k1 = mask of values >= x_min - UASM doesn't understand what bcst is

Code: [Select]
;-------------------------------------------------
;               Ch13_06.asm
;-------------------------------------------------
; Update col_means and col_counts using next eight columns
        vmovupd zmm0,zmmword ptr [rcx]          ;load next 8 cols of cur row
        vcmppd k1,zmm0,real8 bcst [r13],CMP_GE  ;k1 = mask of values >= x_min
        vmovupd zmm1{k1}{z},zmm0                ;values >= x_min or 0.0
        vaddpd zmm2,zmm1,zmmword ptr [r11]      ;add values to col_means
        vmovupd zmmword ptr [r11],zmm2          ;save updated col_means
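
For comparison, NASM spells the same embedded broadcast with a {1to8} decorator on the memory operand instead of the bcst keyword. A hedged equivalent of the compare above, assuming CMP_GE corresponds to the _CMP_GE_OQ predicate (0x1D) - the actual value comes from cmpequ.asmh:
Code: [Select]
; NASM syntax for the broadcast compare (predicate assumed to be _CMP_GE_OQ = 0x1D)
vcmppd k1, zmm0, [r13]{1to8}, 0x1D    ; broadcast the 64-bit value at [r13] to all 8 lanes, then compare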