Author Topic: Problem  (Read 882 times)

JK

  • Member
  • **
  • Posts: 145
Problem
« on: April 09, 2021, 10:23:23 PM »
Hi nidud,

sorry, but i found another problem with this code:
Code: [Select]
; Frame/ABI options for this repro.
; NOTE(review): reply #7 of this thread recommends placing these below the
; includes and using win64:7 instead of win64:5.
OPTION STACKBASE:RBP    ; presumably selects RBP as the frame base register -- confirm in the Asmc docs
option win64:5          ; bit 2 (value 4) requests 16-byte alignment for locals; meaning of bit 0 not shown here -- confirm
option cstack:on        ; USES registers are pushed before RBP (compare the cstack:on/off listings in reply #6)


include windows.inc
includelib kernel32.lib


.code
testit proc uses rbx, y1:RECT, d4:real4, d8:real8, d10:real10, x:dword, z:qword, y:RECT
;***************************************************************************
; Callee half of the repro. It reads a few of its seven arguments and then
; takes the address of arguments and locals, so that the frame offsets the
; assembler picked can be compared in a debugger or in the disassembly.
;
; NOTE(review): rsi and rdi are written below but only rbx is in the USES
; list; both are callee-saved in the Microsoft x64 convention. Harmless for
; this repro (the caller reloads them), but confirm before reusing the code.
;***************************************************************************
    local   tx:dword
    local   ty:RECT
    local   tz:dword

    int     3                           ; breakpoint: stop in the debugger on entry

    mov     eax, x                      ; read argument 5
    mov     eax, d4                     ; read argument 2 (32 bits of the real4)

    mov     rax, y1                     ; argument 1; used as a pointer to a RECT below
;   mov     rax, [rax]                  ; (disabled variants of the field access)
;   mov     eax, RECT [rax].left

    mov     eax, (RECT ptr [rax]).left  ; .left field of the RECT that rax points to

    lea     rax, x                      ; addresses of arguments 5, 2, 3 and 4
    lea     rdx, d4
    lea     rsi, d8
    lea     rdi, d10

    lea     rax, tx                     ; addresses of the three locals
    lea     rbx, ty
    lea     rcx, tz

    ret

testit endp


;*************************************************************************************


start proc uses rbx rsi rdi ;r15
;***************************************************************************
; Caller half of the repro and program entry point. Initialises a few locals,
; passes a mix of struct, floating-point and integer arguments to testit,
; then takes the address of every local and exits the process.
;***************************************************************************
    local   x:dword
    local   y:RECT
    local   z:qword
    local   d4:real4
    local   d8:real8
    local   d10:real10

    int     3                           ; breakpoint: stop in the debugger on entry

    mov     x, 16
    mov     z, 1
    mov     rax, -1                     ; NOTE(review): purpose not evident from this code
    mov     y.left, 227

    ; d8 and d10 are passed without being initialised; argument 2 is an
    ; immediate, so the local d4 is not passed at all.
    invoke  testit, y, 1.1, d8, d10, x, z, y

    lea     rax, x                      ; addresses of all six locals
    lea     rbx, y
    lea     rcx, z
    lea     rdx, d4
    lea     rsi, d8
    lea     rdi, d10

    invoke  ExitProcess, NULL
    ret                                 ; NOTE(review): presumably never reached -- ExitProcess is not expected to return

start endp


end start

Arguments 5 and 7 get overwritten, and the addresses of the arguments in the called procedure are not correct; see the disassembly:

Code: [Select]
;000000013F491000 | 4C:894C24 38             | mov qword ptr ss:[rsp+38],r9      | overwrite arg 7, should be rsp+20
;000000013F491005 | 66:0FD65424 28           | movq qword ptr ss:[rsp+28],xmm2   | overwrite arg 5  should be rsp+18
;000000013F49100B | 66:0F7E4C24 18           | movd dword ptr ss:[rsp+18],xmm1   |                  should be rsp+10
;000000013F491011 | 48:894C24 08             | mov qword ptr ss:[rsp+8],rcx      | arg = rsp + 28
;000000013F491016 | 53                       | push rbx                          | rsp = rsp - 8
;000000013F491017 | 55                       | push rbp                          | rsp = rsp - 8 -> arg 5 = rsp - 38
;000000013F491018 | 48:8BEC                  | mov rbp,rsp                       | arg 5 = rbp - 38
;000000013F49101B | 48:83EC 20               | sub rsp,20                        |
;000000013F49101F | CC                       | int3                              |
;000000013F491020 | 8B45 48                  | mov eax,dword ptr ss:[rbp+48]     | arg 5, should be rbp - 38
;000000013F491023 | 8B45 20                  | mov eax,dword ptr ss:[rbp+20]     |
;000000013F491026 | 48:8B45 10               | mov rax,qword ptr ss:[rbp+10]     |
;000000013F49102A | 8B00                     | mov eax,dword ptr ds:[rax]        |
;000000013F49102C | 48:8D45 48               | lea rax,qword ptr ss:[rbp+48]     |
;000000013F491030 | 48:8D55 20               | lea rdx,qword ptr ss:[rbp+20]     |
;000000013F491034 | 48:8D75 30               | lea rsi,qword ptr ss:[rbp+30]     |
;000000013F491038 | 48:8D7D 40               | lea rdi,qword ptr ss:[rbp+40]     |
;000000013F49103C | 48:8D45 FC               | lea rax,qword ptr ss:[rbp-4]      |
;000000013F491040 | 48:8D5D E8               | lea rbx,qword ptr ss:[rbp-18]     |
;000000013F491044 | 48:8D4D E4               | lea rcx,qword ptr ss:[rbp-1C]     |
;000000013F491048 | C9                       | leave                             |
;000000013F491049 | 5B                       | pop rbx                           |
;000000013F49104A | C3                       | ret                               |
;000000013F49104B | 53                       | push rbx                          |
;000000013F49104C | 56                       | push rsi                          |
;000000013F49104D | 57                       | push rdi                          |
;000000013F49104E | 55                       | push rbp                          |
;000000013F49104F | 48:8BEC                  | mov rbp,rsp                       |
;000000013F491052 | 48:83EC 40               | sub rsp,40                        | should be 48h for correct RSP alignment
;000000013F491056 | CC                       | int3                              |
;000000013F491057 | C745 FC 10000000         | mov dword ptr ss:[rbp-4],10       |
;000000013F49105E | 48:C745 E0 01000000      | mov qword ptr ss:[rbp-20],1       |
;000000013F491066 | 48:C7C0 FFFFFFFF         | mov rax,FFFFFFFFFFFFFFFF          |
;000000013F49106D | C745 E8 E3000000         | mov dword ptr ss:[rbp-18],E3      |
;000000013F491074 | 48:83EC 48               | sub rsp,48                        | why 48? 7x8 = 38h, should be 40h for alignment
;000000013F491078 | 48:8D45 E8               | lea rax,qword ptr ss:[rbp-18]     | arg 7
;000000013F49107C | 48:894424 30             | mov qword ptr ss:[rsp+30],rax     |
;000000013F491081 | 48:8B45 E0               | mov rax,qword ptr ss:[rbp-20]     | arg 6
;000000013F491085 | 48:894424 28             | mov qword ptr ss:[rsp+28],rax     |
;000000013F49108A | 8B45 FC                  | mov eax,dword ptr ss:[rbp-4]      | arg 5
;000000013F49108D | 894424 20                | mov dword ptr ss:[rsp+20],eax     | -> rsp+20
;000000013F491091 | 4C:8D4D C0               | lea r9,qword ptr ss:[rbp-40]      | arg 4
;000000013F491095 | F3:0F7E55 D0             | movq xmm2,qword ptr ss:[rbp-30]   | arg 3
;000000013F49109A | 66:0F6E0D 5E1F0000       | movd xmm1,dword ptr ds:[13F493000 | arg 2
;000000013F4910A2 | 48:8D4D E8               | lea rcx,qword ptr ss:[rbp-18]     | arg 1
;000000013F4910A6 | E8 55FFFFFF              | call asmc_test2_64.13F491000      | rsp = rsp - 8
;000000013F4910AB | 48:83C4 48               | add rsp,48                        |
;000000013F4910AF | 48:8D45 FC               | lea rax,qword ptr ss:[rbp-4]      |
;000000013F4910B3 | 48:8D5D E8               | lea rbx,qword ptr ss:[rbp-18]     |
;000000013F4910B7 | 48:8D4D E0               | lea rcx,qword ptr ss:[rbp-20]     |
;000000013F4910BB | 48:8D55 DC               | lea rdx,qword ptr ss:[rbp-24]     |
;000000013F4910BF | 48:8D75 D0               | lea rsi,qword ptr ss:[rbp-30]     |
;000000013F4910C3 | 48:8D7D C0               | lea rdi,qword ptr ss:[rbp-40]     |
;000000013F4910C7 | 48:83EC 20               | sub rsp,20                        |
;000000013F4910CB | 33C9                     | xor ecx,ecx                       |
;000000013F4910CD | FF15 2D0F0000            | call qword ptr ds:[<&RtlExitUserP |
;000000013F4910D3 | 48:83C4 20               | add rsp,20                        |
;000000013F4910D7 | C9                       | leave                             |
;000000013F4910D8 | 5F                       | pop rdi                           |
;000000013F4910D9 | 5E                       | pop rsi                           |
;000000013F4910DA | 5B                       | pop rbx                           |
;000000013F4910DB | C3                       | ret                               |

It seems arguments in the called procedure are off by 10h in general.

Regarding RSP always being 16-byte aligned: it would be enough to align RSP when reserving space for locals at procedure entry and each time space is reserved for a called procedure. In the latter case "value" (sub RSP, value) only needs to be rounded up to the next 16-byte boundary. At procedure entry, the registers to save and the space for locals must be taken into account. Otherwise I would have to code "and RSP,-16" right after each procedure entry and use a wrapper for INVOKE like this:
Code: [Select]
; _call: INVOKE wrapper that reserves 8 extra bytes of stack around calls
; with four or more arguments and forwards all other calls unchanged.
;   pname - procedure to invoke (required)
;   args  - argument list, handed to INVOKE exactly as written
_call macro pname:req, args:VARARG
    LOCAL nargs
    nargs = 0
    FOR elem, <args>
        nargs = nargs + 1
    ENDM

    if nargs lt 4
        invoke pname, args
    else
        sub rsp, 8
        invoke pname, args
        add rsp, 8
    endif
endm

adding "sub RSP, 8  ... add RSP, 8" where necessary. Note: this relies on the fact that ASMC always reserves an amount ending in 8h if the number of arguments exceeds 3. This seems strange, because to my understanding it is not necessary, but ASMC does in fact generate it this way.


JK

nidud

  • Member
  • *****
  • Posts: 2242
    • https://github.com/nidud/asmc
Re: Problem
« Reply #1 on: April 10, 2021, 02:49:54 AM »
I had to revise the changes made to REAL10, as they mess up the whole rule-based float implementation: all floats in 64-bit use SIMD registers. REAL10 is something of a misfit, as it extends beyond the regular 8-byte size.

If one of the parameters exceeds this size, the shadow space needs to double up, so it has to be like this:

Type     Size  Reg(1)  Stack
--------------------------------------------
REAL2       2  XMM0        8
REAL4       4  XMM0        8
REAL8       8  XMM0        8
REAL10     10  XMM0       16
REAL16     16  XMM0       16
YWORD      32  YMM0       32
ZWORD      64  ZMM0       64

So REAL10 has to trigger the chunk size in the same way as REAL16. Also notice that Asmc uses RBP to access arguments by default (same as Masm).

    option win64:1

    .code

; Demonstration proc with three floating-point parameters of different sizes
; (real2, real4, real10). It only takes the address of each parameter, so the
; frame offsets assigned by the assembler can be read from the listing.
foo proc a1:real2, a2:real4, a3:real10

    lea rax,a1      ; address of parameter 1
    lea rax,a2      ; address of parameter 2
    lea rax,a3      ; address of parameter 3 (the real10, larger than 8 bytes)
    ret

foo endp

        movaps  xmmword ptr [rsp+28H], xmm2
        movd    dword ptr [rsp+18H], xmm1
        movd    dword ptr [rsp+8H], xmm0
        push    rbp
        mov     rbp, rsp
        lea     rax, [rbp+10H]
        lea     rax, [rbp+20H]
        lea     rax, [rbp+30H]
        leave
        ret

Quote
Arguments 5 and 7 get overwritten, and the addresses of the arguments in the called procedure are not correct; see the disassembly:

Yes, this was wrongly calculated (using 10), so it has to follow the same logic as above. Rendering a call to this also means the allocated shadow space needs to be extended from 32 to 64.

; Caller for foo: passes three floating-point immediates using the
; C-style call syntax.
main proc

    foo(1.0, 2.0, 3.0)
    ret

main endp

        sub     rsp, 64
        movaps  xmm2, xmmword ptr [F0000]
        movd    xmm1, dword ptr [F0001]
        mov     ax, 15360
        movd    xmm0, eax
        call    foo
        add     rsp, 64
        ret

However, without the auto option the stack will not be aligned so this will fail.

    option win64:3

        sub     rsp, 72
        movaps  xmm2, xmmword ptr [F0000]
        movd    xmm1, dword ptr [F0001]
        mov     ax, 15360
        movd    xmm0, eax
        call    foo
        add     rsp, 72
        ret

JK

  • Member
  • **
  • Posts: 145
Re: Problem
« Reply #2 on: April 10, 2021, 04:40:55 AM »
Thanks for again looking after this!

Neither MS (https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-160#parameter-passing):
Quote
Any floating-point and double-precision arguments in the first four parameters are passed in XMM0 - XMM3, depending on position. Floating-point values are only placed in the integer registers RCX, RDX, R8, and R9 when there are varargs arguments. For details, see Varargs. Similarly, the XMM0 - XMM3 registers are ignored when the corresponding argument is an integer or pointer type.

__m128 types, arrays, and strings are never passed by immediate value. Instead, a pointer is passed to memory allocated by the caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64 types, are passed as if they were integers of the same size. Structs or unions of other sizes are passed as a pointer to memory allocated by the caller. For these aggregate types passed as a pointer, including __m128, the caller-allocated temporary memory must be 16-byte aligned.

nor Agner Fog's Calling Conventions manual (page 18):
Quote
Parameters in registers specifies which registers are used for transferring parameters. ecx,
edx means that the first parameter goes into ecx, the second parameter goes into edx, and
subsequent parameters are stored on the stack. Parameter types that do not fit into the
registers are stored on the stack. In general, all integer types, bool, enum and pointers can
be transferred in the general purpose registers. References are treated as identical to
pointers in all respects. Arrays are transferred as pointers. Float and double types are
transferred in XMM registers in 64 bit mode, otherwise on the stack. Long doubles,
structures, classes and unions may be transferred on the stack or through pointers if they
do not fit into registers

seems absolutely clear about it. While the last sentence of Agner Fog's text implies that long doubles "may be" transferred in registers, MS doesn't even mention long doubles. But it makes sense that every floating-point number that actually fits into an XMM register should be passed in such a register instead of through a pointer, even if its size exceeds 8 bytes.


Quote
However, without the auto option the stack will not be aligned so this will fail.
So, what about taking RSP alignment into account when calculating the space needed for a call (sub rsp,64 vs. sub rsp,72)? Both values (64 and 72) are calculated somehow, would it be much of an effort to always calculate it to a value, which ensures correct stack alignment? Correct stack alignment can never be wrong!


JK



HSE

  • Member
  • *****
  • Posts: 1765
  • <AMD>< 7-32>
Re: Problem
« Reply #3 on: April 10, 2021, 05:55:40 AM »
 :biggrin: M$ is very clear from the beginning :
Code: [Select]
Any argument that doesn't fit in 8 bytes, or isn't 1, 2, 4, or 8 bytes, must be passed by reference

nidud

  • Member
  • *****
  • Posts: 2242
    • https://github.com/nidud/asmc
Re: Problem
« Reply #4 on: April 10, 2021, 06:01:06 AM »
As for Microsoft, they don't support long double at all, as it's just defined as double in the header files. Asmc uses REAL16 internally for all float operations and also supports this natively.

MS has another calling convention for vectors (VECTORCALL) that is similar to FASTCALL but with 6 SIMD registers and a 96-byte shadow space. This will normally be used in situations like the one above.
   
quadmath.inc

Immediate values passed to INVOKE are created in the CONST/DATA segment. The two above will be rendered like this:

.data
 align 16
 F0000 label real10
 oword 0x0000000000004000C000000000000000
 align 4
 F0001 dd 0x40000000

Quote
So, what about taking RSP alignment into account when calculating the space needed for a call (sub rsp,64 vs. sub rsp,72)? Both values (64 and 72) are calculated somehow, would it be much of an effort to always calculate it to a value, which ensures correct stack alignment? Correct stack alignment can never be wrong!

The idea here is that the prologue is handled manually and INVOKE handle the calls individually by subtracting/adding the stack for each call.

    push    rbp
    mov     rbp,rsp
    sub     rsp,64
    ...
    call    foo
    add     rsp,64
    sub     rsp,32
    ...
    call    bar
    add     rsp,32
    ...
    leave
    ret

As there may be hundreds of calls within one PROC/ENDP, it's more rational to do this only once for all of them.

JK

  • Member
  • **
  • Posts: 145
Re: Problem
« Reply #5 on: April 10, 2021, 08:43:38 AM »
Sorry to say that, but it still overwrites argument 5 and following. And even with option auto it crashes at:
Code: [Select]
movaps xmm3,xmmword ptr ss:[rbp-3A] |with R15 included in the "uses" clause.



nidud

  • Member
  • *****
  • Posts: 2242
    • https://github.com/nidud/asmc
Re: Problem
« Reply #6 on: April 10, 2021, 08:25:57 PM »
Quote
16-byte Alignment for Local Stack Variables [bit 2]:
0: standard 8-byte alignment for local variables.
1: 16-byte alignment for local variables. This setting is useful if you want to load or store XMM registers with instructions that expect aligned memory references ( i.e. MOVAPS ). Note that variables with size < 16 are not affected.

So to the sample code.

    option win64:7

    .code

; Local-variable layout test: declares locals of 4, 16, 8, 4, 8 and 16 bytes
; and references each one, so the frame offset chosen for every local shows up
; in the listing. rsi, rdi and rbx are listed in USES only to exercise the
; register-save part of the prologue; the body does not touch them.
main proc uses rsi rdi rbx

  local b:dword
  local o:RECT
  local d:qword
  local f:real4
  local r:real8
  local q:real16

    lea rax,b
    lea rax,o
    lea rax,d
    lea rax,f
    lea rax,r
    movaps xmm0,q   ; aligned 16-byte load: requires q to be 16-byte aligned
    ret

main endp

    end main

option cstack:off

        push    rbp
        mov     rbp, rsp
        push    rsi
        push    rdi
        push    rbx
        sub     rsp, 104
        lea     rax, [rbp-1CH] ; 4
        lea     rax, [rbp-30H] ; 16
        lea     rax, [rbp-38H] ; 8
        lea     rax, [rbp-3CH] ; 4
        lea     rax, [rbp-48H] ; 8
        movaps  xmm0,[rbp-60H] ; 16
        add     rsp, 104
        pop     rbx
        pop     rdi
        pop     rsi
        leave
        ret

option cstack:on

        push    rsi
        push    rdi
        push    rbx
        push    rbp
        mov     rbp, rsp
        sub     rsp, 104
        lea     rax, [rbp-4H]  ; 4
        lea     rax, [rbp-18H] ; 16
        lea     rax, [rbp-20H] ; 8
        lea     rax, [rbp-24H] ; 4
        lea     rax, [rbp-30H] ; 8
        movaps  xmm0,[rbp-48H] ; 16
        leave
        pop     rbx
        pop     rdi
        pop     rsi
        ret

So this works, so let's try to mess things up a bit.

; Seven-parameter test mixing RECT, floating-point (including a real16) and
; integer parameters, plus three locals. It takes the address of each local
; and then of each parameter, so every frame offset appears in the listing.
foo proc uses rbx a:RECT, b:real4, c:real8, d:real16, e:dword, f:qword, g:RECT

  local x:dword
  local y:RECT
  local z:dword

    ; locals
    lea rax,x
    lea rax,y
    lea rax,z

    ; parameters 1-4 (register-passed in the listing below)
    lea rax,a
    lea rax,b
    lea rax,c
    lea rax,d
    ; parameters 5-7 (stack-passed)
    lea rax,e
    lea rax,f
    lea rax,g
    ret
foo endp

        movaps  [rsp+38H], xmm3 ; d real16
        movq    [rsp+28H], xmm2 ; c real8
        movd    [rsp+18H], xmm1 ; b real4
        mov     [rsp+8H], rcx   ; a RECT
        push    rbp
        mov     rbp, rsp
        push    rbx
        sub     rsp, 72
        lea     rax, [rbp-0CH]  ; x 4
        lea     rax, [rbp-20H]  ; y 16
        lea     rax, [rbp-24H]  ; z 4
        lea     rax, [rbp+10H]  ; a
        lea     rax, [rbp+20H]  ; b
        lea     rax, [rbp+30H]  ; c
        lea     rax, [rbp+40H]  ; d
        lea     rax, [rbp+40H]  ; e [rbp+50H]
        lea     rax, [rbp+48H]  ; f [rbp+60H]
        lea     rax, [rbp+50H]  ; g [rbp+70H]
        leave
        pop     rbx
        ret

So FASTCALL fails on the extended stack (as expected), but so does VECTORCALL, so there is definitely something wrong there.

        lea     rax, [rbp+10H]
        lea     rax, [rbp+20H]
        lea     rax, [rbp+30H]
        lea     rax, [rbp+40H]
        lea     rax, [rbp+50H]
        lea     rax, [rbp+60H]
        lea     rax, [rbp+50H]

nidud

  • Member
  • *****
  • Posts: 2242
    • https://github.com/nidud/asmc
Re: Problem
« Reply #7 on: April 11, 2021, 11:50:15 PM »
Cleaned up some of the stack issues above and added a stress-test for RBP/RSP frames with extended stack-chunk size.

asmc64 test.asm
objconv -fasm test.obj _test.asm

Code: [Select]

    ; Extended stack size

    option frame:auto

    ; RECT: four consecutive dword fields (16 bytes in total), defined inline
    ; because this test file includes no headers.
    RECT    struc
    left    dd ?
    top     dd ?
    right   dd ?
    bottom  dd ?
    RECT    ends

; type2: defines a two-parameter test proc named <type>_<id>_2<frame>.
;   id    - tag naming the option combination in effect when the macro is expanded
;   type  - calling-convention keyword placed after PROC (makeid passes
;           fastcall or vectorcall)
;   frame - optional; makeid passes either nothing or the word frame, which
;           is inserted after the convention and appended to the proc name
; The generated proc takes a RECT and a real4 and only takes the address of
; each, so the parameter offsets can be read from the generated code.
type2 macro id, type, frame
&type&_&id&_2&frame& proc type frame uses rbx a:RECT, b:real4
    lea rax,a
    lea rax,b
    ret
&type&_&id&_2&frame& endp
    endm

; type7: defines a seven-parameter test proc named <type>_<id>_7<frame>.
;   id    - tag naming the option combination in effect when the macro is expanded
;   type  - calling-convention keyword placed after PROC (makeid passes
;           fastcall or vectorcall)
;   frame - optional; makeid passes either nothing or the word frame, which
;           is inserted after the convention and appended to the proc name
; The parameter list mixes RECT, real4, real8, real16, dword and qword; the
; body only takes the address of each parameter in declaration order.
type7 macro id, type, frame
&type&_&id&_7&frame& proc type frame uses rbx a:RECT, b:real4, c:real8, d:real16, e:dword, f:qword, g:RECT
    lea rax,a
    lea rax,b
    lea rax,c
    lea rax,d
    lea rax,e
    lea rax,f
    lea rax,g
    ret
&type&_&id&_7&frame& endp
    endm

; makeid: instantiates all eight test procs for one id -- the 2- and
; 7-parameter variants, each as fastcall and as vectorcall, each with and
; without the frame argument. The procs are assembled under whatever options
; are active at the point where makeid is expanded.
makeid macro id
    type2 id, fastcall
    type2 id, fastcall, frame
    type2 id, vectorcall
    type2 id, vectorcall, frame
    type7 id, fastcall
    type7 id, fastcall, frame
    type7 id, vectorcall
    type7 id, vectorcall, frame
    endm

; callid: emits one call to each of the eight procs that makeid generates for
; the same id. The argument names a..f are resolved where the macro is
; expanded; the seven-parameter calls pass a twice (first and last argument).
callid macro id
    fastcall_&id&_2(a, b)
    fastcall_&id&_2frame(a, b)
    vectorcall_&id&_2(a, b)
    vectorcall_&id&_2frame(a, b)
    fastcall_&id&_7(a, b, c, d, e, f, a)
    fastcall_&id&_7frame(a, b, c, d, e, f, a)
    vectorcall_&id&_7(a, b, c, d, e, f, a)
    vectorcall_&id&_7frame(a, b, c, d, e, f, a)
    endm

    .code

    ; Each group below selects one option combination and then instantiates
    ; the full set of test procs under an id of the form
    ; <base register>_<win64 value>_<cstack setting>.
    ; NOTE(review): the ids suggest win64:1 corresponds to "save noauto" and
    ; win64:3 to "save auto" -- confirm against the Asmc documentation.

    option win64:1
    option cstack:off
    makeid rbp_1_0
    option cstack:on
    makeid rbp_1_1

    option win64:3
    option cstack:off
    makeid rbp_3_0
    option cstack:on
    makeid rbp_3_1

    option win64:rsp save noauto
    option cstack:off
    makeid rsp_1_0
    option cstack:on
    makeid rsp_1_1

    option win64:rsp save auto
    option cstack:off
    makeid rsp_3_0
    option cstack:on
    makeid rsp_3_1

    ; options in effect for main, which follows
    option win64:rbp auto save align

; Test driver: declares one local per argument type, takes the address of each
; local, then expands callid once per option-combination id so that every
; proc generated by makeid is called.
main proc

  local a:RECT, b:real4, c:real8, d:real16, e:dword, f:qword

    for q,<a,b,c,d,e,f>
        lea rax,q
        endm
    for i,<rbp_1_0,rbp_1_1,rbp_3_0,rbp_3_1,rsp_1_0,rsp_1_1,rsp_3_0,rsp_3_1>
        callid i
        endm
    ret

main endp

    end

The sample should work now, provided a few changes are made.

- change the order of the options:

include windows.inc
includelib kernel32.lib

OPTION STACKBASE:RBP
option win64:7
option cstack:on

- use real16 as local storage

local d10:real16

Code: [Select]
testit  PROC
        movaps  xmmword ptr [rsp+38H], xmm3
        movq    qword ptr [rsp+28H], xmm2
        movd    dword ptr [rsp+18H], xmm1
        mov     qword ptr [rsp+8H], rcx
        push    rbx
        push    rbp
        mov     rbp, rsp
        sub     rsp, 72
        mov     eax, dword ptr [rbp+58H]
        mov     eax, dword ptr [rbp+28H]
        mov     rax, qword ptr [rbp+18H]
        mov     eax, dword ptr [rax]
        lea     rax, [rbp+58H]
        lea     rdx, [rbp+28H]
        lea     rsi, [rbp+38H]
        lea     rdi, [rbp+48H]
        lea     rax, [rbp-4H]
        lea     rbx, [rbp-18H]
        lea     rcx, [rbp-1CH]
        leave
        pop     rbx
        ret
testit  ENDP

start   PROC
        push    rbx
        push    rsi
        push    rdi
        push    rbp
        mov     rbp, rsp
        sub     rsp, 168
        mov     dword ptr [rbp-4H], 16
        mov     qword ptr [rbp-20H], 1
        mov     rax, -1
        mov     dword ptr [rbp-18H], 227
        lea     rax, [rbp-18H]
        mov     qword ptr [rsp+60H], rax
        mov     rax, qword ptr [rbp-20H]
        mov     qword ptr [rsp+50H], rax
        mov     eax, dword ptr [rbp-4H]
        mov     dword ptr [rsp+40H], eax
        movaps  xmm3, xmmword ptr [rbp-48H]
        movq    xmm2, qword ptr [rbp-30H]
        movd    xmm1, dword ptr [F0000]
        lea     rcx, [rbp-18H]
        call    testit
        lea     rax, [rbp-4H]
        lea     rbx, [rbp-18H]
        lea     rcx, [rbp-20H]
        lea     rdx, [rbp-24H]
        lea     rsi, [rbp-30H]
        lea     rdi, [rbp-48H]
        xor     ecx, ecx
        call    ExitProcess
        leave
        pop     rdi
        pop     rsi
        pop     rbx
        ret
start   ENDP

JK

  • Member
  • **
  • Posts: 145
Re: Problem
« Reply #8 on: April 12, 2021, 12:32:59 AM »
Thanks nidud!

Indeed, with a REAL16 it works now. Maybe you should align a REAL10 just like a REAL16 on the stack; then it wouldn't fail with a REAL10. But I can live with REAL16.


JK