This is what I get with 64-bit MASM using a custom prologue/epilogue. The entry/exit code is small, and for high-level code it's easily fast enough. For low-level code you don't use a stack frame.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; entry_point - demonstration procedure for the prologue/epilogue discussed
; in this post. It declares four QWORD locals, stores the constants 1..4
; into them, then calls ExitProcess with rcx zeroed.
; The disassembly further down in this file shows the code generated for
; this procedure: the frame set-up/tear-down and the locals placed at
; [rbp-0x68] .. [rbp-0x80].
entry_point proc
; Four 8-byte locals; their only purpose here is to make the procedure
; need a stack frame.
LOCAL a1 :QWORD
LOCAL a2 :QWORD
LOCAL a3 :QWORD
LOCAL a4 :QWORD
; Write a distinct constant to each local so every slot is easy to spot
; in the disassembly.
mov a1, 1
mov a2, 2
mov a3, 3
mov a4, 4
; rcx carries the first integer argument under the Microsoft x64 calling
; convention, so ExitProcess is called with 0.
xor rcx, rcx
call ExitProcess
; NOTE(review): ExitProcess is not expected to return, so this ret should
; never execute - confirm. The listing shows the epilogue (`leave`)
; emitted immediately ahead of it.
ret
entry_point endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end
comment * +++++++++++++++++++++++++++
segment .text
enter 0x80, 0x0
sub rsp, 0x80
mov qword ptr [rbp-0x68], 0x1
mov qword ptr [rbp-0x70], 2
mov qword ptr [rbp-0x78], 3
mov qword ptr [rbp-0x80], 4
xor rcx, rcx
call qword ptr [ExitProcess]
leave
ret
* +++++++++++++++++++++++++++++++++++
The disassembly in detail.
.text:0000000140001000 C8800000 enter 0x80, 0x0
.text:0000000140001004 4881EC80000000 sub rsp, 0x80
.text:000000014000100b 48C7459801000000 mov qword ptr [rbp-0x68], 0x1
.text:0000000140001013 48C7459002000000 mov qword ptr [rbp-0x70], 2
.text:000000014000101b 48C7458803000000 mov qword ptr [rbp-0x78], 3
.text:0000000140001023 48C7458004000000 mov qword ptr [rbp-0x80], 4
.text:000000014000102b 4833C9 xor rcx, rcx
.text:000000014000102e FF1560100000 call qword ptr [ExitProcess]
.text:0000000140001034 C9 leave
.text:0000000140001035 C3 ret