News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

ASM alloca/_alloca

Started by aw27, May 19, 2017, 04:25:42 PM

Previous topic - Next topic

aw27

alloca/_alloca is a C/C++ function that allocates space in the stack of the caller.
alloca/_alloca is extremely fast when compared with the heap allocation functions.
Of course, alloca/_alloca is not suitable for large allocations.
The space allocated by alloca/_alloca does not need to be freed, it is released on function exit.

What I present here is a demo that includes an _alloca 32-bit ASM function with alignment.
As you can see, this _alloca depends on the calling convention because it is not inlined.
The advantage of not being inlined is that it can be called from high-level languages that do not have an alloca function (Delphi, for example).

The demo is short but fairly advanced (in requirements  :lol:) and the only assembler up to the task I could find is HASM.
The reason is that no other has reliable support for alignment above 16-byte, good support for AVX instructions and good support for calling conventions.

Here is the demo:


; Instruction sets enabled for this module: .686 (Pentium Pro level) and
; .XMM (SIMD registers/instructions).
.686
.XMM

; Calling-convention selector. Leave exactly one of these symbols defined:
; it chooses both the .MODEL language type below and which _alloca
; implementation gets assembled later in the file.
;_BORL=1
;_CDEC=1
;_STD=1
;_FAST=1
_PASC=1


; Map the selected symbol to the default language type of the module.
; If none of the symbols is defined, no .MODEL directive is emitted at all.
IFDEF _BORL
.MODEL FLAT, BORLAND
ELSEIFDEF _CDEC
.MODEL FLAT, C
ELSEIFDEF _STD
.MODEL FLAT, STDCALL
ELSEIFDEF _FAST
.MODEL FLAT, FASTCALL
ELSEIFDEF _PASC
.MODEL FLAT, PASCAL
ENDIF

; Identifiers are case-sensitive from here on.
OPTION CASEMAP:NONE


; External functions. Each prototype carries an explicit language type (C or
; STDCALL), so it does not depend on the .MODEL default chosen above.
; NOTE(review): OPTION DLLIMPORT seems to take effect only when the assembler
; writes the PE file itself (the author built with "-pe"); a reader who linked
; separately had to add includelib lines for msvcrt/kernel32 - confirm for
; your tool chain.
option dllimport:<msvcrt.dll>
printf PROTO C arg1:Ptr Byte, printlist: VARARG
getchar PROTO C
option dllimport:<kernel32.dll>
ExitProcess PROTO STDCALL :dword

; Reals8ToYmm(par1, par2, par3, par4)
; Macro function: stores the four REAL8 constants as one 32-byte aligned
; item in the data segment and expands to the name of that item, so it can
; be used directly as a YMMWORD memory operand.
; Side effect: assembly always continues in the code segment afterwards,
; whatever segment was open when the macro was invoked.
Reals8ToYmm MACRO par1, par2, par3, par4
    LOCAL packedQuad
    .DATA
    ALIGN 32
packedQuad REAL8 par1, par2, par3, par4
    .CODE
    EXITM <packedQuad>
ENDM

; printf format strings (CR/LF and NUL terminated). Each one consumes four
; REAL8 arguments.
; Fixed: value2 used "%.3lff", which printed a literal 'f' after the second
; number, and its "is :" spacing did not match value1.
.data
value1 db "Value 1 is: double1 %.3lf double2 %.3lf double3 %.3lf double4 %.3lf",13,10,0
value2 db "Value 2 is: double1 %.3lf double2 %.3lf double3 %.3lf double4 %.3lf",13,10,0
result db "Result is double1 %lf double2 %lf double3 %lf double4 %lf",13,10,0

.code

; Step used when walking the stack downwards; 4096 is the x86 page size.
ALLOCA_PAGE_BYTES EQU 4096

;-----------------------------------------------------------------------
; AllocaReserve sizeOp, alignOp
;
; Lowers ESP to ((ESP - size) AND -alignment) and reads one dword in every
; page it passes on the way down, finishing with a read at the new ESP.
;
;   sizeOp   register holding the requested byte count (destroyed)
;   alignOp  register holding the alignment            (destroyed; reused
;            as scratch once the mask has been applied)
;
; Why the page walk: the stack is committed on demand through a guard page
; directly below the committed part. Dropping ESP by more than a page in a
; single step and then writing to the new block can land beyond that guard
; page, which faults instead of growing the stack. Touching the pages in
; descending order lets the stack grow one page at a time.
;
; Requirements / limits:
;   * expand it only after the return address has been popped, so ESP
;     points at the caller's own data;
;   * alignment must be a non-zero power of two (use at least 4 so ESP
;     stays dword aligned);
;   * a size larger than the numeric value of ESP wraps around and is not
;     detected.
;-----------------------------------------------------------------------
AllocaReserve MACRO sizeOp:REQ, alignOp:REQ
    LOCAL touchNext, touchLast
    neg     alignOp                             ; -alignment = mask that clears the low bits
    neg     sizeOp
    add     sizeOp, esp                         ; sizeOp = ESP - size
    and     sizeOp, alignOp                     ; sizeOp = final, aligned ESP
touchNext:
    lea     alignOp, [esp - ALLOCA_PAGE_BYTES]  ; candidate ESP one page further down
    cmp     alignOp, sizeOp
    jb      touchLast                           ; unsigned: would overshoot the target
    mov     esp, alignOp
    test    dword ptr [esp], esp                ; read-only touch of this page
    jmp     touchNext
touchLast:
    mov     esp, sizeOp
    test    dword ptr [esp], esp                ; make sure the block's lowest page is touched too
ENDM

;-----------------------------------------------------------------------
; _alloca(thesize, alignm)
;
; Reserves thesize bytes on the CALLER's stack, aligned to alignm bytes.
;
;   In:    thesize = number of bytes to reserve
;          alignm  = alignment in bytes (non-zero power of two)
;   Out:   EAX = address of the block; on return to the caller ESP equals
;          that address (for the C variant: after the caller has removed
;          its two argument slots)
;   Clobb: ECX, EDX, flags
;
; The block is never freed explicitly. It disappears when the caller
; restores ESP from its frame pointer, so the caller must address its
; locals through EBP and must not rely on ESP being unchanged across the
; call.
;
; One variant exists per calling convention because the routine has to
; remove or keep the arguments exactly as INVOKE expects for that
; convention. No prologue/epilogue is generated: the routine manipulates
; the return address and ESP by hand.
;-----------------------------------------------------------------------
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

IFDEF _BORL
_alloca proc public thesize:dword, alignm:dword
    pop     ecx                     ; return address; arguments arrive in registers
    AllocaReserve thesize, alignm
    mov     eax, esp                ; result = start of the block
    push    ecx                     ; put the return address back on top of the stack
    ret
_alloca endp

ELSEIFDEF _FAST
_alloca proc public thesize:dword, alignm:dword
    pop     eax                     ; return address (ECX is occupied by an argument)
    AllocaReserve thesize, alignm
    mov     ecx, eax                ; arguments are consumed: move the return address aside
    mov     eax, esp                ; result = start of the block
    push    ecx                     ; put the return address back on top of the stack
    ret
_alloca endp

ELSEIFDEF _CDEC
_alloca proc public thesize:dword, alignm:dword
    pop     ecx                     ; return address
    pop     eax                     ; thesize
    pop     edx                     ; alignm
    AllocaReserve eax, edx
    mov     eax, esp                ; result = start of the block
    push    edx                     ; two placeholder dwords: the caller removes its
    push    edx                     ; argument slots after the call; the values are unused
    push    ecx                     ; put the return address back on top of the stack
    ret
_alloca endp

ELSEIFDEF _STD
_alloca proc public thesize:dword, alignm:dword
    pop     ecx                     ; return address
    pop     eax                     ; thesize
    pop     edx                     ; alignm
    AllocaReserve eax, edx
    mov     eax, esp                ; result = start of the block
    push    ecx                     ; put the return address back on top of the stack
    ret                             ; arguments were already removed by the pops above
_alloca endp

ELSEIFDEF _PASC
_alloca proc public thesize:dword, alignm:dword
    pop     ecx                     ; return address
    pop     eax                     ; alignm  (pushed last, so it is on top)
    pop     edx                     ; thesize
    AllocaReserve edx, eax
    mov     eax, esp                ; result = start of the block
    push    ecx                     ; put the return address back on top of the stack
    ret                             ; arguments were already removed by the pops above
_alloca endp
ENDIF

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

; Calculate sqrt of sum of 2 vectors of 4 doubles
;
; AVXArithFloat(val1, val2, res)
;   val1, val2  pointers to the two input vectors (4 x REAL8 each)
;   res         pointer to the 4 x REAL8 output vector
; All three pointers are accessed with VMOVAPD, so each must be 32-byte
; aligned or the instruction faults.
; Modifies EAX and YMM0-YMM3. Nothing is returned in a register; the result
; is written through res.
AVXArithFloat proc public val1:ptr, val2: ptr, res:ptr
mov eax, val1
vmovapd ymm0,ymmword ptr [eax] ; ymm0 = first input vector
mov eax, val2
vmovapd ymm1,ymmword ptr [eax] ; ymm1 = second input vector

vaddpd ymm2,ymm0,ymm1 ; element-wise sum
vsqrtpd ymm3, ymm2 ; element-wise square root of the sum

mov eax, res
vmovapd ymmword ptr [eax],ymm3 ; store the four results

vzeroupper ; clear the upper YMM halves before returning to non-AVX code

ret
AVXArithFloat endp

YMM_BYTES EQU 32                        ; size, and required alignment, of one YMM vector

; Reserves one YMM-sized, YMM-aligned slot on the stack; its address is in EAX.
NewYmmSlot MACRO
    INVOKE  _alloca, YMM_BYTES, YMM_BYTES
ENDM

; Prints the four REAL8 values found at the address stored in vecPtr,
; using the format string fmt.
ShowVector MACRO fmt:REQ, vecPtr:REQ
    mov     eax, vecPtr
    INVOKE  printf, addr fmt, real8 ptr [eax], real8 ptr [eax+8], real8 ptr [eax+16], real8 ptr [eax+24]
ENDM

; Demo entry point: places two constant vectors in stack slots obtained from
; _alloca, lets AVXArithFloat compute sqrt(a + b) into a third slot, prints
; all three vectors, waits for a key and terminates the process.
start proc

    LOCAL pFirst  : PTR YMMWORD         ; slot holding the first operand
    LOCAL pSecond : PTR YMMWORD         ; slot holding the second operand
    LOCAL pRoots  : PTR YMMWORD         ; slot receiving the result

    ; first operand
    vmovapd ymm6, YMMWORD ptr Reals8ToYmm(771.3101, -101.544, 9221.34, -341.7773)
    NewYmmSlot
    mov     pFirst, eax
    vmovapd ymmword ptr [eax], ymm6     ; constant -> aligned stack slot

    ; second operand
    vmovapd ymm7, YMMWORD ptr Reals8ToYmm(333.3101, 701.544, -348.84, 421.6599)
    NewYmmSlot
    mov     pSecond, eax
    vmovapd ymmword ptr [eax], ymm7     ; constant -> aligned stack slot

    ; result slot
    NewYmmSlot
    mov     pRoots, eax

    INVOKE  AVXArithFloat, pFirst, pSecond, pRoots

    ShowVector value1, pFirst
    ShowVector value2, pSecond
    ShowVector result, pRoots

    INVOKE  getchar
    INVOKE  ExitProcess, 0

start endp

end start


end




jj2007

It works, but option dllimport had no effect: I had to add
includelib \Masm32\lib\msvcrt.lib
includelib \Masm32\lib\Kernel32.lib


Value 1 is: double1 771.310 double2 -101.544 double3 9221.340 double4 -341.777
Value 2 is :double1 333.310 double2 701.544f double3 -348.840 double4 421.660
Result is double1 33.235827 double2 24.494897 double3 94.193949 double4 8.937707


Tested with Hasm64 and the older HJWasm32. Re speed, see StackBuffer:
Quote: up to about half a megabyte, it is significantly faster than HeapAlloc

No miracles, though.

P.S.: Do I use it correctly? This fails: int 3
invoke _alloca, 80000, 32
mov dword ptr [eax], 12345678h
mov dword ptr [eax+80000], 12345678h

nidud

#2
deleted

aw27

Quote from: jj2007 on May 19, 2017, 05:21:37 PM
It works, but option dllimport had no effect: I had to add
includelib \Masm32\lib\msvcrt.lib
includelib \Masm32\lib\Kernel32.lib
I built directly without linker: hjwasm64 -pe test.asm

Quote
P.S.: Do I use it correctly? This fails:
   int 3
   invoke _alloca, 80000, 32
   mov dword ptr [eax], 12345678h
   mov dword ptr [eax+80000], 12345678h

It appears that the stack is way too small. It works with:
        invoke _alloca, 10000, 32
   mov dword ptr [eax], 12345678h
   mov dword ptr [eax+9996], 12345678h

I was expecting more stack, actually close to 1 MB. Something looks wrong here.

aw27

Quote from: nidud on May 19, 2017, 06:08:19 PM
Test case fast malloc vs Heap/Global/Virtual Alloc:
http://masm32.com/board/index.php?topic=4940.msg53093#msg53093

Test case alloca/malloc:
http://masm32.com/board/index.php?topic=5622.msg60517#msg60517

The link option:
http://masm32.com/board/index.php?topic=5849.msg62302#msg62302

--
_aligned_malloc.asm, _chkstk.asm, alloca.asm.

Very nice!

hutch--

Ah, the joys of a MACRO assembler.  :biggrin:

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    include \masm32\include64\masm64rt.inc

    .code

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

; Program entry: prints a banner, calls aligned_stack, waits for a key and
; ends the process.
; NOTE(review): conout, waitkey and the lf symbol are not defined in this
; listing - presumably they come from masm64rt.inc; confirm.
entry_point proc

    conout "Howdy, your new console template here.",lf,lf

    call aligned_stack

    waitkey
    invoke ExitProcess,0

    ret                         ; follows the ExitProcess call; presumably never reached

entry_point endp

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

; NOTE(review): CUSTOMSTACK is not defined in this listing (presumably a
; stack-frame macro from masm64rt.inc). According to the author it is what
; gives the following procedure a frame whose first LOCAL is 4096-byte
; aligned; the meaning of the three arguments should be checked against the
; macro's own source.
CUSTOMSTACK 4096, 128, 4096

; Demonstration procedure: declares one 4096-byte local buffer and prints a
; message. The buffer itself is never read or written here.
aligned_stack proc

    LOCAL BigVar[4096]:BYTE     ; first LOCAL is aligned at 4096 bytes

    conout "Aligned stack here",lf,lf

    ret

aligned_stack endp

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    end

jj2007

Default stack should indeed be around 1MB. I've put together a little testbed for timings, and the results are interesting:

Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX)

Allocating 200000 bytes:
StackBuffer z:  721 ms  a16: 0
StackBuffer nz: 1 ms    a16: 0
GlobalAlloc z:  800 ms  a16: 0
HeapAlloc nz:   23 ms   a16: 0
SysAlloc nz:    30 ms   a16: 1

Allocating 400000 bytes:
StackBuffer z:  1429 ms a16: 0
StackBuffer nz: 1 ms    a16: 0
GlobalAlloc z:  1455 ms a16: 0
HeapAlloc nz:   23 ms   a16: 0
SysAlloc nz:    30 ms   a16: 1

Allocating 800000 bytes:
StackBuffer z:  2848 ms a16: 0
StackBuffer nz: 1 ms    a16: 0
GlobalAlloc z:  509 ms  a16: 0
HeapAlloc nz:   502 ms  a16: 0
SysAlloc nz:    644 ms  a16: 1


z means buffer got initialised with zeros, nz means not initialised.
The a16 tells me if the buffer is aligned to 16 bytes for use with movaps & friends. In the case of StackBuffer, the zero result is by design, in all other cases by accident: HeapAlloc guarantees align 8 only.

Note the performance of GlobalAlloc above the 400k mark: As Hutch keeps hammering, GlobalAlloc is damn efficient. What is less obvious is the bad performance of HeapAlloc at this point: Apparently it switches to GlobalAlloc. And initialises :P

I would like to include José's function, too, but so far I don't fully understand what it does. All functions in the testbed have this format:

; p2 - testbed case for one allocator.
;   dummy1, dummy2  unused placeholders
;   allocSize       number of bytes to allocate
; Allocates the buffer, records whether its address is 16-byte aligned,
; writes to it at three offsets (so allocSize must exceed 40004), frees it
; and returns the alignment flag in ECX (0 = aligned, 1 = misaligned).
; Fixed: the alignment test used mask 7, which only proves 8-byte alignment;
; an address ending in 8 then went down the movaps path, which requires
; 16-byte alignment. The immediate stores now also state their operand size.
p2 proc dummy1, dummy2, allocSize
Local pFatBuffer, a16:BYTE
  mov pFatBuffer, alloc(allocSize) ; GlobalAlloc GPTR = GMEM_FIXED and GMEM_ZEROINIT
  test al, 15                      ; low four address bits clear <=> 16-byte aligned
  setne a16                        ; 1 = not aligned; setne leaves the flags intact
  .if Zero?
    movaps [eax], xmm0             ; aligned store is safe here
  .else
    movups [eax], xmm0             ; fall back to the unaligned store
  .endif
  mov dword ptr [eax], 11111111h
  mov dword ptr [eax+20000], 22222222h
  mov dword ptr [eax+40000], 33333333h
  free pFatBuffer
  movsx ecx, a16
  ret
p2 endp

hutch--

I think from memory that HeapAlloc() is OK as long as you stay away from the hand holding options like the low fragmentation option that is REALLLY SLOWWWWWWWWWW.  :P

aw27

Quote from: hutch-- on May 19, 2017, 07:27:46 PM
Ah, the joys of a MACRO assembler.  :biggrin:

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    include \masm32\include64\masm64rt.inc

    .code

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

entry_point proc

    conout "Howdy, your new console template here.",lf,lf

    call aligned_stack

    waitkey
    invoke ExitProcess,0

    ret

entry_point endp

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

CUSTOMSTACK 4096, 128, 4096

aligned_stack proc

    LOCAL BigVar[4096]:BYTE     ; first LOCAL is aligned at 4096 bytes

    conout "Aligned stack here",lf,lf

    ret

aligned_stack endp

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    end


64-bit always align the stack to 16 bytes anyway.  :P
Where is the part where you align to 32 bytes?

jj2007

I attach a version that shows also the alignment of the buffer. To trigger the misalignment that one can expect from Global and HeapAlloc, there is a void alloc(1117) before the start of the loop.

Allocating 200000 bytes:
0015F200        StackBuffer z:  717 ms  a16: 0
0015F200        StackBuffer nz: 1 ms    a16: 0
00304408        GlobalAlloc z:  777 ms  a16: 1
00304408        HeapAlloc nz:   25 ms   a16: 1
0030440C        SysAlloc nz:    30 ms   a16: 1
00304410        HeapAlloc16 z:  759 ms  a16: 0
00304410        HeapAlloc16 nz: 759 ms  a16: 0

Allocating 400000 bytes:
0012E4C0        StackBuffer z:  1387 ms a16: 0
0012E4C0        StackBuffer nz: 1 ms    a16: 0
00304408        GlobalAlloc z:  1482 ms a16: 1
00304408        HeapAlloc nz:   25 ms   a16: 1
0030440C        SysAlloc nz:    30 ms   a16: 1
00304410        HeapAlloc16 z:  1480 ms a16: 0
00304410        HeapAlloc16 nz: 1486 ms a16: 0

Allocating 800000 bytes:
000CCA40        StackBuffer z:  2823 ms a16: 0
000CCA40        StackBuffer nz: 1 ms    a16: 0
00A10020        GlobalAlloc z:  422 ms  a16: 0
00A10020        HeapAlloc nz:   419 ms  a16: 0
00A10024        SysAlloc nz:    554 ms  a16: 1
00A10020        HeapAlloc16 z:  421 ms  a16: 0
00A10020        HeapAlloc16 nz: 421 ms  a16: 0


The zeroinit version of StackBuffer() is OK up to around 400k, above that it gets much slower than GlobalAlloc & friends.

Note the shift to "better aligned" buffers for *Alloc above 400k. As regards alloc$ aka SysAllocStringByteLen, it is always align 4, simply because it needs [eax-4] for the length of the string. I also added the z and nz versions for aligned HeapAlloc, Alloc16.

@José: I'd like to add your version - grateful if you could provide one that fits the format
pxxx proc dummy1, dummy2, allocSize
Local pFatBuffer, a16:BYTE
  mov pFatBuffer, YOURFUNCTION(allocSize)


aw27

Quote from: jj2007 on May 19, 2017, 07:45:50 PM
Default stack should indeed be around 1MB.

I know, but HASM does not provide 1 MB using -pe switch, I wonder how much does it provide? Johnsa, please help?
With Microsoft Linker i can get 1 MB or whatever I want but I can not make it understand what is align 32 for data. They live in the last century as far as ASM is concerned.
Anyway, I can use the Microsoft Linker without align 32, but can not use the vmovapd (Move Aligned Packed Double-Precision Floating-Point Values) to take data that is not 32-byte aligned in memory, only vmovupd.
This means replacing vmovapd ymm6, YMMWORD ptr Reals8ToYmm(771.3101, -101.544, 9221.34, -341.7773) with vmovupd ymm6, YMMWORD ptr Reals8ToYmm(771.3101, -101.544, 9221.34, -341.7773) and vmovapd ymm7, YMMWORD ptr Reals8ToYmm(333.3101, 701.544, -348.84, 421.6599) with vmovupd ymm7, YMMWORD ptr Reals8ToYmm(333.3101, 701.544, -348.84, 421.6599)
Sucks!

Quote
I would like to include José's function, too, but so far I don't fully understand what it does. All functions in the testbed have this format:

p2 proc dummy1, dummy2, allocSize
Local pFatBuffer, a16:BYTE
  mov pFatBuffer, alloc(allocSize)   ; GlobalAlloc GPTR = GMEM_FIXED and GMEM_ZEROINIT
  test al, 7
  setne a16
  .if Zero?
   movaps [eax], xmm0
  .else
   movups [eax], xmm0
  .endif
  mov [eax], 11111111h
  mov [eax+20000], 22222222h
  mov [eax+40000], 33333333h
  free pFatBuffer
  movsx ecx, a16
  ret
p2 endp


_alloca does not explicitly release memory; the block is released when the calling function returns.

aw27

Actually there is a way to make Microsoft Linker accept ALIGN 32, ALIGN 64, ALIGN 256, ALIGN 512, ALIGN 4096 for data

I just found out this Redmond's well kept secret  :lol:

Instead of .data, use DOS like segments. In this case
_DATA1 SEGMENT PAGE FLAT 'DATA' will be just fine.

In my example will be:

; Data segment declared with explicit (PAGE) segment alignment instead of the
; simplified .data directive, holding the printf format strings.
; Fixed: value2 used "%.3lff", which printed a literal 'f' after the second
; number, and its "is :" spacing did not match value1.
_DATA1 SEGMENT PAGE FLAT 'DATA'
align 16
value1 db "Value 1 is: double1 %.3lf double2 %.3lf double3 %.3lf double4 %.3lf",13,10,0
value2 db "Value 2 is: double1 %.3lf double2 %.3lf double3 %.3lf double4 %.3lf",13,10,0
result db "Result is double1 %lf double2 %lf double3 %lf double4 %lf",13,10,0
_DATA1 ends

; Reals8ToYmm(par1, par2, par3, par4)
; Macro function: reopens the _DATA1 segment, stores the four REAL8
; constants there as one 32-byte aligned item, closes the segment again and
; expands to the name of that item.
; Side effect: assembly always continues in the code segment afterwards.
Reals8ToYmm MACRO par1, par2, par3, par4
    LOCAL packedQuad
_DATA1 SEGMENT
    ALIGN 32
packedQuad REAL8 par1, par2, par3, par4
_DATA1 ENDS
    .CODE
    EXITM <packedQuad>
ENDM



hutch--

> Where is the part where you align to 32 bytes?

No, 4096 bytes. Its done with the stack frame macro.

aw27

Quote from: hutch-- on May 20, 2017, 12:15:24 AM
> Where is the part where you align to 32 bytes?

No, 4096 bytes. Its done with the stack frame macro.

I see, I was just unable to locate the CUSTOMSTACK macro. Probably my stuff is outdated.

jj2007

Quote from: aw27 on May 19, 2017, 11:38:45 PMI just found out this Redmond's well kept secret  :lol:

Instead of .data, use DOS like segments.

; Test: put three strings into three separately named data segments and
; print each string followed by the three start addresses, to see where
; the segments end up in memory.
; NOTE(review): print, hex$, inkey and exit are not defined in this listing -
; presumably MASM32 runtime macros from masm32rt.inc.
include \masm32\include\masm32rt.inc

_DATA1 SEGMENT PAGE FLAT 'DATA'
align 16
test1 db "Test1", 0
_DATA1 ends

_DATA2 SEGMENT PAGE FLAT 'DATA'
align 16
test2 db "Test2", 0
_DATA2 ends

; Fixed: this string read "Test2" (copy-paste slip), which made the third
; output line indistinguishable from the second.
_DATA3 SEGMENT PAGE FLAT 'DATA'
align 16
test3 db "Test3", 0
_DATA3 ends

.code
start:
  ; The addresses stay in ESI/EDI/EBX so they can be printed again below.
  mov esi, offset test1
  print esi, 13, 10
  mov edi, offset test2
  print edi, 13, 10
  mov ebx, offset test3
  print ebx, 13, 10
  ; Now show where each string actually lives.
  print hex$(esi), 13, 10
  print hex$(edi), 13, 10
  print hex$(ebx), 13, 10
  inkey
  exit

end start


It works :t
Test1
Test2
Test2
00403000
00404000
00405000


But it means 4096 bytes allocated for every aligned variable...

P.S.: I still have no clue how to use _alloca properly. In a loop, I get this:Value 1 is: double1 771.310 double2 -101.544 double3 9221.340 double4 -341.777
Value 2 is :double1 333.310 double2 701.544f double3 -348.840 double4 421.660
Result is double1 33.235827 double2 24.494897 double3 94.193949 double4 8.937707
stack is 18ff00

Value 1 is: double1 771.310 double2 -101.544 double3 9221.340 double4 -341.777
Value 2 is :double1 333.310 double2 701.544f double3 -348.840 double4 421.660
Result is double1 33.235827 double2 24.494897 double3 94.193949 double4 8.937707
stack is 18fea0

Value 1 is: double1 771.310 double2 -101.544 double3 9221.340 double4 -341.777
Value 2 is :double1 333.310 double2 701.544f double3 -348.840 double4 421.660
Result is double1 33.235827 double2 24.494897 double3 94.193949 double4 8.937707
stack is 18fe40

Value 1 is: double1 771.310 double2 -101.544 double3 9221.340 double4 -341.777
Value 2 is :double1 333.310 double2 701.544f double3 -348.840 double4 421.660
Result is double1 33.235827 double2 24.494897 double3 94.193949 double4 8.937707
stack is 18fde0


So the stack changes each time ::)

This works: INVOKE _alloca, 32, 32 ; allocate 32 bytes on the stack with 32 byte alignment
vmovapd ymmword ptr [eax], ymm6 ; copy to 32-byte aligned memory


This crashes: INVOKE _alloca, 132, 32 ; allocate 132 bytes on the stack with 32 byte alignment
vmovapd ymmword ptr [eax+100], ymm6 ; copy to 32-byte aligned memory