alloca/_alloca is a C/C++ function that allocates space in the stack of the caller.
alloca/_alloca is extremely fast when compared with the heap allocation functions.
Of course, alloca/_alloca is not suitable for large allocations.
The space allocated by alloca/_alloca does not need to be freed; it is released automatically when the calling function exits.
What I present here is a demo that includes an _alloca 32-bit ASM function with alignment.
As you can see, this _alloca depends on the calling convention because it is not inlined.
The advantage of not being inlined is that it can be used from high-level languages that do not have an alloca function (Delphi, for example).
The demo is short but fairly advanced (in requirements :lol:) and the only assembler up to the task I could find is HASM.
The reason is that no other has reliable support for alignment above 16-byte, good support for AVX instructions and good support for calling conventions.
Here is the demo:
.686
.XMM
;_BORL=1
;_CDEC=1
;_STD=1
;_FAST=1
_PASC=1
IFDEF _BORL
.MODEL FLAT, BORLAND
ELSEIFDEF _CDEC
.MODEL FLAT, C
ELSEIFDEF _STD
.MODEL FLAT, STDCALL
ELSEIFDEF _FAST
.MODEL FLAT, FASTCALL
ELSEIFDEF _PASC
.MODEL FLAT, PASCAL
ENDIF
OPTION CASEMAP:NONE
option dllimport:<msvcrt.dll>
printf PROTO C arg1:Ptr Byte, printlist: VARARG
getchar PROTO C
option dllimport:<kernel32.dll>
ExitProcess PROTO STDCALL :dword
; Reals8ToYmm - macro function that places four REAL8 initialisers in .data on
; a 32-byte boundary and expands to the name of that storage, so a call can be
; written directly as a memory operand. Assembly continues in .code afterwards.
Reals8ToYmm MACRO par1, par2, par3, par4
    LOCAL packedQuad                ;; unique label for every expansion
    .data
    ALIGN 32                        ;; 32-byte boundary for the 4 x REAL8 group
packedQuad REAL8 par1, par2, par3, par4
    .code
    EXITM <packedQuad>
ENDM
.data
; printf format strings; each one consumes four doubles and ends with CR/LF.
value1 db "Value 1 is: double1 %.3lf double2 %.3lf double3 %.3lf double4 %.3lf",13,10,0
; Fixed: the second conversion used to read "%.3lff", which printed a stray
; literal 'f' right after the second number.
value2 db "Value 2 is :double1 %.3lf double2 %.3lf double3 %.3lf double4 %.3lf",13,10,0
result db "Result is double1 %lf double2 %lf double3 %lf double4 %lf",13,10,0
.code
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
IFDEF _BORL
; _alloca, variant assembled when _BORL is defined (.MODEL FLAT, BORLAND).
; Lowers ESP by thesize, rounds it down with the mask -alignm and returns the
; resulting ESP in EAX. No prologue/epilogue is generated for this proc
; (OPTION PROLOGUE:NONE / EPILOGUE:NONE), so [esp] is the return address on entry.
; NOTE(review): nothing is popped besides the return address, so thesize and
; alignm are presumably mapped to registers by the assembler under this
; convention - confirm against the assembler documentation.
; NOTE(review): the NEG/AND mask only works when alignm is a power of two.
; NOTE(review): ESP is moved in one step without touching the pages in
; between; large requests may need the stack to be committed beforehand.
_alloca proc public thesize:dword, alignm:dword
pop ecx ; ECX = return address, taken off so ESP can be moved freely
sub esp, thesize ; reserve the requested number of bytes
neg alignm ; -alignm: mask that clears the low bits for a power-of-two alignm
and esp, alignm ; round ESP down to the requested alignment
mov eax, esp ; EAX = address of the allocated block
push ecx ; put the return address back on top of the stack
ret
_alloca endp
ELSEIFDEF _FAST
; _alloca, variant assembled when _FAST is defined (.MODEL FLAT, FASTCALL).
; Lowers ESP by thesize, rounds it down with the mask -alignm and returns the
; resulting ESP in EAX. No prologue/epilogue is generated for this proc
; (OPTION PROLOGUE:NONE / EPILOGUE:NONE), so [esp] is the return address on entry.
; NOTE(review): nothing is popped besides the return address, so thesize and
; alignm are presumably register-mapped by the assembler (ECX/EDX for 32-bit
; fastcall) - confirm. This would explain why the return address is parked in
; EAX first and only moved to ECX once thesize has been consumed.
; NOTE(review): the NEG/AND mask only works when alignm is a power of two.
_alloca proc public thesize:dword, alignm:dword
pop eax ; EAX = return address (temporary home)
sub esp, thesize ; reserve the requested number of bytes
neg alignm ; -alignm: mask that clears the low bits for a power-of-two alignm
and esp, alignm ; round ESP down to the requested alignment
mov ecx, eax ; move the return address out of EAX, which must carry the result
mov eax, esp ; EAX = address of the allocated block
push ecx ; put the return address back on top of the stack
ret
_alloca endp
ELSEIFDEF _CDEC
;-----------------------------------------------------------------------
; void *_alloca(DWORD thesize, DWORD alignm)          -- cdecl variant
; Allocates thesize bytes in the CALLER's stack frame, aligned to alignm.
; In:    [esp]   = return address (no prologue is generated)
;        [esp+4] = thesize, [esp+8] = alignm (power of two, >= 4)
; Out:   EAX = address of the block; after the caller's cdecl clean-up
;        (add esp, 8) ESP equals EAX.
; Clobb: ECX, EDX, flags
; The block is released when the caller restores ESP from its frame pointer,
; so the caller must have an EBP frame.
;
; Change: ESP is now lowered one page at a time and every page is touched.
; Windows commits stack pages through a single guard page, so the former
; one-step "sub esp, size" faulted for requests larger than the committed
; part of the stack.
;-----------------------------------------------------------------------
ALLOCA_PAGE_SIZE equ 1000h
_alloca proc public thesize:dword, alignm:dword
    pop     ecx                         ; ECX = return address
    pop     eax                         ; EAX = thesize
    pop     edx                         ; EDX = alignm
    neg     edx                         ; EDX = -alignm = alignment mask
    neg     eax
    add     eax, esp                    ; EAX = ESP - thesize
    and     eax, edx                    ; EAX = new, aligned top of stack
probe_page:
    mov     edx, esp
    sub     edx, eax                    ; EDX = distance still to descend
    cmp     edx, ALLOCA_PAGE_SIZE
    jb      probe_done                  ; target is less than a page away
    sub     esp, ALLOCA_PAGE_SIZE
    test    dword ptr [esp], eax        ; touch the page so the guard page moves down
    jmp     probe_page
probe_done:
    mov     esp, eax                    ; commit the allocation
    push    edx                         ; two dummy slots standing in for the arguments
    push    edx                         ; the cdecl caller will discard (pushes also touch memory in order)
    push    ecx                         ; return address back on top of the stack
    ret
_alloca endp
ELSEIFDEF _STD
;-----------------------------------------------------------------------
; void *_alloca(DWORD thesize, DWORD alignm)          -- stdcall variant
; Allocates thesize bytes in the CALLER's stack frame, aligned to alignm.
; In:    [esp]   = return address (no prologue is generated)
;        [esp+4] = thesize, [esp+8] = alignm (power of two, >= 4)
; Out:   EAX = ESP = address of the block; both arguments are removed.
; Clobb: ECX, EDX, flags
; The block is released when the caller restores ESP from its frame pointer,
; so the caller must have an EBP frame.
;
; Change: ESP is now lowered one page at a time and every page is touched.
; Windows commits stack pages through a single guard page, so the former
; one-step "sub esp, size" faulted for requests larger than the committed
; part of the stack.
;-----------------------------------------------------------------------
ALLOCA_PAGE_SIZE equ 1000h
_alloca proc public thesize:dword, alignm:dword
    pop     ecx                         ; ECX = return address
    pop     eax                         ; EAX = thesize
    pop     edx                         ; EDX = alignm
    neg     edx                         ; EDX = -alignm = alignment mask
    neg     eax
    add     eax, esp                    ; EAX = ESP - thesize
    and     eax, edx                    ; EAX = new, aligned top of stack
probe_page:
    mov     edx, esp
    sub     edx, eax                    ; EDX = distance still to descend
    cmp     edx, ALLOCA_PAGE_SIZE
    jb      probe_done                  ; target is less than a page away
    sub     esp, ALLOCA_PAGE_SIZE
    test    dword ptr [esp], eax        ; touch the page so the guard page moves down
    jmp     probe_page
probe_done:
    mov     esp, eax                    ; commit the allocation
    push    ecx                         ; return address back on top of the stack
    ret
_alloca endp
ELSEIFDEF _PASC
;-----------------------------------------------------------------------
; void *_alloca(DWORD thesize, DWORD alignm)          -- pascal variant
; Allocates thesize bytes in the CALLER's stack frame, aligned to alignm.
; In:    [esp]   = return address (no prologue is generated)
;        [esp+4] = alignm (power of two, >= 4), [esp+8] = thesize
;        (pascal pushes left to right, so the last argument is on top)
; Out:   EAX = ESP = address of the block; both arguments are removed.
; Clobb: ECX, EDX, flags
; The block is released when the caller restores ESP from its frame pointer,
; so the caller must have an EBP frame.
;
; Change: ESP is now lowered one page at a time and every page is touched.
; Windows commits stack pages through a single guard page, so the former
; one-step "sub esp, size" faulted for requests larger than the committed
; part of the stack.
;-----------------------------------------------------------------------
ALLOCA_PAGE_SIZE equ 1000h
_alloca proc public thesize:dword, alignm:dword
    pop     ecx                         ; ECX = return address
    pop     eax                         ; EAX = alignm
    pop     edx                         ; EDX = thesize
    neg     eax                         ; EAX = -alignm = alignment mask
    neg     edx
    add     edx, esp                    ; EDX = ESP - thesize
    and     edx, eax                    ; EDX = new, aligned top of stack
probe_page:
    mov     eax, esp
    sub     eax, edx                    ; EAX = distance still to descend
    cmp     eax, ALLOCA_PAGE_SIZE
    jb      probe_done                  ; target is less than a page away
    sub     esp, ALLOCA_PAGE_SIZE
    test    dword ptr [esp], eax        ; touch the page so the guard page moves down
    jmp     probe_page
probe_done:
    mov     esp, edx                    ; commit the allocation
    mov     eax, edx                    ; EAX = address of the block
    push    ecx                         ; return address back on top of the stack
    ret
_alloca endp
ENDIF
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; Calculate sqrt of sum of 2 vectors of 4 doubles
; AVXArithFloat - element-wise sqrt(val1[i] + val2[i]) for 4 packed doubles.
; val1, val2 : pointers to the source vectors (loaded with aligned moves)
; res        : pointer to the destination vector (stored with an aligned move)
; Only EAX is used for addressing; YMM0 and YMM1 hold the data.
AVXArithFloat proc public val1:ptr, val2: ptr, res:ptr
    mov     eax, val1
    vmovapd ymm0, ymmword ptr [eax]         ; first operand
    mov     eax, val2
    vmovapd ymm1, ymmword ptr [eax]         ; second operand
    mov     eax, res
    vaddpd  ymm0, ymm0, ymm1                ; sum of the two vectors
    vsqrtpd ymm0, ymm0                      ; square root of each sum
    vmovapd ymmword ptr [eax], ymm0
    vzeroupper                              ; clear upper YMM halves before returning
    ret
AVXArithFloat endp
; Entry point of the demo: builds two 4-double vectors in stack memory obtained
; from _alloca, computes sqrt(a + b) into a third stack buffer and prints all
; three vectors with printf. Ends through ExitProcess.
start proc
    LOCAL pVecA   : PTR YMMWORD     ; first operand
    LOCAL pVecB   : PTR YMMWORD     ; second operand
    LOCAL pVecOut : PTR YMMWORD     ; result
    ;int 3

    ; --- first operand -------------------------------------------------
    vmovapd ymm6, YMMWORD ptr Reals8ToYmm(771.3101, -101.544, 9221.34, -341.7773)
    INVOKE  _alloca, 32, 32                 ; 32 bytes, 32-byte aligned
    mov     pVecA, eax
    vmovapd ymmword ptr [eax], ymm6         ; copy the constant into the stack buffer

    ; --- second operand ------------------------------------------------
    vmovapd ymm7, YMMWORD ptr Reals8ToYmm(333.3101, 701.544, -348.84, 421.6599)
    INVOKE  _alloca, 32, 32                 ; 32 bytes, 32-byte aligned
    mov     pVecB, eax
    vmovapd ymmword ptr [eax], ymm7         ; copy the constant into the stack buffer

    ; --- result buffer and computation ---------------------------------
    INVOKE  _alloca, 32, 32                 ; 32 bytes, 32-byte aligned
    mov     pVecOut, eax
    INVOKE  AVXArithFloat, pVecA, pVecB, pVecOut

    ; --- report --------------------------------------------------------
    mov     eax, pVecA
    INVOKE  printf, addr value1, real8 ptr [eax], real8 ptr [eax+8], real8 ptr [eax+16], real8 ptr [eax+24]
    mov     eax, pVecB
    INVOKE  printf, addr value2, real8 ptr [eax], real8 ptr [eax+8], real8 ptr [eax+16], real8 ptr [eax+24]
    mov     eax, pVecOut
    INVOKE  printf, addr result, real8 ptr [eax], real8 ptr [eax+8], real8 ptr [eax+16], real8 ptr [eax+24]

    INVOKE  getchar                         ; keep the console open until a key is pressed
    INVOKE  ExitProcess, 0
start endp
end start
end
It works, but option dllimport had no effect: I had to add
includelib \Masm32\lib\msvcrt.lib
includelib \Masm32\lib\Kernel32.lib
Value 1 is: double1 771.310 double2 -101.544 double3 9221.340 double4 -341.777
Value 2 is :double1 333.310 double2 701.544f double3 -348.840 double4 421.660
Result is double1 33.235827 double2 24.494897 double3 94.193949 double4 8.937707
Tested with Hasm64 and the older HJWasm32. Re speed, see StackBuffer (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1255):
Quote: up to about half a megabyte, it is significantly faster than HeapAlloc
No miracles, though.
P.S.: Do I use it correctly? This fails:
int 3 ; debugger breakpoint
invoke _alloca, 80000, 32 ; request 80000 bytes, 32-byte aligned
mov dword ptr [eax], 12345678h ; first dword of the block
mov dword ptr [eax+80000-4], 12345678h ; last dword of the block (offset 80000 itself lies past the end of the allocation)
deleted
Quote from: jj2007 on May 19, 2017, 05:21:37 PM
It works, but option dllimport had no effect: I had to add
includelib \Masm32\lib\msvcrt.lib
includelib \Masm32\lib\Kernel32.lib
I built directly without linker: hjwasm64 -pe test.asm
Quote
P.S.: Do I use it correctly? This fails:
int 3 ; debugger breakpoint
invoke _alloca, 80000, 32 ; request 80000 bytes, 32-byte aligned
mov dword ptr [eax], 12345678h ; first dword of the block
mov dword ptr [eax+80000-4], 12345678h ; last dword of the block (offset 80000 itself lies past the end of the allocation)
It appears that the stack is way too small. It works with:
invoke _alloca, 10000, 32 ; request 10000 bytes, 32-byte aligned
mov dword ptr [eax], 12345678h ; write the first dword of the block
mov dword ptr [eax+9996], 12345678h ; write the last dword of the block (10000 - 4)
I was expecting more stack, actually close to 1 MB. Something looks wrong here.
Quote from: nidud on May 19, 2017, 06:08:19 PM
Test case fast malloc vs Heap/Global/Virtual Alloc:
http://masm32.com/board/index.php?topic=4940.msg53093#msg53093
Test case alloca/malloc:
http://masm32.com/board/index.php?topic=5622.msg60517#msg60517
The link option:
http://masm32.com/board/index.php?topic=5849.msg62302#msg62302
--
_aligned_malloc.asm (https://github.com/nidud/asmc/blob/master/source/libc/alloc/_aligned_malloc.asm), _chkstk.asm (https://github.com/nidud/asmc/blob/master/source/libc/alloc/_chkstk.asm), alloca.asm (https://github.com/nidud/asmc/blob/master/source/libc/alloc/alloca.asm).
Very nice!
Ah, the joys of a MACRO assembler. :biggrin:
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
include \masm32\include64\masm64rt.inc
.code
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; Program entry: prints a banner, calls aligned_stack, waits for a key and exits.
; conout and waitkey are not defined in this listing (presumably macros from
; masm64rt.inc - confirm).
entry_point proc
conout "Howdy, your new console template here.",lf,lf
call aligned_stack
waitkey
invoke ExitProcess,0
ret ; NOTE(review): presumably never reached, assuming ExitProcess does not return
entry_point endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; NOTE(review): CUSTOMSTACK is not defined in this listing. Going by the comment
; on BigVar it presumably configures the stack frame of the proc that follows so
; that its first LOCAL is 4096-byte aligned; the meaning of the three arguments
; is not shown here - confirm against the macro source.
CUSTOMSTACK 4096, 128, 4096
; Demo proc: declares a 4096-byte local buffer and prints a message.
aligned_stack proc
LOCAL BigVar[4096]:BYTE ; first LOCAL is aligned at 4096 bytes
conout "Aligned stack here",lf,lf ; BigVar itself is never referenced in this demo
ret
aligned_stack endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end
Default stack should indeed be around 1MB. I've put together a little testbed for timings, it's interesting:Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX)
Allocating 200000 bytes:
StackBuffer z: 721 ms a16: 0
StackBuffer nz: 1 ms a16: 0
GlobalAlloc z: 800 ms a16: 0
HeapAlloc nz: 23 ms a16: 0
SysAlloc nz: 30 ms a16: 1
Allocating 400000 bytes:
StackBuffer z: 1429 ms a16: 0
StackBuffer nz: 1 ms a16: 0
GlobalAlloc z: 1455 ms a16: 0
HeapAlloc nz: 23 ms a16: 0
SysAlloc nz: 30 ms a16: 1
Allocating 800000 bytes:
StackBuffer z: 2848 ms a16: 0
StackBuffer nz: 1 ms a16: 0
GlobalAlloc z: 509 ms a16: 0
HeapAlloc nz: 502 ms a16: 0
SysAlloc nz: 644 ms a16: 1
z means buffer got initialised with zeros, nz means not initialised.
The a16 tells me if the buffer is aligned to 16 bytes for use with movaps & friends. In the case of StackBuffer, the zero result is by design, in all other cases by accident: HeapAlloc guarantees align 8 only.
Note the performance of GlobalAlloc above the 400k mark: As Hutch keeps hammering, GlobalAlloc is damn efficient. What is less obvious is the bad performance of HeapAlloc at this point: Apparently it switches to GlobalAlloc. And initialises :P
I would like to include José's function, too, but so far I don't fully understand what it does. All functions in the testbed have this format:
; p2 - testbed case: allocate allocSize bytes with alloc(), record whether the
; buffer is 16-byte aligned, touch it, free it.
; In:  allocSize = bytes to allocate; the fixed probe offsets below require at
;      least 40004 bytes. dummy1/dummy2 are unused.
; Out: ECX = a16 (0 when the buffer is 16-byte aligned, 1 otherwise)
; Fixes: the alignment test used mask 7 (8-byte alignment) although movaps
; needs 16-byte alignment, and the immediate stores had no operand size.
p2 proc dummy1, dummy2, allocSize
    Local pFatBuffer, a16:BYTE
    mov pFatBuffer, alloc(allocSize) ; GlobalAlloc GPTR = GMEM_FIXED and GMEM_ZEROINIT
    test al, 15                     ; low four address bits clear <=> 16-byte aligned
    setne a16                       ; setne leaves the flags intact for the .if below
    .if Zero?
        movaps [eax], xmm0          ; aligned store is safe
    .else
        movups [eax], xmm0          ; fall back to the unaligned store
    .endif
    mov dword ptr [eax], 11111111h
    mov dword ptr [eax+20000], 22222222h
    mov dword ptr [eax+40000], 33333333h
    free pFatBuffer
    movsx ecx, a16
    ret
p2 endp
I think from memory that HeapAlloc() is OK as long as you stay away from the hand holding options like the low fragmentation option that is REALLLY SLOWWWWWWWWWW. :P
Quote from: hutch-- on May 19, 2017, 07:27:46 PM
Ah, the joys of a MACRO assembler. :biggrin:
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
include \masm32\include64\masm64rt.inc
.code
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; Program entry: prints a banner, calls aligned_stack, waits for a key and exits.
; conout and waitkey are not defined in this listing (presumably macros from
; masm64rt.inc - confirm).
entry_point proc
conout "Howdy, your new console template here.",lf,lf
call aligned_stack
waitkey
invoke ExitProcess,0
ret ; NOTE(review): presumably never reached, assuming ExitProcess does not return
entry_point endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; NOTE(review): CUSTOMSTACK is not defined in this listing. Going by the comment
; on BigVar it presumably configures the stack frame of the proc that follows so
; that its first LOCAL is 4096-byte aligned; the meaning of the three arguments
; is not shown here - confirm against the macro source.
CUSTOMSTACK 4096, 128, 4096
; Demo proc: declares a 4096-byte local buffer and prints a message.
aligned_stack proc
LOCAL BigVar[4096]:BYTE ; first LOCAL is aligned at 4096 bytes
conout "Aligned stack here",lf,lf ; BigVar itself is never referenced in this demo
ret
aligned_stack endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end
64-bit always align the stack to 16 bytes anyway. :P
Where is the part where you align to 32 bytes?
I attach a version that shows also the alignment of the buffer. To trigger the misalignment that one can expect from Global and HeapAlloc, there is a void alloc(1117) before the start of the loop.
Allocating 200000 bytes:
0015F200 StackBuffer z: 717 ms a16: 0
0015F200 StackBuffer nz: 1 ms a16: 0
00304408 GlobalAlloc z: 777 ms a16: 1
00304408 HeapAlloc nz: 25 ms a16: 1
0030440C SysAlloc nz: 30 ms a16: 1
00304410 HeapAlloc16 z: 759 ms a16: 0
00304410 HeapAlloc16 nz: 759 ms a16: 0
Allocating 400000 bytes:
0012E4C0 StackBuffer z: 1387 ms a16: 0
0012E4C0 StackBuffer nz: 1 ms a16: 0
00304408 GlobalAlloc z: 1482 ms a16: 1
00304408 HeapAlloc nz: 25 ms a16: 1
0030440C SysAlloc nz: 30 ms a16: 1
00304410 HeapAlloc16 z: 1480 ms a16: 0
00304410 HeapAlloc16 nz: 1486 ms a16: 0
Allocating 800000 bytes:
000CCA40 StackBuffer z: 2823 ms a16: 0
000CCA40 StackBuffer nz: 1 ms a16: 0
00A10020 GlobalAlloc z: 422 ms a16: 0
00A10020 HeapAlloc nz: 419 ms a16: 0
00A10024 SysAlloc nz: 554 ms a16: 1
00A10020 HeapAlloc16 z: 421 ms a16: 0
00A10020 HeapAlloc16 nz: 421 ms a16: 0
The zeroinit version of StackBuffer() (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1255) is OK up to around 400k, above that it gets much slower than GlobalAlloc & friends.
Note the shift to "better aligned" buffers for *Alloc above 400k. As regards alloc$ aka SysAllocStringByteLen, it is always align 4, simply because it needs [eax-4] for the length of the string. I also added the z and nz versions for aligned HeapAlloc, Alloc16 (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1351).
@José: I'd like to add your version - grateful if you could provide one that fits the format
pxxx proc dummy1, dummy2, allocSize
Local pFatBuffer, a16:BYTE
mov pFatBuffer, YOURFUNCTION(allocSize)
Quote from: jj2007 on May 19, 2017, 07:45:50 PM
Default stack should indeed be around 1MB.
I know, but HASM does not provide 1 MB using -pe switch, I wonder how much does it provide? Johnsa, please help?
With Microsoft Linker i can get 1 MB or whatever I want but I can not make it understand what is
align 32 for data. They live in the last century as far as ASM is concerned.
Anyway, I can use the Microsoft Linker without align 32, but can not use the vmovapd (Move Aligned Packed Double-Precision Floating-Point Values) to take data that is not 32-byte aligned in memory, only vmovupd.
This means replacing vmovapd ymm6, YMMWORD ptr Reals8ToYmm(771.3101, -101.544, 9221.34, -341.7773) with vmovupd ymm6, YMMWORD ptr Reals8ToYmm(771.3101, -101.544, 9221.34, -341.7773) and vmovapd ymm7, YMMWORD ptr Reals8ToYmm(333.3101, 701.544, -348.84, 421.6599) with vmovupd ymm7, YMMWORD ptr Reals8ToYmm(333.3101, 701.544, -348.84, 421.6599)
Sucks!
Quote
I would like to include José's function, too, but so far I don't fully understand what it does. All functions in the testbed have this format:
; p2 - testbed case: allocate allocSize bytes with alloc(), record whether the
; buffer is 16-byte aligned, touch it, free it.
; In:  allocSize = bytes to allocate; the fixed probe offsets below require at
;      least 40004 bytes. dummy1/dummy2 are unused.
; Out: ECX = a16 (0 when the buffer is 16-byte aligned, 1 otherwise)
; Fixes: the alignment test used mask 7 (8-byte alignment) although movaps
; needs 16-byte alignment, and the immediate stores had no operand size.
p2 proc dummy1, dummy2, allocSize
    Local pFatBuffer, a16:BYTE
    mov pFatBuffer, alloc(allocSize) ; GlobalAlloc GPTR = GMEM_FIXED and GMEM_ZEROINIT
    test al, 15                     ; low four address bits clear <=> 16-byte aligned
    setne a16                       ; setne leaves the flags intact for the .if below
    .if Zero?
        movaps [eax], xmm0          ; aligned store is safe
    .else
        movups [eax], xmm0          ; fall back to the unaligned store
    .endif
    mov dword ptr [eax], 11111111h
    mov dword ptr [eax+20000], 22222222h
    mov dword ptr [eax+40000], 33333333h
    free pFatBuffer
    movsx ecx, a16
    ret
p2 endp
_alloca does not explicitly release memory. The memory is released when the function that called _alloca returns.
Actually there is a way to make Microsoft Linker accept ALIGN 32, ALIGN 64, ALIGN 256, ALIGN 512, ALIGN 4096 for data
I just found out this Redmond's well kept secret :lol:
Instead of .data, use DOS like segments. In this case
_DATA1 SEGMENT PAGE FLAT 'DATA' will be just fine.
In my example will be:
; Page-aligned data segment holding the printf format strings; each string
; consumes four doubles and ends with CR/LF.
_DATA1 SEGMENT PAGE FLAT 'DATA'
align 16
value1 db "Value 1 is: double1 %.3lf double2 %.3lf double3 %.3lf double4 %.3lf",13,10,0
; Fixed: the second conversion used to read "%.3lff", which printed a stray
; literal 'f' right after the second number.
value2 db "Value 2 is :double1 %.3lf double2 %.3lf double3 %.3lf double4 %.3lf",13,10,0
result db "Result is double1 %lf double2 %lf double3 %lf double4 %lf",13,10,0
_DATA1 ends
; Reals8ToYmm - macro function that places four REAL8 initialisers in the
; _DATA1 segment on a 32-byte boundary and expands to the name of that storage.
; Assembly continues in .code afterwards.
Reals8ToYmm MACRO par1, par2, par3, par4
    LOCAL packedQuad                ;; unique label for every expansion
_DATA1 SEGMENT
    ALIGN 32                        ;; 32-byte boundary for the 4 x REAL8 group
packedQuad REAL8 par1, par2, par3, par4
_DATA1 ENDS
    .code
    EXITM <packedQuad>
ENDM
> Where is the part where you align to 32 bytes?
No, 4096 bytes. Its done with the stack frame macro.
Quote from: hutch-- on May 20, 2017, 12:15:24 AM
> Where is the part where you align to 32 bytes?
No, 4096 bytes. Its done with the stack frame macro.
I see, I was just unable to locate the CUSTOMSTACK macro. Probably my stuff is outdated.
Quote from: aw27 on May 19, 2017, 11:38:45 PMI just found out this Redmond's well kept secret :lol:
Instead of .data, use DOS like segments.
include \masm32\include\masm32rt.inc
; Three separate PAGE-aligned data segments, one short string in each, used to
; show where the linker places every segment.
_DATA1 SEGMENT PAGE FLAT 'DATA'
align 16
test1 db "Test1", 0
_DATA1 ends
_DATA2 SEGMENT PAGE FLAT 'DATA'
align 16
test2 db "Test2", 0
_DATA2 ends
_DATA3 SEGMENT PAGE FLAT 'DATA'
align 16
test3 db "Test2", 0 ; NOTE(review): text says "Test2" although the label is test3 - probably a copy/paste slip
_DATA3 ends
.code
; Prints the three strings and then their addresses in hex.
; print, hex$, inkey and exit are not defined in this listing (presumably
; macros from masm32rt.inc - confirm).
start:
mov esi, offset test1 ; keep the three addresses in ESI/EDI/EBX for the second pass
print esi, 13, 10
mov edi, offset test2
print edi, 13, 10
mov ebx, offset test3
print ebx, 13, 10
print hex$(esi), 13, 10 ; address of test1
print hex$(edi), 13, 10 ; address of test2
print hex$(ebx), 13, 10 ; address of test3
inkey
exit
end start
It works :t
Test1
Test2
Test2
00403000
00404000
00405000
But it means 4096 bytes allocated for every aligned variable...
P.S.: I still have no clue how to use _alloca properly. In a loop, I get this:
Value 1 is: double1 771.310 double2 -101.544 double3 9221.340 double4 -341.777
Value 2 is :double1 333.310 double2 701.544f double3 -348.840 double4 421.660
Result is double1 33.235827 double2 24.494897 double3 94.193949 double4 8.937707
stack is 18ff00
Value 1 is: double1 771.310 double2 -101.544 double3 9221.340 double4 -341.777
Value 2 is :double1 333.310 double2 701.544f double3 -348.840 double4 421.660
Result is double1 33.235827 double2 24.494897 double3 94.193949 double4 8.937707
stack is 18fea0
Value 1 is: double1 771.310 double2 -101.544 double3 9221.340 double4 -341.777
Value 2 is :double1 333.310 double2 701.544f double3 -348.840 double4 421.660
Result is double1 33.235827 double2 24.494897 double3 94.193949 double4 8.937707
stack is 18fe40
Value 1 is: double1 771.310 double2 -101.544 double3 9221.340 double4 -341.777
Value 2 is :double1 333.310 double2 701.544f double3 -348.840 double4 421.660
Result is double1 33.235827 double2 24.494897 double3 94.193949 double4 8.937707
stack is 18fde0
So the stack changes each time ::)
This works:
INVOKE _alloca, 32, 32 ; allocate 32 bytes on the stack with 32 byte alignment
vmovapd ymmword ptr [eax], ymm6 ; aligned store at offset 0 of the 32-byte aligned block
This crashes:
INVOKE _alloca, 132, 32 ; allocate 132 bytes on the stack with 32 byte alignment
vmovapd ymmword ptr [eax+100], ymm6 ; EAX is 32-byte aligned, so EAX+100 is not (100 mod 32 = 4): the aligned store faults; offset 96 or vmovupd would avoid that
Quote from: jj2007 on May 20, 2017, 01:42:14 AM
So the stack changes each time ::)
It has to be like this: :t
; hjwasm64 -c -coff awalloca.asm
; link awalloca.obj /STACK:52428800,52428800 /SUBSYSTEM:CONSOLE
.386
.MODEL FLAT, C
OPTION CASEMAP:NONE
includelib h:\Masm32\lib\msvcrt.lib
includelib h:\Masm32\lib\Kernel32.lib
printf PROTO C arg1:Ptr Byte, printlist: VARARG
ExitProcess PROTO STDCALL :dword
.data
result db "Finished",13,10,0
.code
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
; _alloca, cdecl: reserves thesize bytes in the caller's stack frame, rounded
; down to a multiple of alignm (power of two), and returns the address in EAX.
; No prologue is generated, so [esp] is the return address on entry.
; Two dummy dwords are left above the return address because the cdecl caller
; removes its two arguments after the call.
; This variant does not probe stack pages; the demo links with a fully
; committed stack (see the /STACK option in the build comment).
_alloca proc C public thesize:dword, alignm:dword
    pop     ecx                     ; return address
    pop     eax                     ; thesize
    pop     edx                     ; alignm
    neg     eax
    neg     edx                     ; EDX = alignment mask
    add     eax, esp                ; EAX = ESP - thesize
    and     eax, edx                ; EAX = aligned block address
    mov     esp, eax
    push    edx                     ; placeholder for the first argument slot
    push    edx                     ; placeholder for the second argument slot
    push    ecx                     ; return address back on top
    ret
_alloca endp
; myLooping - intermediate function for the loop test: allocates 50,000,000
; bytes with _alloca, writes one dword at the start and one at offset
; 40,000,000, then gives everything back by restoring ESP from EBP.
; The frame is set up by hand because automatic prologue/epilogue generation
; is switched off here; without it ESP could not be restored on exit.
myLooping proc
    push    ebp
    mov     ebp, esp                        ; remember ESP so the allocation can be undone
    invoke  _alloca, 50000000, 16           ; the demo links with a 50 MB stack
    mov     dword ptr [eax], 12345678h
    mov     dword ptr [eax+40000000], 12345678h
    leave                                   ; mov esp, ebp / pop ebp: releases the block
    ret
myLooping endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; Entry point of the loop test: calls myLooping 3,000,000 times, prints the
; completion message and leaves through ExitProcess.
start proc
    LOCAL remaining
    ;int 3
    mov     remaining, 3000000
next_round:
    INVOKE  myLooping
    dec     remaining
    jnz     next_round                      ; repeat until the counter reaches zero
    INVOKE  printf, addr result
    INVOKE  ExitProcess, 0
start endp
end start
Quote from: jj2007 on May 20, 2017, 01:42:14 AM
But it means 4096 bytes allocated for every aligned variable...
No, you can put many variables in each segment, no need to have one segment per variable.
Quote from: aw27 on May 20, 2017, 02:50:39 AM
Quote from: jj2007 on May 20, 2017, 01:42:14 AM
But it means 4096 bytes allocated for every aligned variable...
No, you can put many variables in each segment, no need to have one segment per variable.
Yes indeed, but to have all these variables aligned 32, you need again manual fumbling.
JJ, you can use align 32 for each variables in the same segment.
Quote from: coder on May 20, 2017, 07:37:15 AM
JJ, you can use align 32 for each variables in the same segment.
I was prepared to reply "nonsense", based on the horrible experience in the align64/transpose a matrix (http://masm32.com/board/index.php?topic=6112.msg64948#msg64948) thread. BUT here the align 64 works :dazzled:
More precisely, it works only in this very special segment. Try replacing it with a .data?, and it will choke. Attached a testbed, it works even with ML 6.14 and its linker...! José made a great discovery :icon14:
I have a different view on this though. I think a segment is nothing special. It's just more primitive / low-level than sections and high-level macros like .code and .data. I think MASM's .code and .data are actually some wrapper macros to sections instead of segments to facilitate external linking, hence the difficulties in setting up the alignments. Older MASM reference books have made good use of them, but of course in MZ format (executable). But yet again, PE format is MZ format in disguise. I just don't understand the sudden drop of its usage / popularity when it comes to Win / modern asm programming. A segment is the final memory partition seen by the CPU.
Quote from: coder on May 20, 2017, 09:04:57 AMI think MASM's .code and .data are actually some wrapper macros to sections instead of segments to facilitate external linking, hence the difficulties in setting up the alignments.
That could be an explanation. Still, it is rather strange that ML chokes for align 32 in .DATA? but has no problems with align 64 in an old-fashioned DOS segment.
@José: I got it running now:
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz
Allocating 5000*40800 bytes:
00186010 _alloca: 1 ms a16: 0
00186000 StackBuffer z: 11 ms a16: 0
00186000 StackBuffer nz: 0 ms a16: 0
0068E6B8 GlobalAlloc z: 81 ms a16: 1
0068E6B8 HeapAlloc nz: 69 ms a16: 1
0068E7A4 SysAlloc nz: 71 ms a16: 1
0068E7A0 HeapAlloc16 z: 68 ms a16: 0
0068E7A0 HeapAlloc16 nz: 68 ms a16: 0
Allocating 5000*163200 bytes:
001681C0 StackBuffer z: 37 ms a16: 0
001681C0 StackBuffer nz: 1 ms a16: 0
0068E7A0 GlobalAlloc z: 356 ms a16: 0
0068E7A0 HeapAlloc nz: 354 ms a16: 0
0068E7A4 SysAlloc nz: 354 ms a16: 1
0068E7A0 HeapAlloc16 z: 357 ms a16: 0
0068E7A0 HeapAlloc16 nz: 356 ms a16: 0
Allocating 5000*652800 bytes:
000F0940 StackBuffer z: 149 ms a16: 0
000F0940 StackBuffer nz: 1 ms a16: 0
00260020 GlobalAlloc z: 28 ms a16: 0
00260020 HeapAlloc nz: 27 ms a16: 0
00260024 SysAlloc nz: 30 ms a16: 1
00260020 HeapAlloc16 z: 22 ms a16: 0
00260020 HeapAlloc16 nz: 22 ms a16: 0
The problem is that _alloca can be used only for allocations up to 40800 bytes; beyond that, the guard pages kick in (see this old thread for the reasons (http://masm32.com/board/index.php?topic=2003.msg20954#msg20954)). Speed-wise it is of course identical to StackBuffer(). Source & exe attached.
@JJ
Yeah, sadly align 32 doesn't work in normal .data .code settings. I remember posting something similar here (http://masm32.com/board/index.php?topic=6194.0)
It's either you manually pad them (macros etc) or use SEGMENT.
If its for DATA, use dynamic memory and align it yourself. Verx jarst phine. :biggrin:
Quote from: jj2007 on May 20, 2017, 11:02:51 AM
@José: I got it running now.
The problem is that _alloca can be used only for allocations up to 40800 bytes; beyond that, the guard pages kick in
I downloaded the attachment and noticed a couple of points:
1) You are using the Pascal version of _alloca in the test without declaring it was Pascal in the Proc. Moreover, the Pascal version was buggy, I fixed it and modified the initial post but forgot to advertise that.. Sorry, you are using the stdcall, it is fine then if everything is stdcall.
2) You can not test _alloca like you did, the effect is that the stack is not freed during the test and will eventually overflow. You need an intermediate function as I have shown in Reply 15. Remember that with _alloca the memory is only freed when the function returns, so you need a function that return during the test, the intermediate function.
3) The problem with the guard pages applies when pages are not committed. If you make a big all committed stack you are all good.
Quote from: jj2007 on May 20, 2017, 08:17:15 AM
More precisely, it works only in this very special segment. Try replacing it with a .data?, and it will choke.
May be we can do it with a BSS segment, like this:
; Uninitialised data placed in a dedicated segment of class 'BSS' that is
; declared with 32-byte segment alignment.
_DATA2 SEGMENT ALIGN(32) FLAT 'BSS'
bigArray DWORD 50000 DUP(?) ; 50000 uninitialised dwords
_DATA2 ends
(http://www.atelierweb.com/a/peheader.png)
Appears to work, although BSS is not in the list of typical segment classes which are:
'DATA', 'CODE', 'CONST' and 'STACK'
Cool 8)
I was looking for a class like the 'BSS' value, couldn't find it.
Tested it and it works.
Found only these class values, 'DATA', 'CODE', 'CONST', 'MODULES' and 'STACK'
https://docs.microsoft.com/nl-nl/cpp/assembler/masm/segment
Quote from: aw27 on May 20, 2017, 04:00:24 PM2) You can not test _alloca like you did, the effect is that the stack is not freed during the test and will eventually overflow. You need an intermediate function as I have shown in Reply 15. Remember that with _alloca the memory is only freed when the function returns, so you need a function that return during the test, the intermediate function.
7-zip tricked me into adding an old version of Allocs.asm. The good one is attached - sorry for that :icon_redface:
Quote3) The problem with the guard pages applies when pages are not committed. If you make a big all committed stack you are all good.
There is kind of a silent understanding here to avoid 'special' commandline options, but of course what you propose is possible. Using the options, results are as shown below. StackBuffer() (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1255) works with default options, and timings are identical; _alloca does not zero-init the pages, so compare to StackBuffer(size, nz).
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz
Allocating 50000*40000 bytes:
00186320 _alloca: 1 ms a16: 0
00186300 StackBuffer z: 83 ms a16: 0
00186300 StackBuffer nz: 1 ms a16: 0
00662DE0 GlobalAlloc z: 99 ms a16: 0
00662DE0 HeapAlloc nz: 13 ms a16: 0
00662DE4 SysAlloc nz: 18 ms a16: 1
00662DE0 HeapAlloc16 z: 100 ms a16: 0
00662DE0 HeapAlloc16 nz: 13 ms a16: 0
Allocating 50000*110000 bytes:
001751C0 _alloca: 1 ms a16: 0
00175180 StackBuffer z: 240 ms a16: 0
00175180 StackBuffer nz: 1 ms a16: 0
00662DE0 GlobalAlloc z: 255 ms a16: 0
00662DE0 HeapAlloc nz: 15 ms a16: 0
00662DE4 SysAlloc nz: 19 ms a16: 1
00662DE0 HeapAlloc16 z: 255 ms a16: 0
00662DE0 HeapAlloc16 nz: 15 ms a16: 0
Allocating 50000*302500 bytes:
001461C0 _alloca: 1 ms a16: 0
001461C0 StackBuffer z: 678 ms a16: 0
001461C0 StackBuffer nz: 1 ms a16: 0
00662DE0 GlobalAlloc z: 678 ms a16: 0
00662DE0 HeapAlloc nz: 15 ms a16: 0
00662DE4 SysAlloc nz: 19 ms a16: 1
00662DE0 HeapAlloc16 z: 676 ms a16: 0
00662DE0 HeapAlloc16 nz: 15 ms a16: 0
Allocating 50000*831875 bytes:
000C4DE0 _alloca: 1 ms a16: 0
000C4DC0 StackBuffer z: 1825 ms a16: 0
000C4DC0 StackBuffer nz: 1 ms a16: 0
022B0020 GlobalAlloc z: 265 ms a16: 0
022B0020 HeapAlloc nz: 261 ms a16: 0
022B0024 SysAlloc nz: 273 ms a16: 1
022B0020 HeapAlloc16 z: 204 ms a16: 0
022B0020 HeapAlloc16 nz: 204 ms a16: 0
Quote from: jj2007 on May 20, 2017, 07:57:53 PM
There is kind of a silent understanding here to avoid 'special' commandline options
Actually you don't need any special command line options to make _alloca make all the tests. All you have to do is turn the reserved stack pages into committed pages
at run time. And this is done by
probing them. This is actually what Microsoft's open source _chkstk does, so I included it here in the middle of the example. In this example,
before the test starts, you probe 850000 bytes from the default stack of 1MB, which makes more than enough space for your maximum allocation of 831875 bytes :t
.686
.XMM
.MODEL FLAT, STDCALL
OPTION CASEMAP:NONE
includelib \Masm32\lib\Kernel32.lib
_PAGESIZE_ equ 1000h
.code
; _chkstk / _alloca_probe - lower ESP by EAX bytes while touching every page
; on the way down.
; In:    EAX = number of bytes to allocate
; Out:   ESP = new top of stack, with the return address copied there
;        EAX = return address (overwritten just before returning)
;        ECX is saved and restored
; A request that wraps around address 0 is turned into a target of 0, so the
; probe loop keeps walking down the stack.
_chkstk proc C
_alloca_probe = _chkstk
push ecx
; Calculate new TOS.
lea ecx, [esp] + 8 - 4 ; TOS before entering function + size for ret value
sub ecx, eax ; new TOS (Top of Stack)
; Handle allocation size that results in wraparound.
; Wraparound will result in StackOverflow exception.
sbb eax, eax ; 0 if CF==0, ~0 if CF==1
not eax ; ~0 if TOS did not wrap around, 0 otherwise
and ecx, eax ; set to 0 if wraparound
mov eax, esp ; current TOS
and eax, not ( _PAGESIZE_ - 1) ; Round down to current page boundary
cs10:
cmp ecx, eax ; Is new TOS
bnd jb short cs20 ; in probed page?
mov eax, ecx ; yes.
pop ecx
xchg esp, eax ; update esp
mov eax, dword ptr [eax] ; get return address
mov dword ptr [esp], eax ; and put it at new TOS
bnd ret
; Find next lower page and probe
cs20:
sub eax, _PAGESIZE_ ; decrease by PAGESIZE
test dword ptr [eax],eax ; probe page.
jmp short cs10
_chkstk endp
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
; _alloca, stdcall: reserves thesize bytes in the caller's stack frame, rounded
; down to a multiple of alignm (power of two), and returns the address in EAX.
; No prologue is generated, so [esp] is the return address on entry; both
; arguments are removed here.
; This variant does not probe stack pages; the demo commits the stack up front
; by calling _chkstk once before the test loop.
_alloca proc stdcall public thesize:dword, alignm:dword
    pop     ecx                     ; return address
    pop     eax                     ; thesize
    pop     edx                     ; alignm
    neg     eax
    neg     edx                     ; EDX = alignment mask
    add     eax, esp                ; EAX = ESP - thesize
    and     eax, edx                ; EAX = aligned block address
    mov     esp, eax
    push    ecx                     ; return address back on top
    ret
_alloca endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; p0 - testbed case for _alloca: allocates allocSize bytes (32-byte aligned) on
; the stack, records whether the address is 16-byte aligned and writes marker
; dwords at the start, the end and the middle of the block.
; Out: EAX = address of the block, ECX = a16 (0 when 16-byte aligned)
; The block disappears when this proc's epilogue restores ESP.
p0 proc dummy1, dummy2, allocSize
    Local pFatBuffer, a16:BYTE
    invoke  _alloca, allocSize, 32
    mov     pFatBuffer, eax
    test    al, 15                          ; low four address bits clear?
    setne   a16                             ; setne does not change the flags
    jnz     store_unaligned
    movaps  [eax], xmm0                     ; 16-byte aligned: aligned store
    jmp     store_done
store_unaligned:
    movups  [eax], xmm0                     ; otherwise use the unaligned store
store_done:
    mov     dword ptr [eax], 11111111h      ; marker at the first dword
    mov     edx, allocSize
    mov     dword ptr [eax+edx-4], 33333333h ; marker at the last dword
    shr     edx, 1
    mov     dword ptr [eax+edx], 22222222h  ; marker at the midpoint
    movsx   ecx, a16
    mov     eax, pFatBuffer
    ret
p0 endp
; Entry point of the pre-probing demo.
; 1. Commits 850000 bytes of stack by letting _chkstk probe them once, then
;    puts ESP back where it was.
; 2. Calls p0 `loops` times, each call allocating `allobytes` bytes via _alloca.
;
; Fixes:
; - "mov ebx, loops-1" did not load loops minus one: loops is a stack local, so
;   the expression addressed the dword one byte below it and loaded an
;   unrelated value. The counter is now loaded and then decremented.
; - EBX is callee-saved and is now preserved with USES.
start proc uses ebx
    LOCAL loops : dword
    LOCAL allobytes : dword
    LOCAL oldEsp : dword
    mov     loops, 50000
    mov     allobytes, 831875

    mov     oldEsp, esp                     ; _chkstk lowers ESP; remember the original value
    mov     eax, 850000                     ; bytes to probe (more than the largest allocation)
    invoke  _chkstk
    mov     esp, oldEsp                     ; keep the pages committed, drop the allocation

    mov     ebx, loops
    dec     ebx                             ; count from loops-1 down to 0 inclusive
    .Repeat
        invoke p0, 123, 456, allobytes
        dec ebx
    .Until Sign?
    ret
start endp
end start
end
Quote from: aw27 on May 21, 2017, 02:34:19 AMturn the reserved stack pages into committed pages at run time. And this is done by probing them.
Yes, this is what StackBuffer() (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1255) does, too:
Quote- StackBuffer does the stack probing for you; up to about half a megabyte, it is significantly faster than HeapAlloc
The "significantly faster" refers to the zeroing version. The nz version is always a lot faster than *Alloc.
Btw chkstk = open source? It's not in the VC header files, but of course, disassembling would be easy.
Quote from: jj2007 on May 21, 2017, 02:37:33 AM
Quote from: aw27 on May 21, 2017, 02:34:19 AMturn the reserved stack pages into committed pages at run time. And this is done by probing them.
Yes, this is what StackBuffer() (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1255) does, too:Quote- StackBuffer does the stack probing for you; up to about half a megabyte, it is significantly faster than HeapAlloc
All is explained then. :t
It's hard to believe how much uncommented garbage you allow some jerks to post here.
:biggrin:
I would not sit up at night silently sobbing about it. :tongue: