I'm not sure why aligning the locals to 16 bytes isn't working for you. Are you calling the function from an HLL, or is it being run directly from an asm app?
Here is an example from my side, in which the locals bob/bob1 are 16-byte aligned every time:
; Module-wide assembler options.
; NOTE(review): the option meanings noted here are recalled from the
; JWasm/UASM manual, not shown in this file -- confirm before relying on them.
; frame:auto -- presumably emits SEH unwind data for procedures automatically.
option frame:auto
; win64:6 -- presumably bit 1 (reserve INVOKE stack space once per procedure)
; plus bit 2 (16-byte alignment for LOCAL variables).
option win64:6
; stackbase:rsp -- presumably addresses parameters and locals relative to RSP.
option stackbase:rsp
; Build commands (adjust the tool paths for your machine):
; c:\jwasm\hjwasm64 -c -win64 -Zi -Zd -Zf -Zp8 aw.asm
; d:\vs2015\vc\bin\link /subsystem:console /machine:x64 /debug /entry:proc1 /Libpath:"%WINSDK%\v7.1\Lib\x64" aw.obj
; NOTE(review): the link line passes /entry:proc1, while the END directive at
; the bottom of the file names WinMainCRTStartup -- confirm which is intended.
; View of a 128-bit value as four single-precision floats.
__m128f STRUCT
    f0  REAL4 ?
    f1  REAL4 ?
    f2  REAL4 ?
    f3  REAL4 ?
__m128f ENDS

; View of a 128-bit value as two 64-bit integers.
__m128q STRUCT
    q0  QWORD ?
    q1  QWORD ?
__m128q ENDS

; 128-bit value that can be accessed through either view:
;   f32 -- four REAL4 lanes, q64 -- two QWORD lanes.
__m128 UNION
    f32 __m128f <>
    q64 __m128q <>
__m128 ENDS
; Select AVX as the architecture setting for what follows.
; NOTE(review): presumably this controls which encodings the assembler's
; built-in macros (LOADSS/LOADSD, used under start:) emit -- confirm.
OPTION ARCH:AVX
; Import libraries handed to the linker.
includelib kernel32.lib
includelib user32.lib
; External message-box routines called through INVOKE in proc1.
externdef MessageBoxW : near
externdef MessageBoxA : near
; Prototypes for INVOKE: each routine is declared with four QWORD parameters.
MessageBoxW PROTO :qword, :qword, :qword, :qword
MessageBoxA PROTO :qword, :qword, :qword, :qword
.data
; Automatic type promotion from integer to float: the literal 2 is stored as a REAL4.
aReal REAL4 2

; The __m128 values below are 16 bytes each. Start them on a 16-byte boundary
; so that aligned loads (movaps/vmovaps) from them cannot fault; without this
; the 4-byte aReal in front of them leaves their alignment to chance.
align 16

; Initializing a union with floats (first sub-type), using normal syntax as
; well as the hjwasm 2.17 update that promotes integer literals to float.
myVec1 __m128 { < 1.0, 2.0, 3.0, 4.0 > }
myVec2 __m128 { < 1, 2, 3, 4 > }

; Hjwasm 2.22 enhanced union type: the sub-type used for initialisation can be
; named directly (.f32 or .q64). Naming a sub-type that does not exist, such
; as .f33, makes hjwasm emit an error.
myVec4 __m128.f32 { < 1.0, 2.0, 3.0, 4.0 > }
myVec3 __m128.q64 { < 0x1234, 0x5678 > }
myVec5 __m128.f32 { < 1.0, 2.0, 3.0, 4.0 > }

; Scalar float passed from memory to newproc in proc1.
floatVar real4 2.3
; Wide string used as both text and caption for MessageBoxW in proc1.
; NOTE(review): presumably hjwasm expands a string literal under dw to one
; 16-bit word per character -- confirm against the assembler manual.
awideStr dw "wide caption ",0
.code
; Straight-line demo: LOADSS/LOADSD with immediate float operands, with the
; ARCH option switched to SSE for the second load and back to AVX afterwards.
; NOTE(review): there is no RET here, so execution that reaches this label
; falls through into proc2 below -- confirm that is acceptable.
start:
    LOADSS xmm0,2.0
    OPTION ARCH:SSE
    LOADSS xmm1,3.0
    OPTION ARCH:AVX
    LOADSD xmm2,4.0
; proc2: public procedure with no parameters that returns immediately.
; It used to assemble with a useless `sub rsp,8`; that has been FIXED.
proc2 PROC PUBLIC
    ret
proc2 ENDP
; sub1: public procedure declaring two pointer parameters; the body only returns.
sub1 PROC PUBLIC arg1:ptr, arg2:ptr
    ret
sub1 ENDP
; sub2: public procedure declaring two pointer parameters; the body only returns.
; rdi and xmm0 are listed in USES so the generated prologue/epilogue
; preserves them around the (empty) body.
sub2 PROC PUBLIC USES rdi xmm0 arg1:ptr, arg2:ptr
    ret
sub2 ENDP
; newproc3: declares two QWORD parameters and returns without touching them.
; proc1 invokes it with ASCII and wide string literals as the second argument.
newproc3 PROC arg1:qword, arg2:qword
    ret
newproc3 ENDP
; newproc: declares a QWORD and a REAL4 parameter and loads the REAL4 into xmm3.
newproc PROC arg1:qword, arg2:real4
    ; Known issue noted by the author: with option win64:7 this load is
    ; assembled as [rbp+20h], but it SHOULD be [rbp+18h].
    movss   xmm3, arg2
    ret
newproc ENDP
; newproc2: five-parameter procedure that reads arg2..arg5 through their
; parameter names, to show how the assembler addresses each one.
;   In:    arg1:qword, arg2:real4, arg3:dword, arg4:dword, arg5:dword
;   Out:   eax = arg3, ecx = arg5, xmm3 = arg2
;   Saves: rbx -- it is callee-saved in the Microsoft x64 convention, so it is
;          listed in USES and restored on exit even though the body writes ebx.
newproc2 proc FRAME uses rbx arg1:qword, arg2:real4, arg3:dword, arg4:dword, arg5:dword
    ; Known issue noted by the author: with option win64:7 this load is
    ; assembled as [rbp+20h], but it SHOULD be [rbp+18h].
    movss   xmm3, arg2
    mov     eax, arg3
    mov     ebx, arg4           ; scratch use only; rbx is restored by the epilogue
    mov     ecx, arg5
    ret
newproc2 endp
; newproc5: same parameter list as newproc2, but the body references none of them.
; This one will implement FPO (frame pointer omission), as no parameters or
; locals are used.
;   Out:   eax = 0
;   Saves: rbx -- it is callee-saved in the Microsoft x64 convention, so it is
;          listed in USES and restored on exit even though the body writes ebx.
newproc5 proc FRAME uses rbx arg1:qword, arg2:real4, arg3:dword, arg4:dword, arg5:dword
    xor     eax, eax
    mov     ebx, eax            ; scratch use only; rbx is restored by the epilogue
    ret
newproc5 endp
; proc1: exercises INVOKE argument passing (general registers, XMM registers,
; a memory operand, ASCII and wide string literals), FP4() float literals, and
; two XMMWORD locals that are accessed with aligned vmovaps at the end.
;   In:  arg1, arg2, arg3 : qword
;   Out: rax = copy of the incoming rcx taken at entry (via r9)
; NOTE(review): r9, r10 and r11 are volatile in the Microsoft x64 convention,
; so the copies made at entry are not guaranteed to survive the calls below;
; rax, rcx, ebx and xmm4 are also passed to INVOKE without being set here.
proc1 PROC FRAME arg1:qword, arg2:qword, arg3:qword
    LOCAL bob:XMMWORD
    LOCAL bob1:XMMWORD

    ; Copy the first three register arguments into r9/r10/r11.
    mov     r9, rcx
    mov     r10, rdx
    mov     r11, r8

    ; ASCII string literal passed directly as an INVOKE argument.
    INVOKE  newproc3, rax, "this is an ascii string"

    ; Float literals through the FP4() macro; each load overwrites xmm1.
    movss   xmm1, FP4(1.28)
    movss   xmm1, FP4(2.28)
    movss   xmm1, FP4(3.28)

    ; Message boxes: wide data via ADDR, then ASCII and wide (L"...") literals.
    INVOKE  MessageBoxW, 0, ADDR awideStr, ADDR awideStr, 0
    INVOKE  MessageBoxA, 0, "yay string literals", "oops", 0
    INVOKE  newproc3, rax, "this is an ascii string"
    INVOKE  newproc3, rcx, L"a wide string yay"
    INVOKE  MessageBoxW, 0, L"yay wide string literal", ADDR awideStr, 0
    INVOKE  MessageBoxA, 0, "yay string literals2", "oops", 0
    INVOKE  MessageBoxA, 0, "yay string literals3", "oops", 0
    INVOKE  MessageBoxA, 0, "yay string literals4", "oops", 0

    ; Five-argument calls mixing 64-bit, XMM and 32-bit register operands.
    INVOKE  newproc2, rax, xmm4, ebx, r10d, r11d
    INVOKE  newproc5, rax, xmm4, ebx, r10d, r11d

    ; REAL4 argument supplied from an XMM register, from memory, and from xmm1.
    INVOKE  newproc, rax, xmm4
    INVOKE  newproc, rax, floatVar
    INVOKE  newproc, rax, xmm1

    ; Pointer-parameter procedures called with register operands.
    INVOKE  sub1, r10, r8
    INVOKE  sub2, r9, r11

    mov     rax, r9

    ; Aligned 128-bit accesses to the locals: these are the instructions that
    ; require bob and bob1 to sit on 16-byte boundaries.
    vmovaps xmm0, bob
    vmovaps bob1, xmm1
    ret
proc1 ENDP
; WinMainCRTStartup: calls proc1 with the constants 10, 20 and 30, then returns.
WinMainCRTStartup PROC FRAME
    INVOKE  proc1, 10, 20, 30
    ret
WinMainCRTStartup ENDP

; End of module; WinMainCRTStartup is named as the start address.
END WinMainCRTStartup