I have been playing with this idea so that larger data types can later be isolated in their own segments to maintain the correct alignment. The sizes that matter most are AVX and AVX2, but it was easy enough to add SSE alignment as well. It means that, on the fly inside a procedure, you can place an immediate (an initialised value) at a correctly aligned data location with no melodrama.
The stackframe macros can already align the LOCAL data in procedures so that the first item is correctly aligned for the larger data sizes. This requires the discipline of declaring the largest data items first and adding any others in descending size order to maintain alignment, but it is simple enough to do.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
include \masm32\include64\masm64rt.inc
; ---------------------------------------------------------------------
; Text macros that open a data segment with a fixed alignment.
; Writing the macro name on a line of its own expands to the SEGMENT
; statement, so data declared after it lands in that segment.
; Each macro is available in a lower case and an upper case spelling;
; the upper case name simply expands to the lower case one.
; ---------------------------------------------------------------------
.sse_data TEXTEQU <SEG16 SEGMENT align(16)>     ; 16 byte alignment
.SSE_DATA TEXTEQU <.sse_data>

.ymm_data TEXTEQU <SEG32 SEGMENT align(32)>     ; 32 byte alignment
.YMM_DATA TEXTEQU <.ymm_data>

.zmm_data TEXTEQU <SEG64 SEGMENT align(64)>     ; 64 byte alignment
.ZMM_DATA TEXTEQU <.zmm_data>
; ---------------------------------------------------------------------
; Eight uninitialised YMMWORD (32 byte) items placed in the align(32)
; segment opened by the .ymm_data text macro. Because every item is
; exactly 32 bytes, each one stays on a 32 byte boundary as long as
; nothing smaller is declared between them.
; testproc stores ymm8 - ymm15 into these items and reloads them.
; ---------------------------------------------------------------------
.ymm_data
avx2a YMMWORD ?                         ; written from ymm8  in testproc
avx2b YMMWORD ?                         ; written from ymm9  in testproc
avx2c YMMWORD ?                         ; written from ymm10 in testproc
avx2d YMMWORD ?                         ; written from ymm11 in testproc
avx2e YMMWORD ?                         ; written from ymm12 in testproc
avx2f YMMWORD ?                         ; written from ymm13 in testproc
avx2g YMMWORD ?                         ; written from ymm14 in testproc
avx2h YMMWORD ?                         ; written from ymm15 in testproc
.code                                   ; what follows is code again
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
YMMSTACK                                    ; YMM reg alignment for the LOCALs of the proc below
; ---------------------------------------------------------------------
; entry_point
;
; Demonstrates 32 byte aligned AVX access on two kinds of storage:
;   1. a heap block that aalloc aligned to a 4096 byte boundary
;   2. a YMMWORD LOCAL declared first in the YMM aligned stack frame
; It then calls testproc, waits for a key, frees the block and exits.
;
; Every YMM move uses vmovdqa, the ordinary aligned move. It raises a
; general protection fault when the address is not 32 byte aligned, so
; a clean run shows that the alignment holds. Non-temporal moves are
; deliberately not used: the data is read back straight away, and a
; non-temporal store would push it past the cache first.
;
; In:    nothing
; Out:   does not return (.exit 0)
; ---------------------------------------------------------------------
entry_point proc
    LOCAL avx1 :YMMWORD                     ; YMM aligned at proc start
    LOCAL avx2 :YMMWORD
    LOCAL avx3 :YMMWORD
    LOCAL avx4 :YMMWORD
  ; -------------------------------------------
  ; smaller data sizes AFTER YMM aligned locals
  ; -------------------------------------------
    LOCAL pMem :QWORD                       ; allocated memory pointer, handed to mfree at the end
    LOCAL aMem :QWORD                       ; aligned memory pointer used for the YMM access

  ; 64 MB request (1024*1024*64) aligned to a 4096 byte page boundary
  ; NOTE(review): the result is used without a failure check; what aalloc
  ; returns when the allocation fails is not visible here - confirm.
    mov aMem, aalloc(pMem,1024*1024*64,4096)

    mov     rax, aMem                       ; aligned address (aalloc already left it in rax)
    vmovdqa ymm0, YMMWORD PTR [rax]         ; load the 32 bytes AT the pointer; block was never written, value is arbitrary
    vmovdqa avx1, ymm0                      ; copy register to aligned local
    vmovdqa ymm1, avx1                      ; copy aligned local back to a YMM reg

    call testproc

  ; ymm0, ymm1 and the ymm8 - ymm15 reloaded by testproc leave the upper
  ; YMM halves in use. Clear them before running the console and exit
  ; macros so non-VEX SSE code they may reach pays no AVX/SSE transition cost.
    vzeroupper

    waitkey
    mfree pMem                              ; release original allocation
    .exit 0
entry_point endp
STACKFRAME                                  ; NOTE(review): presumably selects the SDK's standard stack frame again after YMMSTACK - confirm in the SDK macros
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; testproc
;
; Saves ymm8 - ymm15 into the 32 byte aligned globals avx2a - avx2h,
; prints a message, then reloads all eight registers from those globals.
; It also declares two YMMWORD items in the aligned segment from inside
; the procedure body and round-trips ymm15 through one of them, to show
; that aligned data can be declared on the fly.
;
; The moves are plain aligned vmovdqa: they fault on an address that is
; not 32 byte aligned, and unlike non-temporal stores they keep the data
; in cache for the reload a few instructions later.
;
; In:    nothing
; Out:   nothing; ymm8 - ymm15 hold their entry values again on return
; Clobb: whatever the conout macro clobbers (not visible here)
; ---------------------------------------------------------------------
testproc proc

  ; save the eight registers into the aligned globals
    vmovdqa avx2a, ymm8
    vmovdqa avx2b, ymm9
    vmovdqa avx2c, ymm10
    vmovdqa avx2d, ymm11
    vmovdqa avx2e, ymm12
    vmovdqa avx2f, ymm13
    vmovdqa avx2g, ymm14
    vmovdqa avx2h, ymm15

  ; aligned data declared in the middle of the procedure
  .ymm_data
    inited YMMWORD 1234567890.0             ; initialise an immediate
    uninit YMMWORD ?                        ; uninitialised, not referenced in this proc
  .code

    vmovdqa inited, ymm15                   ; overwrites the initial value of inited with ymm15

    conout "Save and restore AVX2 registers.",lf,lf

    vmovdqa ymm15, inited                   ; read it back from the on-the-fly item

  ; reload the eight registers from the aligned globals
    vmovdqa ymm8, avx2a
    vmovdqa ymm9, avx2b
    vmovdqa ymm10, avx2c
    vmovdqa ymm11, avx2d
    vmovdqa ymm12, avx2e
    vmovdqa ymm13, avx2f
    vmovdqa ymm14, avx2g
    vmovdqa ymm15, avx2h                    ; final value of ymm15 comes from avx2h, not inited

    ret
testproc endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end