Be warned that this is experimental code, written with the reference material from Ray Chen on stack alignment requirements to hand. It currently handles the entry procedure and uses a simple, pure-mnemonic memory copy procedure, so it has yet to be tested on nested procedures. A number of macros from the macros64.inc file are used in the example, mainly to compact and automate the ordinary API function calls so that you don't have to mess around with individual alignments. The LOCAL64 macro doubles up each QWORD-sized local to maintain alignment. The "align8_rsp" macro aligns the stack on an 8-byte boundary for the API function calls that follow.
It is still rather crude code, but so far it's working OK and it gives a reasonably clean coding interface. I have no doubt it can be improved on.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
OPTION DOTNAME
option casemap:none
include \masm64\include\win64.inc
include \masm64\include\temphls.inc
include \masm64\include\kernel32.inc
include \masm64\include\user32.inc
include \masm64\include\msvcrt.inc
includelib \masm64\lib\user32.lib
includelib \masm64\lib\kernel32.lib
includelib \masm64\lib\msvcrt.lib
include macros64.inc
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; --------------
; gigabyte count
; --------------
; Text equate: substituted into the size expression 1024*1024*1024*gbcnt
; used by main, so each of the two buffers (and the copy) is gbcnt GB.
gbcnt equ <8>
.data
; zero-terminated string passed by main to MessageBox as its third argument
pttl db "Milliseconds duration",0
.code
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
main proc
; ---------------------------------------------------------------------
; Entry procedure. Allocates two buffers of gbcnt gigabytes each, times
; a single mcopy64 pass from pMem to hMem with GetTickCount, displays
; the elapsed milliseconds in a message box, releases both buffers and
; exits. If either allocation fails, nothing is copied and the process
; exits through exit64 1 instead of faulting inside mcopy64.
; ---------------------------------------------------------------------
; LOCAL64 macro is to maintain stack alignment of locals.
; each macro adds a dummy local after the named LOCAL to add
; an extra 8 bytes to the stack.
    LOCAL64 pMem
    LOCAL64 hMem
    LOCAL64 tc
; -------------------------------------
; align the stack on an 8 byte boundary
; NOTE(review): the Microsoft x64 convention wants rsp 16 byte aligned
; at each call; confirm what align8_rsp in macros64.inc really emits.
; -------------------------------------
    align8_rsp
; -----------------------------------------------------------
; per the author, the call macros used below provide spill
; space for 4 64 bit registers plus 8 bytes on entry and
; restore the stack on exit.
; -----------------------------------------------------------
    mov pMem, rv64(GlobalAlloc,GMEM_FIXED or GMEM_ZEROINIT,1024*1024*1024*gbcnt)
    mov rax, pMem
    test rax, rax                   ; NULL = source allocation failed
    jz main_nomem

    mov hMem, alloc64(1024*1024*1024*gbcnt)
    mov rax, hMem
    test rax, rax                   ; NULL = destination allocation failed
    jz main_free_src

    mov tc, rv64(GetTickCount)      ; tc = tick count at start
; -----------------------------
; copy memory from pMem to hMem
; -----------------------------
    mov r8, 1024*1024*1024*gbcnt    ; r8  = byte count
    mov rdx, hMem                   ; rdx = destination
    mov rcx, pMem                   ; rcx = source
    call mcopy64
; ---------------------------------------------------------------
; elapsed = now - start. GetTickCount is a 32 bit counter, so only
; the low 32 bits of the difference are kept; this stays correct
; if the counter wraps around between the two readings.
; ---------------------------------------------------------------
    sub rv64(GetTickCount), tc
    mov eax, eax                    ; zero extend the low 32 bits into rax
    mov tc, rax                     ; tc = elapsed milliseconds
    fn64 MessageBox,0,buff$(tc),ADDR pttl,0 ; display millisecond timing
; NOTE(review): pMem comes from GlobalAlloc directly while hMem comes
; from alloc64; confirm free64 is the matching release for both.
    free64 pMem ; release memory
    free64 hMem ; release memory
    exit64 0 ; exit the process

; ----------------------------------------------
; allocation failure paths, no copy is attempted
; ----------------------------------------------
main_free_src:
    free64 pMem ; release the buffer that was obtained
main_nomem:
    exit64 1 ; exit the process, 1 = allocation failed
main endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
mcopy64 proc
; ---------------------------------------------------------------
; Forward memory copy: QWORD blocks first, then the 0-7 byte tail.
;
; In:     rcx = source address
;         rdx = destination address
;         r8  = byte count
; Out:    nothing is returned in rax
; Keeps:  rsi and rdi (parked in r11 / r10 for the duration)
; Alters: rcx, r10, r11, flags; direction flag is left cleared
; ---------------------------------------------------------------

  ; park the two string registers in scratch registers
    mov r10, rdi
    mov r11, rsi

  ; load the string instruction operands
    mov rdi, rdx                ; rdi = destination
    mov rsi, rcx                ; rsi = source
    cld                         ; copy upward in memory

  ; bulk of the copy, 8 bytes per iteration
    mov rcx, r8
    shr rcx, 3                  ; rcx = byte count / 8
    rep movsq

  ; remaining bytes, count = byte count mod 8
    mov ecx, r8d                ; low bits suffice, write zero extends into rcx
    and ecx, 7
    rep movsb

  ; hand rsi & rdi back unchanged
    mov rsi, r11
    mov rdi, r10
    retn                        ; kept as retn, same return form as before
mcopy64 endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
comment #
https://msdn.microsoft.com/en-us/library/9z1stfyw.aspx
Volatile
rax rcx rdx r8 r9 r10 r11
Non Volatile
r12 r13 r14 r15 rdi rsi rbx rbp rsp
Volatile
xmm0 ymm0
xmm1 ymm1
xmm2 ymm2
xmm3 ymm3
xmm4 ymm4
xmm5 ymm5
Nonvolatile (XMM), Volatile (upper half of YMM)
xmm6-15
ymm6-15
#
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end