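Simple test case: three ways to set up and tear down a stack frame that holds xmm0 and xmm1 (enter/leave, sub rsp/add rsp, and push rbp/mov rbp,rsp/sub rsp/leave):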

; Variant 0 (listed as "0.asm: enter/leave" in the results):
; the frame is created by the single ENTER instruction.
; enter 32,0 behaves like: push rbp / mov rbp,rsp / sub rsp,32 (nesting level 0).
` enter 32,0`

; Store xmm0 in the 16 bytes directly below the saved frame pointer.
; NOTE(review): movaps needs a 16-byte-aligned address, so this assumes rbp
; is 16-byte aligned after the frame is built - confirm against the harness.
movaps [rbp-16],xmm0

; Store xmm1 in the second 16-byte slot, at the bottom of the 32-byte frame.
movaps [rbp-32],xmm1

; leave = mov rsp,rbp / pop rbp: drops the locals and restores the previous rbp.
leave

; Variant 1 (listed as "1.asm: sub rsp" in the results):
; frameless version - rbp is not touched, locals are addressed from rsp.
; Reserves 40 bytes: two 16-byte slots plus 8 bytes of padding.
; NOTE(review): the extra 8 presumably re-aligns rsp to 16 bytes after the
; return address pushed by call (rsp % 16 == 8 on entry) - confirm against
; the harness, which is not shown here.
` sub rsp,16*2 + 8`

; Store xmm0 in the lowest 16-byte slot of the reserved area.
movaps [rsp+0],xmm0

; Store xmm1 in the next 16-byte slot.
movaps [rsp+16],xmm1

; Release exactly the amount reserved above; must match the sub operand.
add rsp,16*2 + 8

; Variant 2 (listed as "2.asm: push rbp" in the results):
; the classic frame-pointer prologue, i.e. ENTER 32,0 written out as
; three separate instructions.
; Save the previous frame pointer.
` push rbp`

; Establish the new frame pointer at the current stack top.
mov rbp,rsp

; Reserve 32 bytes of locals (two 16-byte slots) below rbp.
sub rsp,32

; Store xmm0 in the 16 bytes directly below the saved frame pointer.
; NOTE(review): movaps needs a 16-byte-aligned address, so this assumes rbp
; is 16-byte aligned after the push - confirm against the harness.
movaps [rbp-16],xmm0

; Store xmm1 in the second 16-byte slot, at the bottom of the 32-byte frame.
movaps [rbp-32],xmm1

; leave = mov rsp,rbp / pop rbp: drops the locals and restores the previous rbp.
leave

`Intel(R) Core(TM) i3 CPU 540 @ 3.07GHz (SSE4.2)`

----------------------------------------------

-- test(1)

38787 cycles, rep(3000), code( 14) 0.asm: enter/leave

14545 cycles, rep(3000), code( 18) 1.asm: sub rsp

12832 cycles, rep(3000), code( 18) 2.asm: push rbp

-- test(2)

38926 cycles, rep(3000), code( 14) 0.asm: enter/leave

15945 cycles, rep(3000), code( 18) 1.asm: sub rsp

17186 cycles, rep(3000), code( 18) 2.asm: push rbp

-- test(3)

39026 cycles, rep(3000), code( 14) 0.asm: enter/leave

15001 cycles, rep(3000), code( 18) 1.asm: sub rsp

14988 cycles, rep(3000), code( 18) 2.asm: push rbp

total [1 .. 3], 1++

45006 cycles 2.asm: push rbp

45491 cycles 1.asm: sub rsp

116739 cycles 0.asm: enter/leave