I did a quick test piece that just loops the 3 combinations of register preservation, and found that the test conditions changed the results in humorous ways. Increasing the priority improved the XMM version, commenting the cache-clearing CPUID in and out seemed to favour the MMX register version, and if you turned off the priority increase, CPUID and SleepEx, the integer version was faster. All have the same problem of cache saturation, as the tests are too short to be useful, even with a very high iteration count.
Since the PIII, the action has been in instruction scheduling, and narrow instruction testing is not far off useless. Sequencing instructions through multiple pipelines without stalls is far more useful when chasing speed, as long as you stay away from very old instructions that only live in microcode and really slow the works up. The problem with narrow instruction testing is the assumption that processors work like a pre-i486 instruction chomper, and that world is long gone.
For the very little that it's worth, this is the experimental test piece.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
include \masm32\include64\masm64rt.inc
.code
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; Benchmark parameters. TEST_PASS_SHIFT must equal log2(TEST_PASSES) because
; the per-variant averages are produced with a right shift.
TEST_PASSES     equ 8
TEST_PASS_SHIFT equ 3
CALLS_PER_RUN   equ 1000000000

; ------------------------------------------------------------------
; entry_point - times three ways of parking RSI/RDI in other registers
; (integer, MMX, XMM). Each pass calls intreg, mmxreg and xmmreg
; CALLS_PER_RUN times, prints the elapsed GetTickCount milliseconds for
; each, and after TEST_PASSES passes prints the average per variant.
;
; Syntax : MASM (ml64) with the masm64 SDK macros from masm64rt.inc
; ABI    : Microsoft x64
; Regs   : r13 = passes remaining
;          r14 = tick count at the start of the current run
;          r15 = calls remaining in the current run
;          (r13-r15 are callee-saved; USING / SaveRegs / RestoreRegs are
;           SDK macros that presumably preserve them - confirm in the SDK)
; Locals : ireg / mreg / xreg = accumulated milliseconds per variant
; ------------------------------------------------------------------
entry_point proc
    USING r13, r14, r15
    LOCAL ireg :QWORD
    LOCAL mreg :QWORD
    LOCAL xreg :QWORD

    mov ireg, 0
    mov mreg, 0
    mov xreg, 0

    SaveRegs
  ; HighPriority                        ; optional: raise priority while timing
    mov r13, TEST_PASSES

next_pass:
  ; ------------------------------------
  ; integer register variant
  ; ------------------------------------
    mov r15, CALLS_PER_RUN
  ; rcall SleepEx, 100,0                ; optional: yield before timing
  ; cpuid                               ; optional: serialise before timing
    call GetTickCount
    mov r14d, eax                       ; GetTickCount returns a DWORD: only eax is defined
  @@:
    call intreg
    sub r15, 1
    jnz @B
    call GetTickCount
    sub eax, r14d                       ; 32-bit modular difference: correct across the
                                        ; 49.7 day wrap, result zero-extended into rax
    add ireg, rax
    conout str$(rax)," intreg",lf

  ; ------------------------------------
  ; MMX register variant
  ; ------------------------------------
    mov r15, CALLS_PER_RUN
  ; rcall SleepEx, 100,0
  ; cpuid
    call GetTickCount
    mov r14d, eax
  @@:
    call mmxreg
    sub r15, 1
    jnz @B
    call GetTickCount
    sub eax, r14d
    emms                                ; outside the timed region: release the x87/MMX
                                        ; state that mmxreg left marked as in use
    add mreg, rax
    conout str$(rax)," mmxreg",lf

  ; ------------------------------------
  ; XMM register variant
  ; ------------------------------------
    mov r15, CALLS_PER_RUN
  ; rcall SleepEx, 100,0
  ; cpuid
    call GetTickCount
    mov r14d, eax
  @@:
    call xmmreg
    sub r15, 1
    jnz @B
    call GetTickCount
    sub eax, r14d
    add xreg, rax
    conout str$(rax)," xmmreg",lf

  ; ------------------------------------
    sub r13, 1
    jnz next_pass

  ; average = accumulated total / TEST_PASSES
    shr ireg, TEST_PASS_SHIFT
    conout " INT Reg Average ",str$(ireg),lf
    shr mreg, TEST_PASS_SHIFT
    conout " MMX Reg Average ",str$(mreg),lf
    shr xreg, TEST_PASS_SHIFT
    conout " XMM Reg Average ",str$(xreg),lf

  ; NOTE(review): this runs even while HighPriority above is commented out;
  ; kept as in the original.
    NormalPriority
    waitkey
    RestoreRegs
    .exit
entry_point endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; ------------------------------------------------------------------
; intreg - benchmark payload: parks RSI and RDI in the integer registers
; R11 and R10, then copies them straight back.
; In/Out : RSI and RDI hold the same values on return as on entry.
; Writes : R10, R11 (both volatile in the Microsoft x64 convention).
; The instruction sequence is the thing being timed - do not reorder it.
; NOSTACKFRAME / STACKFRAME are masm64 SDK macros; presumably they switch
; the automatic prologue/epilogue off for this leaf and back on afterwards
; - confirm against the SDK macro file.
; ------------------------------------------------------------------
NOSTACKFRAME
intreg proc
mov r11, rsi            ; park rsi
mov r10, rdi            ; park rdi
mov rsi, r11            ; restore rsi
mov rdi, r10            ; restore rdi
ret
intreg endp
STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; ------------------------------------------------------------------
; mmxreg - benchmark payload: parks RSI and RDI in the MMX registers
; MM0 and MM1, then copies them straight back.
; In/Out : RSI and RDI hold the same values on return as on entry.
; Writes : MM0, MM1.
; The instruction sequence is the thing being timed - do not reorder it.
; NOTE(review): no EMMS is issued here. MMX registers alias the x87
; register stack, so a caller that goes on to run x87 code should execute
; EMMS first.
; NOSTACKFRAME / STACKFRAME are masm64 SDK macros; presumably they switch
; the automatic prologue/epilogue off for this leaf and back on afterwards
; - confirm against the SDK macro file.
; ------------------------------------------------------------------
NOSTACKFRAME
mmxreg proc
movq mm0, rsi           ; park rsi
movq mm1, rdi           ; park rdi
movq rsi, mm0           ; restore rsi
movq rdi, mm1           ; restore rdi
ret
mmxreg endp
STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; ------------------------------------------------------------------
; xmmreg - benchmark payload: parks RSI and RDI in the low 64 bits of
; XMM0 and XMM1, then copies them straight back.
; In/Out : RSI and RDI hold the same values on return as on entry.
; Writes : XMM0, XMM1 (both volatile in the Microsoft x64 convention).
; The instruction sequence is the thing being timed - do not reorder it.
; NOSTACKFRAME / STACKFRAME are masm64 SDK macros; presumably they switch
; the automatic prologue/epilogue off for this leaf and back on afterwards
; - confirm against the SDK macro file.
; ------------------------------------------------------------------
NOSTACKFRAME
xmmreg proc
movq xmm0, rsi          ; park rsi
movq xmm1, rdi          ; park rdi
movq rsi, xmm0          ; restore rsi
movq rdi, xmm1          ; restore rdi
ret
xmmreg endp
STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end