As far as I know, Microsoft doesn't use MASM in any of its 64-bit modules. All speed-critical algorithms (graphics/math) are generated by the compiler. This also applies to CRT code such as memcpy, strlen, and so on.
Reducing code size may in itself increase speed, for example through shorter jump distances and better use of the code cache.
;-----------------------------------------------------------------------
; Benchmark body, 64-bit operand variant (reported as "0.asm: rax" in
; the timing output).
;
; Each iteration writes the immediate 1 to seven caller-saved registers
; and then clears each of them with xor, using full 64-bit operands.
; The 64-bit forms are deliberate: they are what is being measured, so
; do not shorten them to their 32-bit equivalents.
;
; Size: 7 x 7-byte mov + 7 x 3-byte xor = 70 bytes per iteration;
;       50 x 70 + 1 (ret) = 3501 bytes, the code(3501) figure reported.
;-----------------------------------------------------------------------
.code

repeat 50
        ; load phase: 64-bit immediate moves
        mov     rax, 1
        mov     rcx, 1
        mov     rdx, 1
        mov     r8,  1
        mov     r9,  1
        mov     r10, 1
        mov     r11, 1

        ; clear phase: 64-bit xor of each register with itself
        xor     rax, rax
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r8,  r8
        xor     r9,  r9
        xor     r10, r10
        xor     r11, r11
endm

        ret
END
;-----------------------------------------------------------------------
; Benchmark body, 32-bit operand variant (reported as "1.asm: eax" in
; the timing output).
;
; Same instruction pattern as the 64-bit variant, but every operand is
; the 32-bit view of the register. Writing a 32-bit register zeroes the
; upper 32 bits of the full 64-bit register, so the registers still end
; up fully cleared after each iteration.
;
; Size: 3 x 5-byte + 4 x 6-byte mov, 3 x 2-byte + 4 x 3-byte xor
;       = 57 bytes per iteration (r8d..r11d need a REX prefix);
;       50 x 57 + 1 (ret) = 2851 bytes, the code(2851) figure reported.
;-----------------------------------------------------------------------
.code

repeat 50
        ; load phase: 32-bit immediate moves
        mov     eax,  1
        mov     ecx,  1
        mov     edx,  1
        mov     r8d,  1
        mov     r9d,  1
        mov     r10d, 1
        mov     r11d, 1

        ; clear phase: 32-bit xor of each register with itself
        xor     eax,  eax
        xor     ecx,  ecx
        xor     edx,  edx
        xor     r8d,  r8d
        xor     r9d,  r9d
        xor     r10d, r10d
        xor     r11d, r11d
endm

        ret
END
Intel(R) Core(TM) i5-6500T CPU @ 2.50GHz (AVX2)
----------------------------------------------
-- test(0)
16136 cycles, rep(100), code(3501) 0.asm: rax
15899 cycles, rep(100), code(2851) 1.asm: eax
-- test(1)
16196 cycles, rep(100), code(3501) 0.asm: rax
15899 cycles, rep(100), code(2851) 1.asm: eax
-- test(2)
17520 cycles, rep(100), code(3501) 0.asm: rax
15808 cycles, rep(100), code(2851) 1.asm: eax
-- test(3)
16421 cycles, rep(100), code(3501) 0.asm: rax
16556 cycles, rep(100), code(2851) 1.asm: eax
total [0 .. 3], 1++
64162 cycles 1.asm: eax
66273 cycles 0.asm: rax