Thanks :icon14:
Astonishing that the unrolled version is so much slower...
; Reserve a buffer on the stack and fill it with zeros, one 16-byte SSE store
; at a time. Two variants are selected at assembly time: an unrolled loop that
; handles 64 bytes per iteration, and a simple loop that handles 16 bytes.
; In:  eax = buffer size in bytes (per the original comments - TODO confirm
;      against the code that precedes this snippet)
; Out: esp = 16-byte aligned and lowered by the reserved size; edx = esp as it
;      was on entry (the matching restore is not visible in this snippet)
; NOTE(review): both shifts drop the remainder, so a size that is not a
; multiple of the chunk size reserves fewer bytes than eax asked for - confirm
; that the caller rounds the size up beforehand.
; NOTE(review): the loops test the count only after the first pass; a count of
; zero after the shift wraps around on the first dec instead of ending the loop.
xorps xmm0, xmm0
; 'unrolled' is presumably a macro argument - TODO confirm; a non-blank value
; selects the 64-bytes-per-iteration variant
ifnb <unrolled>
shr eax, 4+2 ; eax = bufsize/(16*4): number of 64-byte chunks
mov edx, esp ; save current stack pointer
and esp, -16 ; round esp down to a 16-byte boundary, as the aligned stores below require
align 4
.Repeat
sub esp, 4*OWORD ; reserve four 16-byte slots; multiples of 16 keep esp aligned
movdqa OWORD ptr [esp], xmm0 ; zero the four slots, lowest address first
movdqa OWORD ptr [1*OWORD+esp], xmm0
movdqa OWORD ptr [2*OWORD+esp], xmm0
movdqa OWORD ptr [3*OWORD+esp], xmm0
dec eax ; one chunk done; sets the zero flag when the count reaches 0
.Until Zero?
else
shr eax, 4 ; eax = bufsize/16: number of 16-byte slots
mov edx, esp ; save current stack pointer
and esp, -16 ; round esp down to a 16-byte boundary, as the aligned store below requires
align 4
.Repeat
sub esp, OWORD ; reserve one 16-byte slot
movaps OWORD ptr [esp], xmm0 ; zero it
dec eax ; one slot done; sets the zero flag when the count reaches 0
.Until Zero?
endif