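This is the Unicode version. Note that the string must be 2-byte aligned (which is almost always the case for Unicode strings) for the SSE path to be taken; a string at an odd address falls back to the slower `repne scasw` loop at L3.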
;-----------------------------------------------------------------------
; Length, in 16-bit characters, of a zero-terminated wide string.
; 32-bit x86, SSE2, Intel syntax.
;
; In:    [esp+4] = pointer to the string
; Out:   eax     = number of characters in front of the terminating
;                  zero word
; Clobb: ecx, edx, xmm0, xmm1, flags (edi is saved and restored)
;
; Notes: - Returns with a plain `ret`, so the argument is left on the
;          stack for the caller.
;        - The fast path reads whole aligned 16-byte blocks, so it may
;          read bytes located before the start of the string and after
;          its terminator.
;        - The fallback path uses `repne scasw`; presumably the
;          direction flag is clear on entry -- confirm with the caller.
;-----------------------------------------------------------------------
        mov     eax, [esp+4]            ; eax = string pointer
        test    al, 1                   ; odd address: characters do not line up
        jnz     wcs_odd_address         ;   with the word lanes pcmpeqw compares

        ; --- first block: may start in front of the string ---------------
        mov     ecx, eax
        and     eax, -16                ; eax = aligned block holding the first character
        and     ecx, 15                 ; ecx = byte offset of the string in that block
        or      edx, -1
        shl     edx, cl                 ; edx = mask clearing the bits of the bytes
                                        ;       that precede the string
        pxor    xmm0, xmm0
        pcmpeqw xmm0, [eax]             ; FFFFh in every word lane that holds zero
        add     eax, 16                 ; eax now points one block ahead
        pmovmskb ecx, xmm0              ; two adjacent bits per zero word
        pxor    xmm0, xmm0              ; xmm0 = 0 again: comparand for the loop
        and     ecx, edx                ; drop zero words found before the string
        jnz     wcs_terminator_found

        ; --- remaining blocks: fully inside or behind the string ---------
wcs_next_block:
        movaps  xmm1, [eax]
        pcmpeqw xmm1, xmm0              ; compare eight words against zero
        pmovmskb ecx, xmm1
        add     eax, 16
        test    ecx, ecx
        jz      wcs_next_block          ; no zero word in this block, keep going

wcs_terminator_found:
        bsf     ecx, ecx                ; ecx = byte offset of the first zero word
        lea     eax, [eax+ecx-16]       ; -16: eax was already advanced past the block
        sub     eax, [esp+4]            ; eax = length in bytes
        shr     eax, 1                  ; bytes -> 16-bit characters
        ret

        ; --- fallback for strings at an odd address -----------------------
wcs_odd_address:
        mov     edx, edi                ; keep the caller's edi in edx
        mov     edi, eax                ; edi = string pointer
        xor     eax, eax                ; ax = 0, the word to search for
        or      ecx, -1                 ; no limit on the scan count
        repne   scasw                   ; ecx = -1 - (length + 1) afterwards
        not     ecx                     ; ecx = length + 1
        dec     ecx                     ; ecx = length
        mov     eax, ecx
        mov     edi, edx                ; restore the caller's edi
        ret
Result:
total [0 .. 40], 8++
575817 cycles 2.asm: SSE 16
3081171 cycles 0.asm: msvcrt.wcslen()
4261124 cycles 1.asm: scasw
total [41 .. 80], 7++
629595 cycles 2.asm: SSE 16
4696938 cycles 1.asm: scasw
4742392 cycles 0.asm: msvcrt.wcslen()
total [600 .. 1000], 100++
987251 cycles 2.asm: SSE 16
7455315 cycles 1.asm: scasw
8530590 cycles 0.asm: msvcrt.wcslen()