Here is a test run with your (marginally modified) 1.asm ... 4.asm and other StrLen algos.
The 3.asm algo returns a wrong value (14 instead of 100, see the eax results below); I have not found out why.
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (SSE4)
6931 cycles for 100 * CRT strlen
5372 cycles for 100 * Masm32 StrLen
19701 cycles for 100 * Windows lstrlen
2006 cycles for 100 * MasmBasic Len
1579 cycles for 100 * Algo1
1712 cycles for 100 * Algo2
1103 cycles for 100 * Algo3
3209 cycles for 100 * Algo4
6941 cycles for 100 * CRT strlen
5393 cycles for 100 * Masm32 StrLen
19722 cycles for 100 * Windows lstrlen
1994 cycles for 100 * MasmBasic Len
1587 cycles for 100 * Algo1
1722 cycles for 100 * Algo2
1102 cycles for 100 * Algo3
3227 cycles for 100 * Algo4
6938 cycles for 100 * CRT strlen
5380 cycles for 100 * Masm32 StrLen
19692 cycles for 100 * Windows lstrlen
2011 cycles for 100 * MasmBasic Len
1580 cycles for 100 * Algo1
1713 cycles for 100 * Algo2
1101 cycles for 100 * Algo3
3205 cycles for 100 * Algo4
6970 cycles for 100 * CRT strlen
5404 cycles for 100 * Masm32 StrLen
19710 cycles for 100 * Windows lstrlen
2010 cycles for 100 * MasmBasic Len
1589 cycles for 100 * Algo1
1710 cycles for 100 * Algo2
1104 cycles for 100 * Algo3
3204 cycles for 100 * Algo4
14 bytes for CRT strlen
10 bytes for Masm32 StrLen
10 bytes for Windows lstrlen
10 bytes for MasmBasic Len
82 bytes for Algo1
114 bytes for Algo2
690 bytes for Algo3
761 bytes for Algo4
100 = eax CRT strlen
100 = eax Masm32 StrLen
100 = eax Windows lstrlen
100 = eax MasmBasic Len
100 = eax Algo1
100 = eax Algo2
14 = eax Algo3
100 = eax Algo4
Among the algos that return the correct length, this is the clear winner:
; include 1j.asm
; ---------------------------------------------------------------------
; Algo1 - length of a zero-terminated byte string, 16 bytes per step
;
; In:     [esp+4] = pointer to the string
; Out:    eax     = number of bytes before the terminating zero
; Stack:  the 4-byte argument is removed by the callee (retn 4)
; Uses:   ecx, edx, xmm0, xmm1, flags
; Note:   the string is read in whole aligned 16-byte blocks, so bytes
;         in front of the string and behind the terminator that share
;         a block with it are read as well.
; ---------------------------------------------------------------------
Algo1 proc
    mov      eax, [esp+4]           ; eax = string pointer
    mov      ecx, eax               ; register copy; much faster than a second [esp+4] load
    and      eax, -16               ; eax = aligned block containing the first character
    and      ecx, 16-1              ; ecx = offset of the first character inside that block
    or       edx, -1
    shl      edx, cl                ; edx = bit mask with the lanes in front of the string cleared
    xorps    xmm0, xmm0             ; xmm0 = 16 zero bytes, kept unchanged for the whole scan

    ; First block: compare in xmm1, NOT in xmm0. Comparing into xmm0 would
    ; leave 0FFh in every lane that holds a zero byte in front of the string;
    ; the loop below would then look for 0FFh instead of zero in those lanes
    ; and miss (or falsely report) the terminator.
    movaps   xmm1, [eax]
    pcmpeqb  xmm1, xmm0             ; 0FFh in every lane whose byte is zero
    add      eax, 16                ; eax always points one block past the block just tested
    pmovmskb ecx, xmm1              ; one bit per lane
    and      ecx, edx               ; ignore zero bytes that precede the string
    jnz      L2

L1:                                 ; scan the following aligned blocks
    movaps   xmm1, [eax]
    pcmpeqb  xmm1, xmm0
    pmovmskb ecx, xmm1
    add      eax, 16
    test     ecx, ecx
    jz       L1

L2:                                 ; ecx holds at least one set bit here
    bsf      ecx, ecx               ; ecx = lane index of the first zero byte
    lea      eax, [eax+ecx-16]      ; eax = address of the terminator (undo the look-ahead add)
    sub      eax, [esp+4]           ; eax = terminator address - string start = length
    retn     4
Algo1 endp