Here is another one, this time an AVX-512 strlen (based
on this URL):
;-----------------------------------------------------------------------
; strlenZMM - length of a zero-terminated byte string (AVX-512F, 32-bit)
;
; Syntax: MASM / Intel operand order.
; In:     [esp+4] = pointer to the string
; Out:    eax     = number of bytes before the terminating zero byte
; Stack:  plain ret, the argument is left on the stack for the caller
; Clobb:  ecx, edx, flags, zmm0-zmm3, k1
;
; Memory safety: apart from at most three single-byte reads inside the
; string, all loads are 64-byte aligned and are only issued for blocks
; that contain part of the string (or its terminator).  An aligned
; 64-byte block never straddles a page, so the routine cannot fault by
; reading beyond the page that holds the terminator.
;-----------------------------------------------------------------------
strlenZMM:
        mov     edx, [esp+4]                    ; edx = cursor, starts at the string

; Scalar head (0..3 bytes): walk until the cursor is dword-aligned so
; that the dword-granular vector mask below can be trimmed exactly.
strlenZMM_head:
        test    dl, 3
        jz      strlenZMM_vector
        cmp     byte ptr [edx], 0
        je      strlenZMM_done                  ; terminator found in the head
        inc     edx
        jmp     strlenZMM_head

strlenZMM_vector:
        mov     eax, 01010101h
        vpbroadcastd zmm2, eax                  ; zmm2 = 01h in every byte
        mov     eax, 80808080h
        vpbroadcastd zmm3, eax                  ; zmm3 = 80h in every byte

        mov     ecx, edx
        and     edx, -64                        ; edx = aligned block holding the cursor
        and     ecx, 63
        shr     ecx, 2                          ; ecx = dwords in the block before the cursor

; First block: may contain unrelated bytes in front of the cursor.
        vmovdqa32 zmm0, ZMMWORD PTR [edx]
        vpsubd  zmm1, zmm0, zmm2                ; x - 01010101h
        vpternlogd zmm1, zmm0, zmm3, 20h        ; (x - 01..) AND NOT x AND 80..
        vptestmd k1, zmm1, zmm1                 ; bit i set <=> dword i has a zero byte
        kmovw   eax, k1                         ; zero-extended 16-bit mask
        shr     eax, cl
        shl     eax, cl                         ; drop dwords that precede the cursor
        test    eax, eax                        ; needed: a shift by 0 leaves flags alone
        jnz     strlenZMM_found

; Main loop: one aligned 64-byte block per iteration.
strlenZMM_loop:
        add     edx, 64
        vmovdqa32 zmm0, ZMMWORD PTR [edx]
        vpsubd  zmm1, zmm0, zmm2
        vpternlogd zmm1, zmm0, zmm3, 20h
        vptestmd k1, zmm1, zmm1
        kmovw   eax, k1
        test    eax, eax
        jz      strlenZMM_loop

; eax != 0 here: the lowest set bit names the first dword with a zero byte.
strlenZMM_found:
        bsf     eax, eax
        lea     edx, [edx+eax*4]                ; edx = address of that dword
        cmp     byte ptr [edx], 0
        je      strlenZMM_done
        inc     edx
        cmp     byte ptr [edx], 0
        je      strlenZMM_done
        inc     edx
        cmp     byte ptr [edx], 0
        je      strlenZMM_done
        inc     edx                             ; bytes 0..2 are non-zero, so it is byte 3

strlenZMM_done:
        mov     eax, edx
        sub     eax, [esp+4]                    ; length = terminator address - start
        vzeroupper                              ; leave no dirty upper ZMM/YMM state behind
        ret
end
I added a 4th test, for strings between 40000 and 40900 bytes, to see if the AVX-512 version pulls away from the rest. Well, not really: SSE Intel Silvermont and SSE Intel Atom are right up there as well.

total [0 .. 40], 8++
290780 cycles 7.asm: sse2
355355 cycles 5.asm: PCMPISTRI
412251 cycles 3.asm: SSE Intel Silvermont
469664 cycles 8.asm: Agner Fog
502841 cycles 1.asm: SSE 16
524321 cycles 2.asm: SSE 32
597335 cycles 9.asm: ZMM AVX512
865552 cycles 4.asm: SSE Intel Atom
908227 cycles 6.asm: scasb
913651 cycles 0.asm: msvcrt.strlen()
total [41 .. 80], 7++
270380 cycles 3.asm: SSE Intel Silvermont
299431 cycles 5.asm: PCMPISTRI
306940 cycles 7.asm: sse2
314735 cycles 1.asm: SSE 16
364536 cycles 9.asm: ZMM AVX512
380247 cycles 8.asm: Agner Fog
405156 cycles 2.asm: SSE 32
639091 cycles 4.asm: SSE Intel Atom
758265 cycles 6.asm: scasb
982403 cycles 0.asm: msvcrt.strlen()
total [600 .. 1000], 100++
202227 cycles 9.asm: ZMM AVX512
237534 cycles 3.asm: SSE Intel Silvermont
292854 cycles 4.asm: SSE Intel Atom
334146 cycles 2.asm: SSE 32
338568 cycles 1.asm: SSE 16
356720 cycles 7.asm: sse2
436840 cycles 8.asm: Agner Fog
650222 cycles 5.asm: PCMPISTRI
1438033 cycles 6.asm: scasb
1830544 cycles 0.asm: msvcrt.strlen()
total [40000 .. 40900], 100++
2161645 cycles 3.asm: SSE Intel Silvermont
2224521 cycles 4.asm: SSE Intel Atom
2342704 cycles 9.asm: ZMM AVX512
3137064 cycles 1.asm: SSE 16
3465817 cycles 7.asm: sse2
3514206 cycles 2.asm: SSE 32
4113016 cycles 8.asm: Agner Fog
6173622 cycles 5.asm: PCMPISTRI
13022424 cycles 6.asm: scasb
16670776 cycles 0.asm: msvcrt.strlen()