This is another one, this time an AVX-512 strlen (based on this URL):

; ---------------------------------------------------------------------------
; strlenZMM - length of a NUL-terminated byte string (32-bit x86, AVX-512F)
;
; In:     [esp+4] = pointer to the string
; Out:    eax     = number of bytes in front of the terminating zero
; Clobb:  ecx is untouched; edx, flags, zmm0-zmm3 and k1 are overwritten
; Return: plain RET - the argument is left on the stack for the caller
;
; A 64-byte block is tested as 16 dwords with
;       (x - 01010101h) AND (NOT x) AND 80808080h
; which is non-zero exactly for the dwords that contain a zero byte.
;
; Page safety: a 64-byte load is only issued when it cannot reach into the
; next page.  The first block is loaded unaligned if it fits inside the
; page of the string start; otherwise single bytes are checked up to the
; next 64-byte boundary.  Every later block is 64-byte aligned, so it can
; never straddle a page boundary.
; ---------------------------------------------------------------------------
SLZ_BLOCK       equ 64                  ; bytes examined per ZMM load
SLZ_PAGE        equ 4096                ; smallest page size assumed

strlenZMM:
        mov     edx, [esp+4]            ; edx = scan pointer, starts at s

        mov     eax, 01010101h
        vpbroadcastd zmm2, eax          ; zmm2 = 01h in every byte
        mov     eax, 80808080h
        vpbroadcastd zmm3, eax          ; zmm3 = 80h in every byte

        ; Would a 64-byte load at s run past the end of its page?
        mov     eax, edx
        and     eax, SLZ_PAGE - 1
        cmp     eax, SLZ_PAGE - SLZ_BLOCK
        jbe     strlenZMM_loop          ; no: start with an unaligned block

        ; Yes: s is not 64-byte aligned here; check bytes until it is.
strlenZMM_bytes:
        cmp     byte ptr [edx], 0
        je      strlenZMM_done          ; edx -> terminator
        inc     edx
        test    edx, SLZ_BLOCK - 1
        jnz     strlenZMM_bytes

strlenZMM_loop:
        vmovdqu32 zmm0, zmmword ptr [edx]
        vpsubd  zmm1, zmm0, zmm2        ; x - 01010101h
        vpternlogd zmm1, zmm0, zmm3, 20h ; zmm1 = zmm1 AND (NOT x) AND 80808080h
        vptestmd k1, zmm1, zmm1         ; k1 bit i set <=> dword i has a zero byte
        kmovw   eax, k1                 ; zero-extends the 16 mask bits into eax
        test    eax, eax
        jnz     strlenZMM_found

        ; Advance to the next 64-byte boundary.  After an unaligned first
        ; block this re-reads a few already checked bytes; once aligned the
        ; AND changes nothing.
        add     edx, SLZ_BLOCK
        and     edx, -SLZ_BLOCK
        jmp     strlenZMM_loop

strlenZMM_found:
        bsf     eax, eax                ; index of the first dword with a zero byte
        lea     edx, [edx+eax*4]        ; edx -> that dword
        cmp     byte ptr [edx], 0
        je      strlenZMM_done
        inc     edx
        cmp     byte ptr [edx], 0
        je      strlenZMM_done
        inc     edx
        cmp     byte ptr [edx], 0
        je      strlenZMM_done
        inc     edx                     ; bytes 0..2 are non-zero, so it is byte 3

strlenZMM_done:
        mov     eax, edx
        sub     eax, [esp+4]            ; length = address of terminator - s
        vzeroupper                      ; leave the upper ZMM/YMM state clean
        ret

end

I added a 4th test, for strings between 40000 and 40900 bytes long, to see if the AVX-512 version pulls away from the rest. Well, not really: SSE Intel Silvermont and SSE Intel Atom are right up there as well.

total [0 .. 40], 8++

290780 cycles 7.asm: sse2

355355 cycles 5.asm: PCMPISTRI

412251 cycles 3.asm: SSE Intel Silvermont

469664 cycles 8.asm: Agner Fog

502841 cycles 1.asm: SSE 16

524321 cycles 2.asm: SSE 32

597335 cycles 9.asm: ZMM AVX512

865552 cycles 4.asm: SSE Intel Atom

908227 cycles 6.asm: scasb

913651 cycles 0.asm: msvcrt.strlen()

total [41 .. 80], 7++

270380 cycles 3.asm: SSE Intel Silvermont

299431 cycles 5.asm: PCMPISTRI

306940 cycles 7.asm: sse2

314735 cycles 1.asm: SSE 16

364536 cycles 9.asm: ZMM AVX512

380247 cycles 8.asm: Agner Fog

405156 cycles 2.asm: SSE 32

639091 cycles 4.asm: SSE Intel Atom

758265 cycles 6.asm: scasb

982403 cycles 0.asm: msvcrt.strlen()

total [600 .. 1000], 100++

202227 cycles 9.asm: ZMM AVX512

237534 cycles 3.asm: SSE Intel Silvermont

292854 cycles 4.asm: SSE Intel Atom

334146 cycles 2.asm: SSE 32

338568 cycles 1.asm: SSE 16

356720 cycles 7.asm: sse2

436840 cycles 8.asm: Agner Fog

650222 cycles 5.asm: PCMPISTRI

1438033 cycles 6.asm: scasb

1830544 cycles 0.asm: msvcrt.strlen()

total [40000 .. 40900], 100++

2161645 cycles 3.asm: SSE Intel Silvermont

2224521 cycles 4.asm: SSE Intel Atom

2342704 cycles 9.asm: ZMM AVX512

3137064 cycles 1.asm: SSE 16

3465817 cycles 7.asm: sse2

3514206 cycles 2.asm: SSE 32

4113016 cycles 8.asm: Agner Fog

6173622 cycles 5.asm: PCMPISTRI

13022424 cycles 6.asm: scasb

16670776 cycles 0.asm: msvcrt.strlen()