Two fast algos with PCMPISTRI :biggrin:
.const
EQUAL_ANY equ 0000b
RANGES equ 0100b
EQUAL_EACH equ 1000b
EQUAL_ORDERED equ 1100b
NEGATIVE_POLARITY equ 010000b
BYTE_MASK equ 1000000b
.code
align 16
InstrLingo proc haystack:DWORD,needle:DWORD
mov ecx, haystack ; ecx = haystack, edx = needle
push esi
mov edx, needle
push edi
mov eax, ecx
movdqu xmm2, oword ptr[edx] ; load 1st 16 bytes of needle
pxor xmm3, xmm3
@Loop: ; find the first possible match of 16-byte fragment in haystack
pcmpistri xmm2, oword ptr[eax], EQUAL_ORDERED
lea eax, [eax+16]
ja @Loop
lea eax, [eax+ecx-16] ; save the possible match start
mov edi, edx
mov esi, eax
jnc @NotFound
sub edi, eax
@@: ; compare the strings
movdqu xmm1, oword ptr [esi + edi]
pcmpistrm xmm3, xmm1, EQUAL_EACH + NEGATIVE_POLARITY + BYTE_MASK ; mask out invalid bytes in the haystack
movdqu xmm4, oword ptr[esi]
pand xmm4, xmm0
pcmpistri xmm1, xmm4, EQUAL_EACH + NEGATIVE_POLARITY
lea esi, [esi+16]
ja @b
jnc @Found
add eax,1 ; continue searching from the next byte
jmp @Loop
@NotFound:
xor eax, eax
@Found:
pop edi
pop esi
ret
InstrLingo endp
StrLen_Lingo proc lpStr:DWORD
mov eax, lpStr
pxor xmm1, xmm1
pcmpistri xmm1, oword ptr [eax], EQUAL_EACH
jz @End
mov edx, eax
and eax, -16
@@:
pcmpistri xmm1, oword ptr [eax+16], EQUAL_EACH
lea eax, [eax+16]
jnz @b
sub ecx, edx
add eax, ecx
ret
@End:
mov eax, ecx
ret
StrLen_Lingo endp
And here (https://www.strchr.com/strcmp_and_strlen_using_sse_4.2) are the original sources by Peter Kankowski, December 2008:
strlen_sse42:
; ecx = string
mov eax, -16
mov edx, ecx
pxor xmm0, xmm0
STRLEN_LOOP:
add eax, 16
PcmpIstrI xmm0, dqword[edx + eax], EQUAL_EACH
jnz STRLEN_LOOP
add eax, ecx
ret
strstr_sse42:
; ecx = haystack, edx = needle
push esi
push edi
MovDqU xmm2, dqword[edx] ; load the first 16 bytes of neddle
Pxor xmm3, xmm3
lea eax, [ecx - 16]
; find the first possible match of 16-byte fragment in haystack
STRSTR_MAIN_LOOP:
add eax, 16
PcmpIstrI xmm2, dqword[eax], EQUAL_ORDERED
ja STRSTR_MAIN_LOOP
jnc STRSTR_NOT_FOUND
add eax, ecx ; save the possible match start
mov edi, edx
mov esi, eax
sub edi, esi
sub esi, 16
; compare the strings
@@:
add esi, 16
MovDqU xmm1, dqword[esi + edi]
; mask out invalid bytes in the haystack
PcmpIstrM xmm3, xmm1, EQUAL_EACH + NEGATIVE_POLARITY + BYTE_MASK
MovDqU xmm4, dqword[esi]
PAnd xmm4, xmm0
PcmpIstrI xmm1, xmm4, EQUAL_EACH + NEGATIVE_POLARITY
ja @B
jnc STRSTR_FOUND
; continue searching from the next byte
sub eax, 15
jmp STRSTR_MAIN_LOOP
STRSTR_NOT_FOUND:
xor eax, eax
STRSTR_FOUND:
pop edi
pop esi
ret
Quote from: lingo on August 06, 2023, 02:28:23 AMTwo fast algos with PCMPISTRI :biggrin:
.const
EQUAL_ANY equ 0000b
RANGES equ 0100b
EQUAL_EACH equ 1000b
EQUAL_ORDERED equ 1100b
NEGATIVE_POLARITY equ 010000b
BYTE_MASK equ 1000000b
.code
align 16
InstrLingo proc haystack:DWORD,needle:DWORD
mov ecx, haystack ; ecx = haystack, edx = needle
push esi
mov edx, needle
push edi
mov eax, ecx
movdqu xmm2, oword ptr[edx] ; load 1st 16 bytes of needle
pxor xmm3, xmm3
@Loop: ; find the first possible match of 16-byte fragment in haystack
pcmpistri xmm2, oword ptr[eax], EQUAL_ORDERED
lea eax, [eax+16]
ja @Loop
lea eax, [eax+ecx-16] ; save the possible match start
mov edi, edx
mov esi, eax
jnc @NotFound
sub edi, eax
@@: ; compare the strings
movdqu xmm1, oword ptr [esi + edi]
pcmpistrm xmm3, xmm1, EQUAL_EACH + NEGATIVE_POLARITY + BYTE_MASK ; mask out invalid bytes in the haystack
movdqu xmm4, oword ptr[esi]
pand xmm4, xmm0
pcmpistri xmm1, xmm4, EQUAL_EACH + NEGATIVE_POLARITY
lea esi, [esi+16]
ja @b
jnc @Found
add eax,1 ; continue searching from the next byte
jmp @Loop
@NotFound:
xor eax, eax
@Found:
pop edi
pop esi
ret
InstrLingo endp
StrLen_Lingo proc lpStr:DWORD
mov eax, lpStr
pxor xmm1, xmm1
pcmpistri xmm1, oword ptr [eax], EQUAL_EACH
jz @End
mov edx, eax
and eax, -16
@@:
pcmpistri xmm1, oword ptr [eax+16], EQUAL_EACH
lea eax, [eax+16]
jnz @b
sub ecx, edx
add eax, ecx
ret
@End:
mov eax, ecx
ret
StrLen_Lingo endp
Thank you JJ,
I like his StrLen algo and modified it a little bit in my way:
StrLen_Lingo2 proc lpStr:dword
mov eax, lpStr ; eax = string
pxor xmm0, xmm0
@@:
pcmpistri xmm0, oword ptr[eax],EQUAL_EACH
lea eax, [eax+16]
jnz @b
sub eax, lpStr
lea eax, [eax+ecx-16]
ret
StrLen_Lingo2 endp
:biggrin:
Thanks JJ for the link to the original source.
At the link (replied 5 years ago), Randall Hyde notes the MMU page boundary problem.
To make Peter Kankowski's code MASM compliant, change all the "dqword[" to "oword ptr["
dqword equ <oword ptr>
I ported these algos to x64 and removed the nonvolatile Registers for the purpose of timing them.
With a haystack of 207 bytes and the needle (7 bytes long) at byte 63, called 100 million (counter = 100000000) times
Intel(R) Core(TM) i7-10875H CPU @ 2.30GHz
Proc size Lingo = 84 bytes
Proc size Kankowski = 87 bytes
InstrLingos: 0.512102 s
strstr_Kankowski: 0.520384 s
Lingo
.const
EQUAL_ANY equ 0000b
RANGES equ 0100y
EQUAL_EACH equ 1000y
EQUAL_ORDERED equ 1100y
NEGATIVE_POLARITY equ 010000y
BYTE_MASK equ 1000000y
.data
ProcSizeLingo dd InstrLingoLen - InstrLingo
.code
align 16
InstrLingo proc ; haystack:qword, needle:qword
; rcx = & haystack
; rdx = & needle
mov rax, rcx
movdqu xmm2, oword ptr[rdx] ; load 1st 16 bytes of needle
pxor xmm3, xmm3
@Loop: ; find the first possible match of 16-byte fragment in haystack
pcmpistri xmm2, oword ptr[rax], EQUAL_ORDERED
lea rax, [rax+16]
ja @Loop
lea rax, [rax+rcx-16] ; save the possible match start
mov r9, rdx
mov r8, rax
jnc @NotFound
sub r9, rax
@@: ; compare the strings
movdqu xmm1, oword ptr[r8+r9]
pcmpistrm xmm3, xmm1, (EQUAL_EACH + NEGATIVE_POLARITY + BYTE_MASK) ; mask out invalid bytes in the haystack
movdqu xmm4, oword ptr[r8]
pand xmm4, xmm0
pcmpistri xmm1, xmm4, (EQUAL_EACH + NEGATIVE_POLARITY)
lea r8, [r8+16]
ja @b
jnc @Found
add rax, 1 ; continue searching from the next byte
jmp @Loop
@NotFound:
xor rax, rax
@Found:
ret
InstrLingo endp
InstrLingoLen:
Peter Kankowski
.const
EQUAL_ANY equ 0000y
RANGES equ 0100y
EQUAL_EACH equ 1000y
EQUAL_ORDERED equ 1100y
NEGATIVE_POLARITY equ 010000y
BYTE_MASK equ 1000000y
.data
ProcSizeKankowskiLen dd strstr_KankowskiLen - strstr_Kankowski
.code
align 16
strstr_Kankowski proc ; haystack:qword, needle:qword
; rcx = & haystack
; rdx = & needle
movdqu xmm2, oword ptr[rdx] ; load the first 16 bytes of neddle
pxor xmm3, xmm3
lea rax, [rcx-16]
STRSTR_MAIN_LOOP: ; find the first possible match of 16-byte fragment in haystack
add rax, 16
pcmpistri xmm2, oword ptr[rax], EQUAL_ORDERED
ja STRSTR_MAIN_LOOP
jnc STRSTR_NOT_FOUND
add rax, rcx ; save the possible match start
mov r9, rdx
mov r8, rax
sub r9, r8
sub r8, 16
@@: ; compare the strings
add r8, 16
movdqu xmm1, oword ptr[r8+r9]
pcmpistrm xmm3, xmm1, (EQUAL_EACH + NEGATIVE_POLARITY + BYTE_MASK) ; mask out invalid bytes in the haystack
movdqu xmm4, oword ptr[r8]
pand xmm4, xmm0
pcmpistri xmm1, xmm4, (EQUAL_EACH + NEGATIVE_POLARITY)
ja @B
jnc STRSTR_FOUND
sub rax, 15 ; continue searching from the next byte
jmp STRSTR_MAIN_LOOP
STRSTR_NOT_FOUND:
xor rax, rax
STRSTR_FOUND:
ret
strstr_Kankowski endp
strstr_KankowskiLen:
OK JJ, now I know how you reply so fast.
Check the forum for new posts
https://masm32.com/board/index.php?topic=9237.0
By the way, do you ever sleep?