News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

SIMD_BinaryScan (Instring equivalent)

Started by guga, March 17, 2025, 05:27:34 PM

Previous topic - Next topic

guga

Hi Guys

Updated (24/03/2025):
Sometime ago i started creating a fast Instring function to be used as described at here

The function is a pattern compare that later i´ll use it as a InString (IN fact, it can be used already as such). Recently i found  small issue in a test i was doing where it mismatched one of the patterns, so one pattern was not found on a string, but it was there.

Can someone help me to show your results and also if one of the cases tested failed to identify the pattern ?

Updated Post:
RosAsm code
Proc SIMD_BinaryScan:
    Arguments @pDataBuffer, @DataSize, @pSearchPattern, @PatternSize
    Local @FirstBitPos, @ScanLimit, @StringEnd, @pFunction
    Structure @XmmPreserve 80, @XMMReg0Dis 0, @XMMReg1Dis 16, @XMMReg2Dis 32, @XMMReg3Dis 48, @XMMReg4Dis 64
    Uses esi, ebx, edi, edx

    ; Preserve XMM registers efficiently
    movdqu X@XMMReg0Dis XMM0
    movdqu X@XMMReg1Dis XMM1
    movdqu X@XMMReg2Dis XMM2
    movdqu X@XMMReg3Dis XMM3

    mov esi D@DataSize

    ; edi = DataSize
    ; edx = Bytes for ScanLimit
    ; ebx = PatternSize
    ; Security: Pattern size cannot be bigger than data size, and it cannot be zero. Also we precalculate ScanLimit Bytes
    ...If_Or esi < D@PatternSize, D@PatternSize = 0
        mov eax 0-1 ; Return -1 for invalid input (PatternSize > DataSize or 0)
    ...Else

        ; Initialize pattern and buffer pointers
        mov eax D@pDataBuffer
        mov edi D@DataSize | lea eax D$eax+edi | mov D@StringEnd eax ; StringEnd = pDataBuffer + DataSize

        mov esi D@pSearchPattern
        mov ebx D@PatternSize

        ; Precompute pattern bytes for SIMD comparison
        movzx eax B$esi | SSE2_EXPAND_BYTE XMM2 eax                         ; First byte in XMM2
        lea eax D$esi+ebx-1 | movzx eax B$eax | SSE2_EXPAND_BYTE XMM3 eax   ; Last byte in XMM3

        ; Select verification function based on PatternSize
        mov D@pFunction ComparePattern16ByteBlocks  ; Default for PatternSize > 16
        ; Check if PatternSize >= 16 (shift by 4 = divide by 16)
        ; If >= 16, keep ComparePattern16ByteBlocks
        ; Otherwise, use byte/dword scalar comparison
        mov eax ebx | shr eax 4 | jnz L1> | mov D@pFunction ComparePatternBytes | L1:

        mov esi D@pDataBuffer

        ; Handle small datasets (< 32 bytes)
        ; edi = DataSize / 32
        ; If >= 32, proceed to main loop
        mov eax edi | shr edi 5 | jnz L1> ; (DataSize/32)
            ; eax = DataSize - PatternSize
            ; Check if remaining size fits 16-byte SSE register
            ; if data is smaller then 32 bytes we need to see if the pattern fits the SSe register size limit (16 bytes)
            sub eax ebx | shr eax 4 | jnz L1> ; (DataSize-PatternSize)/16 . If >= 16, use main loop
                call ScanSmallSSE2 esi, D@DataSize, D@pSearchPattern, D@PatternSize
                jmp L2>>
        L1:

        ; Compute ScanLimit for main loop (aligns to 16-byte boundary)

        ; Calculate ScanLimit
        mov eax D@DataSize      ; eax = DataSize
        and eax 0-16            ; eax = DataSize & 0xFFFFFFF0 . Align DataSize down to multiple of 16
        lea eax D$esi+eax+1     ; eax = esi + (DataSize & 0xFFFFFFF0) + 1 . ScanLimit = pDataBuffer + aligned DataSize + 1
        mov D@ScanLimit eax     ; Store result in ScanLimit

        ; Main loop: Process the dataset in blocks of 16 bytes. ebx = PatternSize. The register must not be changed
        .Do
            mov D@FirstBitPos 0
            movdqu XMM0 X$esi ; Load the first 16 bytes of the current block into xmm0. Var_BlockFirst.
            movdqu XMM1 X$esi+ebx-1 ; Load the last 16 bytes of the current block into xmm1. Var_BlockLast.

            pcmpeqb xmm0 xmm2 ; Compare the first byte of the pattern with the first byte of the block.
            pcmpeqb xmm1 xmm3 ; Compare the last byte of the pattern with the last byte of the block.

            ; Combine the masks to find positions where both first and last bytes match.
            pand xmm1 xmm0      ; xmm1 = combined mask (Var_EqFirst AND Var_EqLast).
            ; Generate a bitmask from the combined mask.
            pmovmskb edx XMM1   ; ebx = bitmask of matching positions. Maximum value = 0FFFF

            ; Check if there are any matches.
            .Test_If_Not_Zero edx
                ; Process each matching position.
                Do
                    ; Find the first set bit in the bitmask.
                    GET_FIRST_BIT_SET edx | mov D@FirstBitPos eax
                    ; Calculate the address of the matching position.
                    lea edi D$esi+eax ; edi = potential match address
                    ; Perform a detailed comparison of the pattern at this position.
                    call D@pFunction edi, D@pSearchPattern, D@PatternSize
                    cmp eax 0 | jnz L2>>    ; If a match is found, return the position.
                    ; Clear the leftmost set bit in the bitmask and save the result in dx.
                    CLEAR_LEFTMOST_SET16 dx ; Clear processed bit
                Repeat_Until_Zero   ; perfom a loop if dx is not zero.
                ; Update the position to continue searching.
                mov eax D@FirstBitPos | add eax D@PatternSize   ; Advance past current pattern
            .Test_Else
                ; Check first byte matches to optimize step
                ; If no matches, find the last set bit in the mask and advance the pointer.
                pmovmskb eax xmm0 ; eax = bitmask of first byte matches. Maximum value = 0FFFF
                Test_If_Not_Zero eax
                    ; Find the last set bit.
                    GET_LAST_BIT_SET eax    ; Find last first-byte match
                    inc eax ; Advance to the next position.
                            ; We found at least 1 byte matching, increase the position of the last byte that matched by 1
                Test_Else
                    mov eax 16  ; No matches, step full 16 bytes
                Test_End
            .Test_End
            add esi eax ; Advance buffer pointer

            ; Check if remaining size is too small for pattern
            mov edi D@StringEnd | sub edi esi
            If edi < D@PatternSize
                xor eax eax | jmp L2>>
            End_If

            lea eax D$esi+15        ; Prepare next block check
        .Loop_Until eax > D@ScanLimit

        ; Handle tail (remaining bytes < 16)
        mov eax D@StringEnd | sub eax esi
        If eax => D@PatternSize
            call ScanSmallSSE2 esi, eax, D@pSearchPattern, D@PatternSize
        Else
            xor eax eax
        End_If

    ...End_If

L2:

    ; Restore XMM registers
    movdqu XMM0 X@XMMReg0Dis
    movdqu XMM1 X@XMMReg1Dis
    movdqu XMM2 X@XMMReg2Dis
    movdqu XMM3 X@XMMReg3Dis

EndP


Helper functions
Proc ComparePattern16ByteBlocks:
    Arguments @pDataBuffer, @pSearchPattern, @PatternSize
    Local @Reminder
    Uses esi, edx

    ; Step 1: Check if the length is zero. If so, skip the function.
    mov edx D@PatternSize
    mov esi D@pDataBuffer
    mov edi D@pSearchPattern
    ; Compute multiples of 16 and remainder
    ;and edx 0-16 | mov eax edx | xor edx D@PatternSize ; When 0 means lenght is divisible by 16 and we have no remainders.
    mov eax edx
    and eax 0-16                ; eax = multiples of 16 (aligned down)
    and edx (16-1)              ; edx = remainder (0-15)

    ; Process 16-byte blocks with SIMD
    .Test_If_Not_Zero eax ; edx = 0 ? no remainders exit
        mov D@Reminder edx ; Preserve remainder
        ; We are working with multiples of 16 bytes, therefore, we can get rid of using shr eax 4 and directly subtract 16 bytes on each loop
        ; On this way we get a bit more of speed while keeping performance. Interesting is that add eax 0-16 seems a bit faster then sub eax 16
        .Do
            movdqu xmm0 X$esi   ; Load 16 bytes from data
            movdqu xmm1 X$edi   ; Load 16 bytes from pattern
            pcmpeqd xmm0 xmm1   ; Compare 4 dwords (16 bytes) at once
            movmskps edx xmm0   ; Extract 4-bit mask (1 per dword)
            cmp edx 0F | jnz @NoMatch ; All 4 dwords must match
            add esi 16
            add edi 16
            add eax 0-16 ; subtract by 16 bytes. Faster then sub eax 16
        .Repeat_Until_Zero
        mov edx D@Reminder ; Restore remainder
    .Test_End

    ; Handle remainder (0-15 bytes, if any). When eax is also 0, we skip this and exit the D@pDataBuffer.
    ; Why ? Because on the previous test, eax was never 0
    .Test_If_Not_Zero edx ; Check if edx is not 0
        ; Split PatternSize into dwords (dl) and bytes (dh)
        mov dh dl
        and dl 0-4 ; dl = our Dword computation. So, values multiples of 4 only, on a maximum of 12 ; dl = dwords * 4 (0, 4, 8, 12)
        xor dh dl ; dh = our byte computation. So, values are multples of 1 only, on a maximum of 3 ; ; dh = bytes (0, 1, 2, 3)
        Test_If_Not_Zero dl ; Compare DWORD remainders.
            ; If dl = 0, so if we have no dwords remainder, jmp over to compute the byte remainders
            mov eax D$esi | cmp eax D$edi | jnz @NoMatch | add esi 4 | add edi 4 | add dl 0-4 | jz @CheckBytes
            mov eax D$esi | cmp eax D$edi | jnz @NoMatch | add esi 4 | add edi 4 | add dl 0-4 | jz @CheckBytes
            mov eax D$esi | cmp eax D$edi | jnz @NoMatch | add esi 4 | add edi 4
        Test_End
@CheckBytes:
        ; check if we have no byte remainders
        Test_If_Not_Zero dh ; dh = our byte computation. So, values are multples of 1 only, on a maximum of 3
            mov al B$esi | cmp al B$edi | jnz @NoMatch | dec dh | jz @Match
            mov al B$esi+1 | cmp al B$edi+1 | jnz @NoMatch | dec dh | jz @Match
            mov al B$esi+2 | cmp al B$edi+2 | jnz @NoMatch
        Test_End
    .Test_End
@Match:
    ; If the pattern matches, return the starting position.
    mov eax D@pDataBuffer
    ExitP

@NoMatch:
    ; No matches, return 0
    xor eax eax


EndP

Proc ComparePatternBytes:
    Arguments @pDataBuffer, @pSearchPattern, @PatternSize
    Uses esi, edx

    ; Step 1: Check if the length is zero. If so, skip the function.
    mov edx D@PatternSize
    mov esi D@pDataBuffer
    mov edi D@pSearchPattern
    ; Split PatternSize into dwords (dl) and bytes (dh)
    mov dh dl       ; dh = PatternSize copy
    and dl 0-4      ; dl = our Dword computation (0, 8, 8, 12). So, values multiples of 4 only, on a maximum of 12
    xor dh dl       ; dh = our byte computation (0, 1, 2, 3). So, values are multples of 1 only, on a maximum of 3

    Test_If_Not_Zero dl ; Compare DWORD remainders. (up to 12 bytes)
        ; If dl = 0, so if we have no dwords remainder, jmp over to compute the byte remainders
        mov eax D$esi | cmp eax D$edi | jnz @NoMatch | add esi 4 | add edi 4 | add dl 0-4 | jz @CheckBytes
        mov eax D$esi | cmp eax D$edi | jnz @NoMatch | add esi 4 | add edi 4 | add dl 0-4 | jz @CheckBytes
        mov eax D$esi | cmp eax D$edi | jnz @NoMatch | add esi 4 | add edi 4
    Test_End

@CheckBytes:
    ; check if we have no byte remainders (0-3)
    Test_If_Not_Zero dh ; dh = our byte computation. So, values are multples of 1 only, on a maximum of 3
        mov al B$esi | cmp al B$edi | jnz @NoMatch | dec dh | jz @Match
        mov al B$esi+1 | cmp al B$edi+1 | jnz @NoMatch | dec dh | jz @Match
        mov al B$esi+2 | cmp al B$edi+2 | jnz @NoMatch
    Test_End

@Match:
    ; If the pattern matches, return the starting position.
    mov eax D@pDataBuffer
    ExitP

@NoMatch:
    ; No matches, return 0
    xor eax eax

EndP

Proc ScanSmallSSE2:
    Arguments @pDataBuffer, @DataSize, @pSearchPattern, @PatternSize
    Local @StringEnd, @FirstBitPos, @ScanLimit, @pFunction

    ; Early exit for tiny datasets (<= 8 bytes)
    ..If D@DataSize <= 8
        call CompareSmallDataBlock esi, D@DataSize, D@pSearchPattern, D@PatternSize
    ..Else
        ; Validate inputs
        mov esi D@pDataBuffer
        mov edi D@DataSize | lea eax D$esi+edi | mov D@StringEnd eax
        mov ebx D@PatternSize

        ; Calculate ScanLimit
        mov eax edi             ; eax = DataSize
        and eax 0-8             ; eax = DataSize & 0xFFFFFFF0
        lea eax D$esi+eax+1     ; eax = esi + (DataSize & 0xFFFFFFF0) + 1
        mov D@ScanLimit eax     ; Store result in ScanLimit

        ; Select comparison function based on pattern size. For situations where pattern size is betwen 16 and 31
        mov D@pFunction ComparePatternBytes ; Default for PatternSize <= 15
        If ebx >= 16
            mov D@pFunction ComparePattern16ByteBlocks  ; For PatternSize 16-31
        End_If

        ; Main loop: Process the dataset in blocks of 16 bytes. ebx = PatternSize. The register must not be changed
        .Do
            mov D@FirstBitPos 0
            movq XMM0 X$esi ; Load the first 16 bytes of the current block into xmm0. Var_BlockFirst.
            movq XMM1 X$esi+ebx-1 ; Load the last 16 bytes of the current block into xmm1. Var_BlockLast.

            pcmpeqb xmm0 xmm2 ; Compare the first byte of the pattern with the first byte of the block.
            pcmpeqb xmm1 xmm3 ; Compare the last byte of the pattern with the last byte of the block.

            ; Combine the masks to find positions where both first and last bytes match.
            pand xmm1 xmm0      ; xmm1 = combined mask (Var_EqFirst AND Var_EqLast).
            ; Generate a bitmask from the combined mask.
            pmovmskb edx XMM1   ; ebx = bitmask of matching positions. Maximum value = 0FFFF

            ; Check if there are any matches.
            .Test_If_Not_Zero edx
                ; Process each matching position.
                Do
                    ; Find the first set bit in the bitmask.
                    GET_FIRST_BIT_SET edx | mov D@FirstBitPos eax
                    ; Calculate the address of the matching position.
                    lea edi D$esi+eax
                    ; Perform a detailed comparison of the pattern at this position.
                    call D@pFunction edi, D@pSearchPattern, D@PatternSize
                    cmp eax 0 | jnz L2>>    ; If a match is found, return the position.
                    ; Clear the leftmost set bit in the bitmask and save the result in dx.
                    CLEAR_LEFTMOST_SET16 dx
                Repeat_Until_Zero   ; perfom a loop if dx is not zero.
                ; Update the position to continue searching.
                mov eax D@FirstBitPos | add eax D@PatternSize
            .Test_Else
                ; If no matches, find the last set bit in the mask and advance the pointer.
                pmovmskb eax xmm0 ; eax = bitmask of first byte matches. Maximum value = 0FFFF
                Test_If_Not_Zero eax
                    ; Find the last set bit.
                    GET_LAST_BIT_SET eax
                    inc eax ; Advance to the next position.
                            ; We found at least 1 byte matching, increase the position of the last byte that matched by 1
                Test_Else
                    mov eax 8  ; No matches, advance by 8 bytes.
                Test_End
            .Test_End
            add esi eax ; Advance the dataset pointer.

            ; Check remaining size
            mov eax D@StringEnd | sub eax esi
            If eax < D@PatternSize
                xor eax eax | ExitP
            End_If

            lea eax D$esi+8
        .Loop_Until eax > D@ScanLimit

        ; Handle remaining tail
        mov eax D@StringEnd | sub eax esi
        If eax => D@PatternSize
            call CompareSmallDataBlock esi, eax, D@pSearchPattern, D@PatternSize
        Else
            xor eax eax
        End_If
    ..End_If
L2:

EndP

Proc CompareSmallDataBlock:
    Arguments @pDataBuffer, @DataSize, @pSearchPattern, @PatternSize
    Local @EndString
    Uses esi, edi, edx, ecx

    ; Initialize pointers.
    mov esi D@pDataBuffer
    mov eax D@DataSize
    ; EndString = pDataBuffer + DataSize - PatternSize
    lea eax D$esi+eax | sub eax D@PatternSize | mov D@EndString eax

    mov edi D@pSearchPattern
    .Do
        ; Compare the pattern byte-by-byte.
        mov ecx D@PatternSize
        xor eax eax
        Do
            movzx edx B$esi+eax | cmp dl B$edi+eax | jne L1>
            inc eax
            dec ecx
        Repeat_Until_Zero
        ; If the pattern matches, return the starting position.
        mov eax esi
        ExitP

    L1:
        ; Move to the next position in the dataset.
        inc esi
    .Loop_Until esi > D@EndString

    ; If no match is found, return 0.
    xor eax eax

EndP

Macros used:
[CLEAR_LEFTMOST_SET16 | mov ax #1 | add #1 0-1 | and #1 ax]
[GET_FIRST_BIT_SET | bsf eax #1]
[GET_LAST_BIT_SET | bsr eax #1]
[SSE2_EXPAND_BYTE | movd #1 #2 | punpcklbw #1 #1 | punpcklwd #1 #1 | pshufd #1 #1 0]

Masm translation (I hope it is ok, but if not, i included the disassembled source that works - just rename xmmword with oword or other compatible masm token for SSE registers)
New Files Atacched (24/03/2025) with comments (See RosAsmCode_Info.txt)

Bug fixes:

  • Fixed an error in one of the helper functions that was causing the pattern to not found properly.
  • Fixed security to guarantee that the data stays within the limits of a SSE2 register (16 bytes per block)
  • Improved speed for Match and Non-Match cases. Now both are faster
  • Added more helpers to handle patterns and data sizes smaller than 32 and 16 bytes to gain performance in small data chunks.
  • Added comments on all functions.

Note: For SSE2 with x86, i guess i reached my limit on speed the algorithm. In theory, it could be improved a little bit forcing the algo to use the others registers (xmm4 to xmm7) to also look for the 2nd and penultimate byte from the pattern, or even creating now a flag to handle when the pattern is only 2 bytes, so it won´t enter the core function since it already found the matching pair etc etc, but it will result on a more complex code and i´m not sureif the performance will vary that much.

In any case, if you want to test to see if it is possible to optimize it even further (obbeying the limits etc), you can see an alternative solution here at post #31
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

zedd151

Hi guga!

Too many examples in the test. The entirety of the results was greater than the 20000 character limit per post, so I could not post it all.
Here is the Min/Max results:

Intel(R) Core(TM) i7-7700 CPU @ 3.60GHz  3.60 GHz
_______________________________________________

Match Min Max Timmings:
_______________________________________________

Slowest Match cases

Clock cycles: 94 clocks
Example Case: 160

Fastest Match cases

Clock cycles: 38 clocks
Example Case: 3
_______________________________________________

Not Found Min Max Timmings:
_______________________________________________

Slowest Match cases

Clock cycles: 55 clocks
Example Case: 135

Fastest Match cases

Clock cycles: 19 clocks
Example Case: 48

Maybe use less number of examples, so all of the results can be posted?

Additionally...
If 'CharToFind' length is greater than the 'TestingString' length, the test should not even be run.
You should test the string lengths against each other before performing  the tests, imo.
Then if the 'CharToFind' length is greater then the 'TestingString' length display error message?

Example:
Clock cycles: 19 clocks

TestingString: abcdef1234567890pqrstuvabcd1234567890abcdefgxyz789abcd1234567890abcdefgxyz789abcd1234567890abcdefgxyz789abcd1234567890abcdefgxyz789abcd1234567890
TestingString Size: 145
CharToFind: abcdef1234567890pqrstuvabcd1234567890abcdefgxyz789abcd1234567890abcdefgxyz789abcd1234567890abcdefgxyz789abcd1234567890abcdefgxyz789abcd1234567890gugaxy789abcdefxyz789abcd1234567890abcdefgxyz789abcd1234567890
CharToFind Size: 207
Found at: Pos (0) - Not found
Obviously the 'CharToFind' will never be found, it is longer than the 'TestingString'.  :smiley:

Also...  'CharToFind' maybe should be named 'StringToFind' instead?
¯\_(ツ)_/¯   :azn:

'As we don't do "requests", show us your code first.'  -  hutch—

guga

Hi Zedd

Many Tks

I´ll try using less examples. I just created several of them with different lenght with odd, even and prime numbers sizes to see if the function will fail.

About the cases where the pattern is smaller than the Dataset, the function already checks this at the very beginning, i just included on the tests to see if it was not bypassing the security routines i settled for that. Those situations are checked twice inside the function.
1st - When the data input is smaller than the pattern and the function immediately exists
2nd - If the Dataset is bigger than the pattern, but on a specific point the remainder bytes to be scanned are smaller than the pattern. (Of course, after the previous bytes where correctly checked as well).

I´m optimizing it a bit more and suceedd to remove some uneeded checks in one of it´s internal functions. I´ll try one more thing and check if the results are being returned without any mismatch before converting it to masm and post it here for us. If this small fix was succeeded, i did gained something around 20% to 40% of speed

_______________________________________________


Match Min Max Timmings:
_______________________________________________

Slowest Match cases

Clock cycles: 68 clocks
Example Case: 47

Fastest Match cases

Clock cycles: 42 clocks
Example Case: 2
_______________________________________________

Not Found Min Max Timmings:
_______________________________________________

Slowest Match cases

Clock cycles: 48 clocks
Example Case: 135

Fastest Match cases

Clock cycles: 18 clocks
Example Case: 48


Press enter to exit...

About changing the label name to StringToFind. Sure, it was just to display something on screen to identify what was being done. I´ll rename it on the next tests before convert this function to masm
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

zedd151

Okay, guga. I just thought I would mention those, just in case that they might have been overlooked or missed...

I'll look for the next test when it becomes available.
¯\_(ツ)_/¯   :azn:

'As we don't do "requests", show us your code first.'  -  hutch—

guga

Ok, guys, new test. This time, on the print, i excluded on the display the cases where the mismatch is obvious (Pattern bigger than dataset). Of course, it didn´t affected the function at all...i just removed it from the print.

On this way we can check if the results are ok, since we have less data on the print to check. Also, i changed a bit the function and made it a bit faster. I hope this gain of speed is really due to performance and elimination of redundancies of the code and not due to mismatch on the result that could affect the speed.

If the results of this test are ok, i´ll port it to masm. Pls let me know so i can start porting it and inserting the proper comments on the function.

Btw...about the name of the app "TestingResultDeletar2a", well..it was just a dumy name for this test. I was about to delete it (that´s why i named it as "deletar" - portuguese for "to delete") if someting was wrong, but since i didn´t find wrong results so far, i kept this dummy name.

Btw..the function was created to respect the size limits of a x86 SSE2 register, so the goal was not to extrapolate it ever. On my tests, it seems to work as expected. If it really correct, then when i port it to masm, you can check for yourselfs too if any other error case do exists on larger datasets.
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

zedd151

Again the results exceed the 20000 chacter limit for posting it, in its entirety into a post. Full results attached in zip file
Min Max:
Match Min Max Timmings:
_______________________________________________

Slowest Match cases

Clock cycles: 65 clocks
Example Case: 47

Fastest Match cases

Clock cycles: 39 clocks
Example Case: 23
_______________________________________________

Not Found Min Max Timmings:
_______________________________________________

Slowest Match cases

Clock cycles: 55 clocks
Example Case: 135

Fastest Match cases

Clock cycles: 31 clocks
Example Case: 19


Press enter to exit...

¯\_(ツ)_/¯   :azn:

'As we don't do "requests", show us your code first.'  -  hutch—

guga

Hi Zedd

tks, but the result is empty. I guess you forgot to paste the result on the text file.

But..did you checked to see if we have any mismatch ? I mean if the function failed to find the proper patterns ?
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

zedd151

Quote from: guga on March 18, 2025, 10:50:09 AMtks, but the result is empty. I guess you forgot to paste the result on the text file.
And I forgot to test the zip file. That was indeed my fault, sorry.

I have corrected that and reattached it above, where it was originally.

No, I have not inspected that the results were accurate that will take a little time... I'll have to get back to you in a little while.
¯\_(ツ)_/¯   :azn:

'As we don't do "requests", show us your code first.'  -  hutch—

zedd151

I think I found a couple errors in the position where string was found.
I'll make a report for each case where an error found....

It'll take some time.
¯\_(ツ)_/¯   :azn:

'As we don't do "requests", show us your code first.'  -  hutch—

guga

Tks, Btw...about the position...its´rather an index than the position. For example, on the 1st case

TestingString: 578987hGugf8ahj1g1ghdlgjGughjkhjk8ahj18ahj1g1ghdmerdalgjg1ghdlgj8a8ahj1g1ghdlgjhj1g1ghdlgj
TestingString Size: 90
CharToFind: merdalgjg1ghdlgj8a8ahj1g1
CharToFind Size: 25
Found at: Pos (48) - merdalgjg1ghdlgj8a8ahj1g1ghdlgjhj1g1ghdlgj

The value of 48 means we have 48 bytes before the match. Therefore the pattern was located at the 49th byte. If needed i´ll redo the printing to reflect the exact position where the pattern was found.

"578987hGugf8ahj1g1ghdlgjGughjkhjk8ahj18ahj1g1ghd" = 48 bytes (Pos)

found it at the 49th byte: merdalgjg1ghdlgj8a8ahj1g1ghdlgjhj1g1ghdlgj


When i did the printf i simply calculated the returned address in eax and subtracted from the beginning of the data.

esi = Initial TestingString address

eax = Pattern address found

Pos = eax - esi
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

zedd151

Okay, I will reexamine the results. I could have made a mistake too, when I looked the first time.

I am going to be away from computer for some time. It's beautiful outside, I keep going back in to the computer, and back outside...

Give me about an hour to finish checking each result properly.  :smiley:
So I can enjoy the weather a little longer.   :azn:
¯\_(ツ)_/¯   :azn:

'As we don't do "requests", show us your code first.'  -  hutch—

zedd151

I came back inside early. I didn't want to make you wait too long...  :smiley:

Okay. Indeed I did make an error when saying that "there was an error in the position where the string is found".
My mistake. The couple errors that I thought I found were the second or more occurrence of the string, not the first. The position listed is for the first occurrence, even if multiple occurrences are there.

I tested the first 40 Examples out of the 167 Examples tested. Almost 25%

The results of 38 of those are all correct as well as the positions of where that first occurence of each string is found, the 2 Examples where the string is not found the 'not found' result is also correct (Example 19 and Example 25).

I didn't go any further as I think I will not find any errors. It does indeed appear that there are no errors in the results.   :thumbsup:
¯\_(ツ)_/¯   :azn:

'As we don't do "requests", show us your code first.'  -  hutch—

guga

HI guys, i updated the 1st post and included the translation to masm of the last working version. I attached the file that i had to disassemble (as did before in other threads) which is more guaranteed to work (Just rename the xmmword to oword if needed). Additionally, i added on the post itself my attempt to port it to masm compatible syntax as well masm macros (But, i dont know if i translated correctly)

Btw, due to the limitations of characters lenght on the forum, i had to remove all the comments on the 1st post., but if needed here are the comments of the functions themselves

;;
    SIMD_BinaryScan
        This function performs a binary search for a chunk of data (pSearchPattern) within a larger dataset (pDataBuffer).
        It uses SSE2 instructions for optimization and speed, especially with large datasets.
        The function can be used to find data of any size, including strings (case-sensitive) within a larger text.

    Parameters:
        pDataBuffer (in):       Pointer to the start of the larger dataset in which the search is performed.
                                This is the buffer in which you are searching.
        DataSize (in):          The size (in bytes) of the dataset. The size must be greater than or equal to 16 bytes.
        pSearchPattern (in):    Pointer to the start of the search pattern (the data chunk to search for within the dataset).
                                This is the smaller data chunk you are searching for.
        PatternSize (in):       The size (in bytes) of the search pattern.

    Return Value:
        Success: If the pattern is found, the function returns a pointer to the starting position of the matching chunk in the dataset.
        Failure: Returns 0 (&NULL) if the pattern is not found in the dataset.
;;

;;
    CompareSmallDataBlock
        This function performs a byte-by-byte comparison of a small dataset (<= 16 bytes) with a search pattern.
        It is used as a fallback when the dataset is too small for SIMD optimizations.

    Parameters:
        pDataBuffer (in):       Pointer to the start of the dataset.
        DataSize (in):          The size (in bytes) of the dataset.
        pSearchPattern (in):    Pointer to the start of the search pattern.
        PatternSize (in):       The size (in bytes) of the search pattern.

    Return Value:
        Success: If the pattern is found, the function returns a pointer to the starting position of the match.
        Failure: Returns 0 (&NULL) if the pattern is not found.
;;

;;
    VerifyPatternMatch
        This function performs a detailed comparison of a search pattern with a dataset block.
        It is used after a potential match is found in the main function to confirm the match.

    Parameters:
        pDataBuffer (in):       Pointer to the start of the dataset block.
        pSearchPattern (in):    Pointer to the start of the search pattern.
        PatternSize (in):       The size (in bytes) of the search pattern.

    Return Value:
        Success: If the pattern matches, the function returns a pointer to the starting position of the match.
        Failure: Returns 0 (&NULL) if the pattern does not match.
;;


And below is a example (I hope) how to call this function:

; Define data and code sections
.data
    ; Define example data
    haystack db "This is a simple string to search within. While trying to find sinp from sinples string", 0
    needle db "search wi", 0

    ; Variables to store string lengths
    haystack_len dd ?
    needle_len dd ?

    ; Variable to store the search result
    result dd ?

.code
start:
    ; Calculate the length of the haystack string
    invoke lstrlen, addr haystack
    mov haystack_len, eax

    ; Calculate the length of the needle string
    invoke lstrlen, addr needle
    mov needle_len, eax

    ; Call the SIMD_BinaryScan function
    invoke SIMD_BinaryScan, addr haystack, haystack_len, addr needle, needle_len
    mov result, eax

    ; Check the result
    .if result == 0
        ; If the result is 0, the pattern was not found.
        print "Pattern not found.", 13, 10
    .else
        ; If the result is non-zero, the pattern was found.
        print "Pattern found at position: "
        print str$(result), 13, 10
    .endif

    ; Terminate the program
    invoke ExitProcess, 0
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

zedd151

Hi guga,

I got the source code from the masm compatible .zip file to assemble okay. It will take a little time to get it ready to test it.

But....
But upon actually reading post #1 I see that you have a 'fixed' masm version, with proper names, etc. and should be ready to assemble. I did not read the entire post before downloading the .zip file.  :tongue:
¯\_(ツ)_/¯   :azn:

'As we don't do "requests", show us your code first.'  -  hutch—

zedd151

After copying each masm compatible procedure from post #1 and the macros there to a new file. I had to correct the starting line and ending line for each procedure.

After fixing that minor annoyance and adding some necessities, upon attempting to assemble the file:
Microsoft (R) Macro Assembler Version 14.00.24210.0
Copyright (C) Microsoft Corporation.  All rights reserved.

 Assembling: C:\Users\Administrator\Desktop\SIMD_BinaryScan_masm.asm

***********
ASCII build
***********

C:\Users\Administrator\Desktop\SIMD_BinaryScan_masm.asm(10) too large for specified size
C:\Users\Administrator\Desktop\SIMD_BinaryScan_masm.asm(10) : C:\Users\Administrator\Desktop\SIMD_BinaryScan_masm.asm(11) too large for specified size
C:\Users\Administrator\Desktop\SIMD_BinaryScan_masm.asm(11) : error A¿C:\Users\Administrator\Desktop\SIMD_BinaryScan_masm.asm(12) too large for specified size
C:\Users\Administrator\Desktop\SIMD_BinaryScan_masm.asm(12) : fatal 掟睦w_
Assembly Error
Press any key to continue . . .

I don't think ml.exe likes your macros, guga.

The macros and masm compatible procedures from post #1 are in this source code, it would have been easier if they were all in the same code block in post #1:
include \masm32\include\masm32rt.inc
.xmm

SIMD_BinaryScan        proto :DWORD, :DWORD, :DWORD, :DWORD
CompareSmallDataBlock  proto :DWORD, :DWORD, :DWORD, :DWORD
VerifyPatternMatch      proto :DWORD, :DWORD, :DWORD

; ---------------------------------------------------------------------------

if macro condition
    .if condition
endif

else macro
    .else
endif

elseif macro condition
    .elseif condition
endif

endif macro
    .endif
endif

; MASM Macros for loops
repeat macro
    .repeat
endm

until macro condition
    .until condition
endm

; MASM Macros for bit manipulation
bsf macro dest, src
    bsf dest, src
endm

bsr macro dest, src
    bsr dest, src
endm

; MASM Macros for SSE2 operations
SSE2_EXPAND_BYTE macro xmmreg, byteval
    movd xmmreg, byteval
    punpcklbw xmmreg, xmmreg
    punpcklwd xmmreg, xmmreg
    pshufd xmmreg, xmmreg, 0
endm

CLEAR_LEFTMOST_SET16 macro reg
    mov ax, reg      ; Copia o valor do registrador para AX.
    add reg, -1      ; Subtrai 1 do registrador (inverte todos os bits até o primeiro bit definido).
    and reg, ax      ; Aplica a máscara para limpar o bit mais à esquerda.
endm

GET_FIRST_BIT_SET macro reg
    bsf eax, reg      ; Encontra o índice do primeiro bit definido em `reg` e armazena em EAX.
endm

GET_LAST_BIT_SET macro reg
    bsr eax, reg      ; Encontra o índice do último bit definido em `reg` e armazena em EAX.
endm

; ---------------------------------------------------------------------------

.code

; ---------------------------------------------------------------------------

SIMD_BinaryScan proc pDataBuffer:DWORD, DataSize:DWORD, pSearchPattern:DWORD, PatternSize:DWORD
local FirstBitPos:DWORD, ScanLimit:DWORD, StringEnd:DWORD
local XmmPreserve[20]:BYTE
    xor eax, eax
    mov esi, [DataSize]
    if (esi < [PatternSize]) || ([PatternSize] == 0)
        mov eax, 0
        ret
    elseif ([DataSize] <= 16)
        invoke CompareSmallDataBlock, [pDataBuffer], [DataSize], [pSearchPattern], [PatternSize]
        ret
    endif
    lea eax, [XmmPreserve]
    movdqu [eax], xmm0
    movdqu [eax+16], xmm1
    movdqu [eax+32], xmm2
    movdqu [eax+48], xmm3
    mov esi, [pSearchPattern]
    mov ebx, [PatternSize]
    movzx eax, byte ptr [esi]
    SSE2_EXPAND_BYTE xmm2, eax
    lea eax, [esi+ebx-1]
    movzx eax, byte ptr [eax]
    SSE2_EXPAND_BYTE xmm3, eax
    mov esi, [pDataBuffer]
    mov edi, [DataSize]
    mov eax, [PatternSize]
    shr eax, 4
    add eax, 16
    sub edi, eax
    if edi > 16
        mov edi, [DataSize]
        sub edi, [PatternSize]
    endif
    lea eax, [esi+edi]
    mov [ScanLimit], eax
    mov eax, [DataSize]
    lea eax, [esi+eax]
    mov [StringEnd], eax
    .repeat
        mov [FirstBitPos], 0
        movups xmm0, [esi]
        mov eax, [PatternSize]
        movups xmm1, [esi+eax-1]
        pcmpeqb xmm0, xmm2
        pcmpeqb xmm1, xmm3
        pand xmm1, xmm0
        pmovmskb ebx, xmm1
        .if ebx != 0
            .repeat
                GET_FIRST_BIT_SET ebx
                mov [FirstBitPos], eax
                lea edi, [esi+eax]
                invoke VerifyPatternMatch, edi, [pSearchPattern], [PatternSize]
                .if eax != 0
                    jmp L2
                .endif
                CLEAR_LEFTMOST_SET16 bx
            .until bx == 0
            mov eax, [FirstBitPos]
            add eax, [PatternSize]
        .else
            pmovmskb eax, xmm0
            .if eax != 0
                GET_LAST_BIT_SET eax
                inc eax
            .else
                mov eax, 16
            .endif
        .endif
        add esi, eax
    .until esi > [ScanLimit]
    xor eax, eax
    .if [FirstBitPos] == 0
        mov eax, [StringEnd]
        sub eax, esi
        .if [PatternSize] <= eax
            invoke CompareSmallDataBlock, esi, eax, [pSearchPattern], [PatternSize]
        .else
            xor eax, eax
        .endif
    .endif
L2:
    lea esi, [XmmPreserve]
    movdqu xmm0, [esi]
    movdqu xmm1, [esi+16]
    movdqu xmm2, [esi+32]
    movdqu xmm3, [esi+48]
    ret
SIMD_BinaryScan endp

; ---------------------------------------------------------------------------

CompareSmallDataBlock proc pDataBuffer:DWORD, DataSize:DWORD, pSearchPattern:DWORD, PatternSize:DWORD
local EndString:DWORD
    mov esi, [pDataBuffer]
    mov eax, [DataSize]
    lea eax, [esi+eax]
    sub eax, [PatternSize]
    mov [EndString], eax
    mov edi, [pSearchPattern]
    .repeat
        mov ecx, [PatternSize]
        xor eax, eax
        .repeat
            movzx edx, byte ptr [esi+eax]
            cmp dl, byte ptr [edi+eax]
            jne L1
            inc eax
            dec ecx
        .until ecx == 0
        mov eax, esi
        ret
    L1:
        inc esi
    .until esi > [EndString]
    xor eax, eax
    ret
CompareSmallDataBlock endp

; ---------------------------------------------------------------------------

VerifyPatternMatch proc pDataBuffer:DWORD, pSearchPattern:DWORD, PatternSize:DWORD
local Reminder:DWORD
    mov edx, [PatternSize]
    mov esi, [pDataBuffer]
    mov edi, [pSearchPattern]
    and edx, -16
    mov eax, edx
    xor edx, [PatternSize]
    .if edx != 0
        lea esi, [esi+eax]
        lea edi, [edi+eax]
        mov dh, dl
        and dl, -4
        xor dh, dl
        .if dl != 0
            mov eax, [esi]
            cmp eax, [edi]
            jnz L4
            add esi, 4
            add edi, 4
            add dl, -4
            jz L2
            mov eax, [esi]
            cmp eax, [edi]
            jnz L4
            add esi, 4
            add edi, 4
            add dl, -4
            jz L2
            mov eax, [esi]
            cmp eax, [edi]
            jnz L4
            add esi, 4
            add edi, 4
        .endif
    L2:
        .if dh != 0
            mov eax, 0
            mov dl, byte ptr [esi]
            cmp dl, byte ptr [edi]
            jnz L4
            dec dh
            jz L2
            mov dl, byte ptr [esi+1]
            cmp dl, byte ptr [edi+1]
            jnz L4
            dec dh
            jz L2
            mov dl, byte ptr [esi+2]
            cmp dl, byte ptr [edi+2]
            jnz L4
        .endif
        mov eax, [pDataBuffer]
    .else
        mov eax, esi
    .endif
L4:
    ret
VerifyPatternMatch endp

start:

end start
I can go no further with this.
¯\_(ツ)_/¯   :azn:

'As we don't do "requests", show us your code first.'  -  hutch—