News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

a fast function to scan byte

Started by TouEnMasm, September 04, 2012, 10:00:55 PM

Previous topic - Next topic

TouEnMasm


I haved a look on the strlen function.
Thinking she could be of good help in this matter,i have made some modifies on it
Here is the result:
Quote

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
ALIGN   4
;adress,length in byte,mask
;mask repeat 4 times the byte to search
;example : 34343434h search for "4"
;or carsearch db "4444",0
;push dword ptr [carsearch]
Fastscanbyte proc item:DWORD,longueur:DWORD,masque:DWORD
   push    ebp                                   
   mov     ebp, esp                               
   push    edi                                   
   push    ebx   
   ;count the number of dword                                 
   mov     eax, dword ptr [ebp+0CH]               
   test    al, 03H           ;fraction of a dword                     
   jnz     ?_009                                   
   shr     eax, 2                                 
   jmp     ?_010                                   
?_009:  shr     eax, 2                                 
   inc     eax                                     
?_010:  mov     dword ptr [ebp+0CH], eax
   ;--------------------------------               
   mov     ebx, dword ptr [ebp+10H]               
   mov     eax, dword ptr [ebp+8H]                 
   lea     edx, [eax+3H]                           
readdword:
   mov     edi, dword ptr [eax]                   
                ;----- xor it with mask ------------------
   xor     edi, ebx                         
                ;--------------------------------------------     
   add     eax, 4                                 
   lea     ecx, [edi-1010101H]                     
   not     edi                                     
   and     ecx, edi                               
   and     ecx, 80808080H                         
   jnz     nxt                                   
   dec     dword ptr [ebp+0CH]      ;verify the length of the search               
   jnz     readdword                                   
   mov     eax, 0                                 
   jmp     afin                                   

nxt: 
   test    ecx, 8080H                             
   jnz     ?_013                                   
   shr     ecx, 16                                 
   add     eax, 2                                 
?_013:  shl     cl, 1                                   
   sbb     eax, edx                               
   inc     eax                                     
afin:
   pop     ebx                                     
   pop     edi 
   leave                                   
   ret     12
Fastscanbyte endp                                        
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef



Fa is a musical note to play with CL

jj2007

Seems to work :t

AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
49      cycles for MasmBasic Len, result=100
284     cycles for MasmBasic Instr, result=100
156     cycles for Fastscanbyte, len=-1, result=100
314     cycles for Fastscanbyte, len(src), result=100

53      cycles for MasmBasic Len, result=100
284     cycles for MasmBasic Instr, result=100
176     cycles for Fastscanbyte, len=-1, result=100
315     cycles for Fastscanbyte, len(src), result=100

Vortex

Intel(R) Pentium(R) 4 CPU 3.20GHz (SSE3)
117     cycles for MasmBasic Len, result=100
612     cycles for MasmBasic Instr, result=100
202     cycles for Fastscanbyte, len=-1, result=100
454     cycles for Fastscanbyte, len(src), result=100

64      cycles for MasmBasic Len, result=100
592     cycles for MasmBasic Instr, result=100
200     cycles for Fastscanbyte, len=-1, result=100
457     cycles for Fastscanbyte, len(src), result=100

KeepingRealBusy

Wouldn't it be faster to just use SSE (16 bytes at a time)?


;-------------------------------------------------------------------------------
;   SSEStrchr - Scan szSource for match with a character or a Null,
;   (INVOKE SSEStrchr,pszSource,cbMatch,pMatchLoc - returns a Null or the
;   MatchPoint in pMatchLoc, uses xmm0, xmm1, xmm2).
;-------------------------------------------------------------------------------
ALIGN       OWORD
SSEStrchr   PROC,
            pszSource:PBYTE,
            cbMatch:DWORD,
            pMatchLoc:PDWORD

     push      eax                           ;   Save regs
     push      edx

     mov       eax,[esp+(3*oPointerSize)]    ;   Get the source string pointer.
     movzx     edx,BYTE PTR [esp+(4*oPointerSize)]  ;   Get the match character.

    movdqu    xmm2,[eax]                    ;   Get the first 16 BYTES of the source string.
     add       dh,dl                         ;  Copy the match character to dh.
     jnz       FindFirstChar                 ;   Not a null character search.
    pxor      xmm0,xmm0                     ;   Clear all characters in xmm0.
     jmp       FindFirstNull                 ;   Simple scan for null.
;
;   dh=dl=matchchar,xmm0 will be matchs,xmm1 will be nulls,xmm2=source.
;
ALIGN   OWORD
FindFirstChar:
    movd      xmm0,edx                      ;   Move the match character pair to the lowest WORD of xmm0.
     pxor      xmm1,xmm1                     ;   Clear all characters in xmm1.
    pshuflw   xmm0,xmm0,0                   ;   Interleave the lowest WORD to the lowest DWORD in xmm0.
     pcmpeqb   xmm1,xmm2                     ;   Compare 16 BYTES of the source to 16 nulls.
    pshufd    xmm0,xmm0,0                   ;   Copy the lowest DWORD to all DWORDS in xmm0.
     pcmpeqb   xmm2,xmm0                     ;   Compare 16 BYTES of the source to 16 match characters.
     por       xmm2,xmm1                     ;   Or the results of the two compares.
    pmovmskb  edx,xmm2                      ;   Return a 1 for each matched character to the low 16 BITS of edx.
     or        edx,edx                       ;   Any match or null match found?
     jnz       FoundChar                     ;   Yes.
     and       eax,-16                       ;   Mask the source pointer to a mod 16 bound for aligned compares.
    movdqa    xmm2,xmm0                     ;   Save the 16 match characters in xmm2.
;
;   xmm0=xmm2=matches,xmm1=nulls,[eax+16]=source.
;
ALIGN   OWORD
FindNextChar:
     lea       eax,[eax+16]                  ;   Increment the source string pointer by 16.
    pcmpeqb   xmm2,[eax]                 ;   Compare 16 match characters to 16 BYTES of the source.
    pcmpeqb   xmm1,[eax]                 ;   Compare 16 nulls to 16 BYTES of the source.
    por       xmm1,xmm2                     ;   Or the results of the two compares.
     movdqa    xmm2,xmm0                     ;   Move the 16 match characters to xmm2.
    pmovmskb  edx,xmm1                      ;   Return a 1 for each matched character to the low 16 BITS of edx.
     or        edx,edx                       ;   Any match or null match found?
     jz        FindNextChar                  ;   No, check again.
     jmp       FoundChar

ALIGN   OWORD
FoundChar:
     bsf       edx,edx                       ;   Move the BIT number of the lowest BIT in edx to edx.
     lea       eax,[eax+edx]                 ;   Add the BIT number to the pointer in eax.
     xor       edx,edx                       ;   Clear edx as a null response for the SSEStrchr call.
     cmp       [eax],dl                      ;   Does the match character match a null?
     cmove     eax,edx                       ;   If so, return a null response.
     jmp       Exit                          ;   Go to save match, restore regs, and return.
;
;   xmm0=nulls,xmm2=source.
;
ALIGN   OWORD
FindFirstNull:
    pcmpeqb   xmm2,xmm0                     ;   Compare 16 BYTES of the source to 16 nulls.
    pmovmskb  edx,xmm2                      ;   Return a 1 for each matched character to the low 16 BITS of edx.
     or        edx,edx                       ;   Any match or null match found?
     jnz       FoundNull                     ;   Yes.
     and       eax,-16                       ;   Mask the source pointer to a mod 16 bound for aligned compares.
;
;   xmm0=nulls,xmm2=source.
;
ALIGN   OWORD
FindNextNull:
    pcmpeqb   xmm0,[eax+16]                 ;   Compare 16 nulls to 16 BYTES of the source.
     lea       eax,[eax+16]                  ;   Increment the source string pointer by 16.
    pmovmskb  edx,xmm0                      ;   Return a 1 for each matched character to the low 16 BITS of edx.
     or        edx,edx                       ;   Any match or null match found?
     jz        FindNextNull                  ;   No, check again.

FoundNull:
     bsf       edx,edx                       ;   Move the BIT number of the lowest BIT in edx to edx.
     add       eax,edx                       ;   Add the BIT number to the pointer in eax.
;
;   Exit SSEStrchr.
;
Exit:
     mov       edx,[esp+(5*oPointerSize)]    ;   Get the destination for the match location.
     mov       [edx],eax                     ;   Save the match pointer where it was directed.

     pop       edx                           ;   Restore regs
     pop       eax

;     cmp    dErrorPkg,0                      ;   Return any error.
     ret       12                            ;   Exit PROC SSEStrchr.
SSEStrchr   ENDP
SSEStrchrLen EQU ($-SSEStrchr)

;-------------------------------------------------------------------------------
;   End of PROC SSEStrchr.
;-------------------------------------------------------------------------------




Dave.

jj2007

Quote from: KeepingRealBusy on September 05, 2012, 06:25:59 AM
Wouldn't it be faster to just use SSE (16 bytes at a time)?

That's why MasmBasic Len() is so fast:

Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (SSE3)
37      cycles for MasmBasic Len, result=100
537     cycles for MasmBasic Instr, result=100
164     cycles for Fastscanbyte, len=-1, result=100
305     cycles for Fastscanbyte, len(src), result=100

37      cycles for MasmBasic Len, result=100
534     cycles for MasmBasic Instr, result=100
164     cycles for Fastscanbyte, len=-1, result=100
305     cycles for Fastscanbyte, len(src), result=100


But it isn't trivial because of the alignment and buffer bounds issues. See e.g. szLen optimize ("Read 42623 times"). I can also recommend, especially to you, Dave, this thread  :P

dedndave

i don't get the "len=-1" part   :redface:

you might want to test it on strings with different alignments

jj2007

Yves' algo requires to supply the string len, which costs extra cycles if you don't know it in advance. You can supply -1 but that works only if there is a match :P

TouEnMasm


scanning for any byte is not like scanning for zero byte.
If you are pretty sure to find a zero , it isn't the same thing for any byte.
The control of the length searched is needed here.You can just pass an estimated max length.

SSE is useless in small chain but very usefull searching in a file.
Fa is a musical note to play with CL

TouEnMasm


Here is a sample scanning for end of line with SSE.
Retur the number of lines.
The memory could be aligned or not,no problem.
It is the good method to use SSE searching speed.
Quote
CompteurLignes PROC uses ebx edi esi pmem:DWORD,taille:DWORD
         Local  Nblines:DWORD,count,reste
         local  theEnd:dword
   ;init
   mov Nblines,0
   mov reste,0   
   mov edx,pmem
   add edx,taille
   mov theEnd,edx
   mov edx,pmem
   mov esi,edx   
   and edx,0Fh
   .if edx != 0
      ;search lines in the non align memory
      mov ecx,16
      sub ecx,edx
      @@:
      .if byte ptr [esi] != 0
         .if word ptr [esi] == 0A0Dh
            inc Nblines         
         .endif
      .else
         mov eax,Nblines
         jmp FindeCompteurLignes
      .endif
      inc esi
      dec ecx
      jnz @B      
   .endif
   ;esi point on a 16 aligned memory
   ;count the number  of 32 bytes parts
   mov edx,0
   mov eax,theEnd
   sub eax,esi
   .if eax == 0
      mov eax,Nblines
      jmp FindeCompteurLignes      
   .endif
   .if eax < 32
      mov reste,eax
      mov eax,Nblines      
      jmp EndNonaligned
   .endif
   mov edx,0
   mov ecx,32
   div ecx
   mov count,eax      
   mov reste,edx
   ;--------------------------  search in aligned part -------------      
   ;init of various register
   mov eax, 0d0d0d0dh   ; Ascii 10, linefeed
   movd xmm6, eax
   pshufd xmm6, xmm6, 0   ; linefeeds for comparison in xmm2   
   mov eax,Nblines   ;line counter
   ;ready
   NewBloc:
      ;------ align 16 needed ----------
      ;1731187 cycles for 22274 lines
      movdqa xmm1,xmm6         ;charge 13
      movdqa xmm2,xmm6         ;charge 13      
      pcmpeqb  xmm1,[esi]      ;cmp with memory align 16      
      pcmpeqb  xmm2,[esi+16]      ;cmp with memory ,align 16                  
      pmovmskb ecx, xmm1 ; result in ecx
      pmovmskb edx, xmm2 ; result +16 edx
      shl edx,16
      add ecx,edx
      jz suite
      NbLineBreak:
      bsf   edx,   ecx
      jz suite
      .if    word   ptr [edx+esi] == 0A0Dh
         inc   eax
      .endif
      btr   ecx,   edx
      jmp NbLineBreak
   suite:
   lea esi,[esi+32]
   dec count
   jnz NewBloc
   
EndNonaligned:   
   .if reste != 0
      mov ecx,reste
      @@:
      .if word ptr [esi] == 0A0Dh
         inc eax
      .endif   
      inc esi
      dec ecx
      jnz @B                  
   .endif
   
FindeCompteurLignes:
ret
CompteurLignes endp
Fa is a musical note to play with CL

qWord

MREAL macros - when you need floating point arithmetic while assembling!

hutch--

For the many algos used to calculate string lengths, few give any real advantage as the vast majority of strings are very short, under 1k. Now if you were only scanning at 1 meg a second a 1k string still occurs in 1 thousanth of a second and of course any computer in the last 20 years is a lot faster than that. Unless I have some reason to try and scan long strings, ( > 1 meg and much larger) I stick to short and simple code. Example below.



IF 0  ; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
                      Build this template with "CONSOLE ASSEMBLE AND LINK"
ENDIF ; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    include \masm32\include\masm32rt.inc

    slen PROTO :DWORD

  ; --------------------
    getsl MACRO ptxt
      LOCAL lbl0
      mov eax, ptxt
      sub eax, 1
    lbl0:
      add eax, 1
      cmp BYTE PTR [eax], 0
      jne lbl0
      sub eax, ptxt                 ;; result = length in EAX
      EXITM <eax>
    endm
  ; --------------------
    slenth MACRO ptxt
      LOCAL lbl0
      mov eax, ptxt
      sub eax, 1
    lbl0:
      add eax, 1
      cmp BYTE PTR [eax], 0
      jne lbl0
      sub eax, ptxt                 ;; result = length in EAX
    endm
  ; --------------------

    .code

start:
   
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    call main
    inkey
    exit

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

main proc

    LOCAL txt   :DWORD

    sas txt,"12345678901234567890"

    print ustr$(rv(slen,txt)),13,10 ; procedure call

    print ustr$(getsl(txt)),13,10   ; macro function

    slenth txt                      ; simple macro
    print ustr$(eax),13,10

    ret

main endp

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

slen proc pstr:DWORD

    mov eax, [esp+4]
    sub eax, 1

  lbl0:
    add eax, 1
    cmp BYTE PTR [eax], 0
    jne lbl0

    sub eax, [esp+4]

    ret 4

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

slen endp

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

end start


There are many variations of a scanner written inline in an algo, these are just a few simple ones.

TouEnMasm

The SSE example i have posted is in use in the translator.
He win about 10 minutes on a 86 Mo sum of text files.
Fa is a musical note to play with CL

jj2007

Ten minutes??? That's odd. It takes less than half a second to find a string in the 77MB of the Windows SDK...

TouEnMasm


Quote
It takes less than half a second to find a string in the 77MB of the Windows SDK...
I am curious to see the routine who do that,post it here??????
Fa is a musical note to play with CL

jj2007

Quote from: ToutEnMasm on September 06, 2012, 12:23:48 AM

Quote
It takes less than half a second to find a string in the 77MB of the Windows SDK...
I am curious to see the routine who do that,post it here??????

Already posted - xHelp is part of MasmBasic. Download and extract, launch \Masm32\RichMasm\RichMasm.exe, select any text, e.g. SMALL_RECT, and hit F1. If the SDK folder is correctly registered in \Masm32\help\xHelp\xHelp.ini, it will take a few seconds until it is fully loaded, but then, any new search in the 77+ MB costs only a few milliseconds.

The source is in \Masm32\help\xHelp\xHelp.asc (asc=Assembler Source Code, open in RichMasm.exe)
xHelp looks for structure definitions, so SMALL_RECT will find WinCon.h
If you specify a partial or non-uppercase string, e.g. small_, it will find also WTS_SMALL_RECT in wtsdefs.h, and the search will take longer, but still under one second.