Author Topic: Word tokeniser.  (Read 444 times)

hutch--

  • Administrator
  • Member
  • ******
  • Posts: 5838
  • Mnemonic Driven API Grinder
    • The MASM32 SDK
Word tokeniser.
« on: May 22, 2018, 11:16:48 PM »
I went to write a toy recently and needed a word tokeniser and did not have one. This is the test piece for the algo that will be added to the main library.

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    include \masm32\include64\masm64rt.inc

    .code

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

; ------------------------------------------------------------------
; entry_point : test driver for wordtok.
; Tokenises a fixed comma separated test string with wordtok, then
; walks the resulting pointer array, passing each token to szTrim
; and then printing it on its own line.
; ptr$, mrm, rcall, conout and waitkey are macros defined outside
; this listing (presumably by the include file at the top); what is
; said about them below follows the original comments - TODO confirm
; against the macro definitions.
; ------------------------------------------------------------------
entry_point proc

    LOCAL pcmd  :QWORD                      ; pseudo cmd line pointer (address of the test text)
    LOCAL parr  :QWORD                      ; array pointer (address of arrm)
    LOCAL arrm[16] :QWORD                   ; array memory, 16 QWORD slots, one per token address
    LOCAL acnt  :QWORD                      ; token count returned by wordtok, used as loop counter
    LOCAL .r15  :QWORD                      ; save slot for the caller's r15

    mov .r15, r15                           ; preserve non volatile register

    mov parr, ptr$(arrm)                    ; get array pointer
    ; the test text has 10 comma separated fields padded with spaces,
    ; which fits in the 16 slots of arrm (wordtok does no bounds check)
    mrm pcmd, "one  ,  two,three  ,  four,five  ,  six,seven  ,  eight,nine  ,  ten"

    rcall wordtok,pcmd,parr,44              ; call the word parser, 44 = "," delimiter
    mov acnt, rax                           ; get the arg count

    mov r15, parr                           ; load array address into r15

    sub r15, 8                              ; set up loop : start one slot early as the loop adds 8 first
  lbl:
    add r15, 8                              ; add 8 byte offset : r15 = address of current array slot
    rcall szTrim,QWORD PTR [r15]            ; trim any junk from both ends
    conout QWORD PTR [r15],lf               ; display each word
    sub acnt, 1                             ; decrement the counter, sets ZF for the jnz below
    jnz lbl                                 ; loop back if not zero

    mov r15, .r15                           ; restore non volatile register

    waitkey

    invoke ExitProcess,0

    ret                                     ; presumably never reached as ExitProcess ends the process

entry_point endp

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

 NOSTACKFRAME

 wordtok proc                   ; pcmd:QWORD, parr:QWORD, dlmt:QWORD
  ; -------------------------------------------
  ; tokenise text based on a supplied delimiter
  ; text is parsed in place (no copy)
  ; pcmd = pointer to text to tokenise
  ; parr = array pointer
  ; dlmt = delimiter as user supplied ascii char as number IE: 44 = ","
  ; return value = word or text count
  ; -------------------------------------------
    mov r11, rcx                ; pcmd
    mov r10, rdx                ; parr
    xor r9,  r9                 ; use r9 as arg counter

    mov [r10], r11              ; load first start address
    add r10, 8
    add r9, 1                   ; increment counter
    sub r11, 1
  lbl0:
    add r11, 1
    movzx rax, BYTE PTR [r11]
    test rax, rax               ; test for and exit on terminator
    jz out1
    cmp rax, r8                 ; check if char is delimiter
    jne lbl0                    ; loop back if not
    mov BYTE PTR [r11], 0       ; terminate array member
    add r11, 1                  ; increment to next arg 1st char
    add r9, 1                   ; increment the arg counter
    mov [r10], r11              ; load next text address into r10
    add r10, 8                  ; add 8 for next array member
    jmp lbl0

  out1:
    mov rax, r9                 ; return the arg count
    ret

 wordtok endp

 STACKFRAME

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

 ;  wordtok proc pcmd:QWORD, parr:QWORD, dlmt:QWORD
 ;   ; -------------------------------------------
 ;   ; tokenise text based on a supplied delimiter
 ;   ; text is parsed in place (no copy)
 ;   ; pcmd = pointer to text to tokenise
 ;   ; parr = array pointer
 ;   ; dlmt = delimiter as user supplied ascii char as number IE: 44 = ","
 ;   ; return value = word or text count
 ;   ; -------------------------------------------
 ;     mov r11, pcmd
 ;     mov r10, parr
 ;     xor r9,  r9                 ; use r9 as arg counter
 ;
 ;     mov [r10], r11              ; load first start address
 ;     add r10, 8
 ;     add r9, 1                   ; increment counter
 ;     sub r11, 1
 ;   lbl0:
 ;     add r11, 1
 ;     movzx rax, BYTE PTR [r11]
 ;     test rax, rax               ; test for and exit on terminator
 ;     jz out1
 ;     cmp rax, dlmt               ; check if char is delimiter
 ;     jne lbl0                    ; loop back if not
 ;     mov BYTE PTR [r11], 0       ; terminate array member
 ;     add r11, 1                  ; increment to next arg 1st char
 ;     add r9, 1                   ; increment the arg counter
 ;     mov [r10], r11              ; load next text address into r10
 ;     add r10, 8                  ; add 8 for next array member
 ;     jmp lbl0
 ;
 ;   out1:
 ;     mov rax, r9                 ; return the arg count
 ;     ret
 ;
 ;  wordtok endp

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    end
hutch at movsd dot com
http://www.masm32.com    :biggrin:  :biggrin:

zedd151

  • Member
  • ****
  • Posts: 850
Re: Word tokeniser.
« Reply #1 on: May 23, 2018, 12:53:15 AM »
Would be real interesting to see how fast it will run on normal text.  I'm thinking like the Bible or "War and Peace". I'm  pretty sure it will run very fast.  In place tokenizing,  using a table for delimiters.
I'm not always the sharpest knife in the drawer, but I have my moments.  :P

hutch--

  • Administrator
  • Member
  • ******
  • Posts: 5838
  • Mnemonic Driven API Grinder
    • The MASM32 SDK
Re: Word tokeniser.
« Reply #2 on: May 23, 2018, 01:16:19 AM »
Z,

That can be done, but it's a different algorithm. You would probably use a character table with omissions in the table, where "0" would function as the delimiter, but this would only delimit single words. Because the algo above uses a character delimiter, it can handle multiple words and can handle stuff like "long file name.exe" without having to quote it.
hutch at movsd dot com
http://www.masm32.com    :biggrin:  :biggrin:

sinsi

  • Member
  • *****
  • Posts: 1072
Re: Word tokeniser.
« Reply #3 on: May 23, 2018, 01:32:18 AM »
Code: [Select]
wordtok proc ; pcmd:QWORD, parr:QWORD, dlmt:QWORD
        ; rcx = zero terminated text, rdx = pointer array, r8 = delimiter char
        ; returns the token count in eax (upper half of rax is zero)
        ; delimiters are overwritten with 0 in place, rcx and r9 are modified
        xor eax,eax                     ; token count = 0
record: mov [rdx+rax*8],rcx             ; array[count] = start of current token
        add eax,1
scan:   movzx r9d,BYTE PTR [rcx]        ; r9 = current char, zero extended
        add rcx,1                       ; rcx = 1 past the char just read
        test r9d,r9d
        jz finish                       ; terminator ends the scan
        cmp r9,r8
        jne scan                        ; ordinary char, keep scanning
        mov BYTE PTR [rcx-1],0          ; cut the token at the delimiter
        jmp record                      ; rcx is the start of the next token
finish: ret
wordtok endp
I can walk on water but stagger on beer.

hutch--

  • Administrator
  • Member
  • ******
  • Posts: 5838
  • Mnemonic Driven API Grinder
    • The MASM32 SDK
Re: Word tokeniser.
« Reply #4 on: May 23, 2018, 02:07:00 AM »
sinsi,

Looks good, worked OK here.
hutch at movsd dot com
http://www.masm32.com    :biggrin:  :biggrin:

hutch--

  • Administrator
  • Member
  • ******
  • Posts: 5838
  • Mnemonic Driven API Grinder
    • The MASM32 SDK
Re: Word tokeniser.
« Reply #5 on: May 23, 2018, 04:07:01 AM »
I have cleaned it up to better suit the no stack frame proc, but it's still 2 instructions longer than your version. The use of the compound instruction "mov [rdx+rax*8],rcx" saves you at least 1 instruction. Both it and your version can be unrolled, but it's probably a case of diminishing returns for what an algo of this type is used for.

 NOSTACKFRAME

 wordtok proc                   ; psrc:QWORD, parr:QWORD, delm:QWORD

    xor rax, rax                ; clear the counter
    mov [rdx], rcx              ; load start address into 1st array member
    sub rcx, 1

  pre:
    add rdx, 8                  ; add 8 for next array member
    add rax, 1                  ; increment the arg counter

  lbl0:
    add rcx, 1
    movzx r9, BYTE PTR [rcx]
    test r9, r9                 ; test for and exit on terminator
    jz bye
    cmp r9, r8                  ; check if char is delimiter
    jne lbl0                    ; loop back if not

    mov BYTE PTR [rcx], 0       ; terminate array member
    add rcx, 1                  ; increment to next arg 1st char
    mov [rdx], rcx              ; load next text address into rdx
    jmp pre

  bye:
    ret

 wordtok endp

 STACKFRAME
hutch at movsd dot com
http://www.masm32.com    :biggrin:  :biggrin:

daydreamer

  • Member
  • ****
  • Posts: 548
  • reach for the stars
Re: Word tokeniser.
« Reply #6 on: May 24, 2018, 03:39:45 AM »
nice work Hutch,wanna take a look at sztrim proc



Quote from Flashdance
Nick  :  When you give up your dream, you die.
*wears a flameproof asbestos suit*

hutch--

  • Administrator
  • Member
  • ******
  • Posts: 5838
  • Mnemonic Driven API Grinder
    • The MASM32 SDK
Re: Word tokeniser.
« Reply #7 on: May 24, 2018, 04:03:16 AM »
This is the version that I added to the library. I used the technique that sinsi designed to make the terminate and array increment faster, and unrolled the 6-instruction loop.

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

 NOSTACKFRAME

 wordtok proc                   ; psrc:QWORD, parr:QWORD, delm:QWORD

  ; -------------------------------------------
  ; tokenise text based on a supplied delimiter
  ; text is tokenised in place (no copy)
  ; rcx : psrc = pointer to text to tokenise
  ; rdx : parr = address of pointer array
  ; r8  : dlmt = delimiter as user supplied ascii char as number IE: 44 = ","
  ; return value = word or text count
  ; EXAMPLE : invoke wordtok,psrc,parr,44
  ; -------------------------------------------

    xor rax, rax                ; clear the counter

  pre:
    mov [rdx+rax*8], rcx        ; load next text address into rdx
    add rax, 1                  ; increment the arg counter

  lbl0:
    add rcx, 1
    movzx r11, BYTE PTR [rcx]
    test r11, r11               ; test for and exit on terminator
    jz bye
    sub r11, r8                 ; check if char is delimiter
    jz lbl1                     ; branch if delimiter

    add rcx, 1
    movzx r11, BYTE PTR [rcx]
    test r11, r11               ; test for and exit on terminator
    jz bye
    sub r11, r8                 ; check if char is delimiter
    jnz lbl0                    ; loop back if not

  lbl1:
    mov BYTE PTR [rcx], 0       ; terminate array member
    add rcx, 1                  ; increment to next arg 1st char
    jmp pre

  bye:
    ret

 wordtok endp

 STACKFRAME

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
hutch at movsd dot com
http://www.masm32.com    :biggrin:  :biggrin: