I went to write a toy recently, needed a word tokeniser and did not have one. This is the test piece for the algorithm that will be added to the main library.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
include \masm32\include64\masm64rt.inc
.code
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
entry_point proc
; -------------------------------------------------------
; Demo driver for wordtok: splits a comma separated test
; string in place, then trims and prints every token.
; -------------------------------------------------------
    LOCAL ptxt      :QWORD              ; address of the text to split
    LOCAL ptab      :QWORD              ; address of the token pointer table
    LOCAL slots[16] :QWORD              ; storage for the token pointer table
    LOCAL left      :QWORD              ; tokens still to be displayed
    LOCAL sav15     :QWORD              ; holds the caller's r15

    mov sav15, r15                      ; r15 is non volatile, keep the caller's value

    mov ptab, ptr$(slots)               ; table pointer -> local storage
    mrm ptxt, "one , two,three , four,five , six,seven , eight,nine , ten"

    rcall wordtok,ptxt,ptab,44          ; split on "," (ascii 44)
    mov left, rax                       ; rax = number of tokens found

    mov r15, ptab                       ; r15 walks the pointer table

  show:
    rcall szTrim,QWORD PTR [r15]        ; strip the padding around the token
    conout QWORD PTR [r15],lf           ; print it followed by a line feed
    add r15, 8                          ; step to the next table slot
    sub left, 1                         ; one less token to show
    jnz show                            ; keep going until all are shown

    mov r15, sav15                      ; give r15 back to the caller

    waitkey
    invoke ExitProcess,0

    ret
entry_point endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
NOSTACKFRAME
wordtok proc                            ; pcmd:QWORD, parr:QWORD, dlmt:QWORD
  ; ---------------------------------------------------------------
  ; Tokenise zero terminated text on a single delimiter character.
  ; The text is split in place (no copy): every delimiter byte is
  ; overwritten with 0 and the start address of each token is
  ; written to consecutive QWORD slots of the caller's array.
  ;
  ;   rcx : pcmd = pointer to the text to tokenise (modified)
  ;   rdx : parr = pointer to the QWORD array receiving token addresses
  ;   r8  : dlmt = delimiter as an ascii value, IE: 44 = ","
  ;   rax : return value = number of tokens stored (always >= 1)
  ;
  ; Empty tokens are kept: "a,,b," yields "a", "", "b", "".
  ; The array is NOT bounds checked, the caller must supply one
  ; slot per delimiter in the text plus one.
  ; Uses r9, r10, r11 as scratch; no stack is used.
  ; ---------------------------------------------------------------
    mov r11, rcx                        ; r11 = scan position
    mov r10, rdx                        ; r10 = next free array slot
    xor r9, r9                          ; r9  = token counter

  newtok:
    mov [r10], r11                      ; record where this token starts
    add r10, 8                          ; advance to the next array slot
    add r9, 1                           ; one more token

  lbl0:
    movzx rax, BYTE PTR [r11]           ; read BEFORE advancing so that the first
    test rax, rax                       ; byte of every token is examined
    jz out1                             ; terminator ends the scan
    add r11, 1                          ; step past the byte just read
    cmp rax, r8                         ; was it the delimiter ?
    jne lbl0                            ; no, keep scanning

    mov BYTE PTR [r11-1], 0             ; yes, terminate the token in place
    jmp newtok                          ; r11 already addresses the next token

  out1:
    mov rax, r9                         ; return the token count
    ret
wordtok endp
STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; wordtok proc pcmd:QWORD, parr:QWORD, dlmt:QWORD
; ; -------------------------------------------
; ; tokenise text based on a supplied delimiter
; ; text is parsed in place (no copy)
; ; pcmd = pointer to text to tokenise
; ; parr = array pointer
; ; dlmt = delimiter as user supplied ascii char as number IE: 44 = ","
; ; return value = word or text count
; ; -------------------------------------------
; mov r11, pcmd
; mov r10, parr
; xor r9, r9 ; use r9 as arg counter
;
; mov [r10], r11 ; load first start address
; add r10, 8
; add r9, 1 ; increment counter
; sub r11, 1
; lbl0:
; add r11, 1
; movzx rax, BYTE PTR [r11]
; test rax, rax ; test for and exit on terminator
; jz out1
; cmp rax, dlmt ; check if char is delimiter
; jne lbl0 ; loop back if not
; mov BYTE PTR [r11], 0 ; terminate array member
; add r11, 1 ; increment to next arg 1st char
; add r9, 1 ; increment the arg counter
; mov [r10], r11 ; load next text address into r10
; add r10, 8 ; add 8 for next array member
; jmp lbl0
;
; out1:
; mov rax, r9 ; return the arg count
; ret
;
; wordtok endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end
It would be really interesting to see how fast it runs on normal text. I'm thinking of something like the Bible or "War and Peace". I'm pretty sure it will run very fast: in-place tokenizing, using a table for delimiters.
Z,
That can be done, but it's a different algorithm. You would probably use a character table with omissions in the table; a "0" entry would function as the delimiter, but this would only delimit single words. Because the algo above uses a single character delimiter, it can handle multiple words and can handle stuff like "long file name.exe" without having to quote it.
wordtok proc                            ; pcmd:QWORD, parr:QWORD, dlmt:QWORD
  ; ---------------------------------------------------------------
  ; Split zero terminated text in place on one delimiter character.
  ;   rcx = text to split (each delimiter byte is replaced by 0)
  ;   rdx = QWORD array that receives the address of every token
  ;   r8  = delimiter character value
  ;   rax = number of tokens stored on return
  ; r9 is used as scratch. The array is not bounds checked.
  ; ---------------------------------------------------------------
    xor eax, eax                        ; token count = 0

  store:
    mov [rdx+rax*8], rcx                ; array[count] = start of this token
    add eax, 1                          ; count it

  scan:
    movzx r9, BYTE PTR [rcx]            ; fetch the current character
    add rcx, 1                          ; and step past it
    test r9, r9                         ; end of text ?
    jz finished
    cmp r9, r8                          ; delimiter ?
    jne scan                            ; no, keep scanning

    mov BYTE PTR [rcx-1], 0             ; cut the token at the delimiter
    jmp store                           ; rcx is the start of the next token

  finished:
    ret
wordtok endp
sinsi,
Looks good, worked OK here.
I have cleaned it up to better suit the no stack frame proc, but it's still 2 instructions longer than your version. The use of the compound instruction "mov [rdx+rax*8],rcx" saves you at least 1 instruction. Both it and your version can be unrolled, but it's probably a case of diminishing returns for what an algo of this type is used for.
NOSTACKFRAME
wordtok proc                            ; psrc:QWORD, parr:QWORD, delm:QWORD
  ; ---------------------------------------------------------------
  ; Tokenise zero terminated text in place on a single delimiter.
  ;   rcx : psrc = pointer to the text to tokenise (modified)
  ;   rdx : parr = pointer to the QWORD array of token addresses
  ;   r8  : delm = delimiter as an ascii value, IE: 44 = ","
  ;   rax : return value = number of tokens stored (always >= 1)
  ;
  ; Empty tokens are kept: "a,,b," yields "a", "", "b", "".
  ; The array is NOT bounds checked. r9 is used as scratch and
  ; rcx / rdx are advanced, no stack is used.
  ; ---------------------------------------------------------------
    xor rax, rax                        ; clear the counter

  pre:
    mov [rdx], rcx                      ; store the start address of this token
    add rdx, 8                          ; add 8 for next array member
    add rax, 1                          ; increment the token counter

  lbl0:
    movzx r9, BYTE PTR [rcx]            ; read first, advance after, so the first
    test r9, r9                         ; byte of every token is examined
    jz bye                              ; exit on terminator
    add rcx, 1                          ; step past the byte just read
    cmp r9, r8                          ; check if char is delimiter
    jne lbl0                            ; loop back if not

    mov BYTE PTR [rcx-1], 0             ; terminate the token at the delimiter
    jmp pre                             ; rcx already addresses the next token

  bye:
    ret
wordtok endp
STACKFRAME
Nice work, Hutch. Want to take a look at the szTrim proc?
This is the version that I added to the library. I used the technique that sinsi designed to make the terminate and array increment faster, and unrolled the 6 instruction loop.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
NOSTACKFRAME
wordtok proc                            ; psrc:QWORD, parr:QWORD, delm:QWORD
  ; ---------------------------------------------------------------
  ; tokenise text based on a supplied delimiter
  ; text is tokenised in place (no copy)
  ;   rcx : psrc = pointer to text to tokenise (modified)
  ;   rdx : parr = address of pointer array
  ;   r8  : dlmt = delimiter as user supplied ascii char as number IE: 44 = ","
  ;   return value = word or text count (always >= 1)
  ; EXAMPLE : invoke wordtok,psrc,parr,44
  ;
  ; Every byte is read BEFORE the pointer is advanced, so the first
  ; character of the text and of each token is examined. Empty input
  ; returns 1 (a single empty token) and empty tokens are kept:
  ; "a,,b," yields "a", "", "b", "".
  ; The pointer array is NOT bounds checked. r11 is scratch, rcx is
  ; advanced, no stack is used.
  ; ---------------------------------------------------------------
    xor rax, rax                        ; clear the counter

  pre:
    mov [rdx+rax*8], rcx                ; store start address of this token
    add rax, 1                          ; increment the token counter

  lbl0:                                 ; scan loop, unrolled by 2
    movzx r11, BYTE PTR [rcx]
    test r11, r11                       ; test for and exit on terminator
    jz bye
    add rcx, 1                          ; step past the byte just read
    cmp r11, r8                         ; check if char is delimiter
    je lbl1                             ; branch if delimiter

    movzx r11, BYTE PTR [rcx]
    test r11, r11                       ; test for and exit on terminator
    jz bye
    add rcx, 1                          ; step past the byte just read
    cmp r11, r8                         ; check if char is delimiter
    jne lbl0                            ; loop back if not

  lbl1:
    mov BYTE PTR [rcx-1], 0             ; terminate the token at the delimiter
    jmp pre                             ; rcx already addresses the next token

  bye:
    ret
wordtok endp
STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤