I wrote this one in PB and it was an easy port to MASM. It differs from the tokeniser in the masm32 library in that it preserves empty lines and does not left-trim tabs and spaces. I could not get a timing on it with a 4.5 meg file and got a 32 ms timing on a 17.5 meg file on my i7, so it's probably fast enough. Its use should be general purpose, but it is capable of identifying the line number from the array index, so it can be used to map a text or source file. It builds at 3k, so it should not blow out your hard disk.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
include \masm32\include\masm32rt.inc
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
comment * -----------------------------------------------------
Build this template with
"CONSOLE ASSEMBLE AND LINK"
----------------------------------------------------- *
line_tokeniser PROTO :DWORD
get_lcnt PROTO :DWORD
.data
caesar \
db "Friends, Romans, countrymen, lend me your ears;",13,10
db "I come to bury Caesar, not to praise him.",13,10
db "The evil that men do lives after them;",13,10
db "The good is oft interred with their bones;",13,10,0
ptxt dd caesar
.code
; Program entry point (named by the "end start" directive at the bottom of
; the listing): run the demo in main, then hold the console open.
start:
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
call main
print chr$(13,10) ; emit a blank line (CR,LF) after the demo output
inkey "Thats all folks, press a key to exit..." ; prompt; presumably blocks until a key is pressed
exit ; masm32 macro; presumably terminates the process
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
main proc
  ; -------------------------------------------------------
  ; Demo driver: tokenise the text addressed by "ptxt" and
  ; print every line through the returned pointer array.
  ; The array is released with "free" before returning.
  ; -------------------------------------------------------
    LOCAL pArray :DWORD                 ; pointer array returned by the tokeniser
    LOCAL nLines :DWORD                 ; line count returned in ECX

    invoke line_tokeniser, ptxt         ; EAX = array address, ECX = line count
    mov pArray, eax
    mov nLines, ecx

    push esi                            ; ESI/EDI must survive for our caller
    push edi

    mov edi, nLines                     ; EDI = lines still to print
    mov esi, pArray                     ; ESI = current slot in the array

    .repeat
      print [esi],13,10                 ; display the line this slot points to
      add esi, 4                        ; advance to the next pointer
      sub edi, 1                        ; one line fewer to go
    .until ZERO?                        ; stop once the counter reaches zero

    pop edi
    pop esi

    free pArray                         ; release memory from tokeniser
    ret
main endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
line_tokeniser proc src:DWORD
  ; ---------------------------------------------------
  ; tokeniser for CRLF delimited text
  ; ---------------------------------------------------
  ; Overwrites every ascii 13 in the source with zero (the
  ; source buffer is modified in place) and writes a pointer
  ; to the start of each line into a newly allocated array
  ; of DWORD pointers. Empty lines are preserved.
  ;
  ; in:   src = address of zero terminated text
  ; out:  EAX = pointer array address (0 if allocation failed)
  ;       ECX = the line count (number of ascii 13 found)
  ;
  ; The array holds (line count + 1) slots: slot 0 is the
  ; start of the text and one further slot is written after
  ; every ascii 13. The final slot therefore addresses
  ; whatever follows the last line break (an empty string
  ; when the text ends with 13,10).
  ;
  ; The array address must be de-allocated using
  ; GlobalFree() or the macro "free".
  ; ---------------------------------------------------
    LOCAL lcnt :DWORD
    LOCAL pMem :DWORD
    LOCAL alen :DWORD

    push src
    call get_lcnt                       ; get the line count
    mov lcnt, eax                       ; store line count in variable

  ; one pointer is written for the start of the text plus one
  ; after each ascii 13, so (lcnt + 1) slots are required.
  ; Allocating only lcnt*4 bytes overruns the block by a DWORD.
    lea eax, [eax*4+4]                  ; pointer array length in bytes
    mov alen, eax                       ; store the array size in alen
    mov pMem, alloc(alen)               ; allocate the pointer array
    cmp pMem, 0
    je nomem                            ; do not write through a null pointer

    mov edx, src                        ; source address in EDX
    mov ecx, pMem                       ; pointer array address in ECX
    mov [ecx], edx                      ; 1st array member = start of the text
    add ecx, 4
    jmp scan                            ; test the 1st byte without skipping it

  nxtchr:
    add edx, 1                          ; ordinary character, step past it
  scan:
    movzx eax, BYTE PTR [edx]           ; zero extend byte into EAX
    test eax, eax                       ; test for zero
    jz done                             ; exit loop on zero
    cmp eax, 13                         ; test for ascii 13
    jne nxtchr                          ; short loop back if not 13

    mov BYTE PTR [edx], 0               ; write terminator at ascii 13 location
    add edx, 1                          ; step over the ascii 13
    cmp BYTE PTR [edx], 10              ; only step over an ascii 10 if it is
    jne @F                              ; really there, a bare 13 before the
    add edx, 1                          ; terminator must not be skipped past
  @@:
    mov [ecx], edx                      ; write the next line start to pointer
    add ecx, 4                          ; increment to next pointer
    jmp scan                            ; re-enter WITHOUT incrementing EDX so
                                        ; the 1st byte of the new line is
                                        ; tested (it is a 13 for an empty line)
  done:
    mov ecx, lcnt                       ; return the line count in ECX
    mov eax, pMem                       ; the array pointer in EAX
    ret

  nomem:
    xor eax, eax                        ; allocation failed, no array
    xor ecx, ecx                        ; and no lines
    ret
line_tokeniser endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
get_lcnt proc src:DWORD
  ; --------------------------------------------------------
  ; Count the ascii 13 bytes in a zero terminated string.
  ;
  ; in:   [esp+4] = source address (no stack frame is built,
  ;                 the caller has switched the prologue off)
  ; out:  EAX     = number of ascii 13 bytes found
  ; uses: ECX, EDX as scratch; the argument is removed by RET 4
  ; --------------------------------------------------------
    mov edx, [esp+4]                    ; EDX = read position
    xor eax, eax                        ; EAX = running count
    jmp scan

  hit:
    add eax, 1                          ; previous byte was ascii 13
  scan:
  ; -----------------------------------------------------
  ; unroll by 4 : three generated copies plus a final one
  ; that carries the loop-back branch
  ; -----------------------------------------------------
    REPEAT 3
      movzx ecx, BYTE PTR [edx]         ; fetch the byte, then step past it
      add edx, 1
      cmp ecx, 13
      je hit
      test ecx, ecx
      jz done                           ; terminator ends the scan
    ENDM

    movzx ecx, BYTE PTR [edx]
    add edx, 1
    cmp ecx, 13
    je hit
    test ecx, ecx
    jnz scan                            ; keep going until the terminator

  done:
    ret 4
get_lcnt endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end start
Little test:
main proc
  ; ----------------------------------------------------------
  ; Test driver: load \Masm32\include\Windows.inc, tokenise it
  ; and print every line through the returned pointer array.
  ; A slot holding a value of 127 or less is treated as a bad
  ; pointer and reported instead of being printed.
  ; ----------------------------------------------------------
    LOCAL pMem     :DWORD               ; pointer array from the tokeniser
    LOCAL lcnt     :DWORD               ; line count from the tokeniser
    LOCAL pContent :DWORD               ; file contents loaded by InputFile

    mov pContent, InputFile("\Masm32\include\Windows.inc")

  ; If the file cannot be loaded the source pointer is zero and
  ; the line counter faults reading address 0, so stop here.
    .if pContent == 0
      print "ERROR: could not load \Masm32\include\Windows.inc", 13, 10
      ret
    .endif

    mov pMem, rv(line_tokeniser, pContent) ; tokenise text
    mov lcnt, ecx                       ; save the line count

    push esi
    push edi

    mov esi, pMem                       ; load array into ESI
    mov edi, lcnt                       ; use EDI as the counter
    test edi, edi
    jz done                             ; no lines: a zero counter would wrap
                                        ; around and walk off the array
  @@:
    lodsd                               ; EAX = next line pointer, ESI += 4
    .if eax>127
      print eax, 13, 10
    .else
      print "ERROR ################", 13, 10
    .endif
    sub edi, 1                          ; dec the counter
    jnz @B                              ; loop back if not zero

  done:
    pop edi
    pop esi

    free pContent                       ; release the file buffer
    free pMem                           ; release memory from tokeniser
    ret
main endp
The line count is correct - the problem is elsewhere. Check what happens with the pointers to empty lines; at first sight, I can't find any problem in your code, it really looks correct, but ...
Speedwise it looks quite OK. Recall (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1172) is over twice as fast, but that one requires SSE2, of course.
The intel strchr use a different method to find a char in a string
page ,132
title strchr - search string for given character
;***
;strchr.asm - search a string for a given character
;
; Copyright (c) Microsoft Corporation. All rights reserved.
;
;Purpose:
; defines strchr() - search a string for a character
;
;*******************************************************************************
.xlist
include cruntime.inc
.list
page
;***
;char *strchr(string, chr) - search a string for a character
;
;Purpose:
; Searches a string for a given character, which may be the
; null character '\0'.
;
; Algorithm:
; char *
; strchr (string, chr)
; char *string, chr;
; {
; while (*string && *string != chr)
; string++;
; if (*string == chr)
; return(string);
; return((char *)0);
; }
;
;Entry:
; char *string - string to search in
; char chr - character to search for
;
;Exit:
; returns pointer to the first occurence of c in string
; returns NULL if chr does not occur in string
;
;Uses:
;
;Exceptions:
;
;*******************************************************************************
CODESEG
; -----------------------------------------------------------------------
; strchr - find the first occurrence of a character in a string (_cdecl)
;
; In:    [esp + 4] = string, [esp + 8] = chr (at the strchr entry point)
; Out:   eax = address of the first matching byte, or 0 if the end of
;        the string is reached first
; Saves: ebx always; edi and esi once the dword loop has been entered
;
; Strategy: compare byte by byte until the pointer is 4-byte aligned,
; then examine one dword per iteration. The dword test can report false
; positives, so a hit is re-checked byte by byte at chr_is_found.
;
; found_bx sits ahead of the procedure and is the exit for a match found
; in the byte loop: edx has already been stepped past the matching byte
; and only ebx is on the stack at that point.
; -----------------------------------------------------------------------
found_bx:
lea eax,[edx - 1]
pop ebx ; restore ebx
ret ; _cdecl return
align 16
public strchr, __from_strstr_to_strchr
strchr proc \
string:ptr byte, \
chr:byte
OPTION PROLOGUE:NONE, EPILOGUE:NONE
.FPO ( 0, 2, 0, 0, 0, 0 )
xor eax,eax
mov al,[esp + 8] ; al = chr (search char)
; secondary entry point; the code below expects eax = 0/0/0/chr here
; (presumably set up by the strstr caller - not visible in this file)
__from_strstr_to_strchr label proc
push ebx ; PRESERVE EBX
mov ebx,eax ; ebx = 0/0/0/chr
shl eax,8 ; eax = 0/0/chr/0
mov edx,[esp + 8] ; edx = buffer (offset is 8 because ebx was just pushed)
test edx,3 ; test if string is aligned on 32 bits
jz short main_loop_start
str_misaligned: ; simple byte loop until string is aligned
mov cl,[edx]
add edx,1
cmp cl,bl
je short found_bx
test cl,cl
jz short retnull_bx
test edx,3 ; now aligned ?
jne short str_misaligned
main_loop_start: ; set all 4 bytes of ebx to [chr]
or ebx,eax ; ebx = 0/0/chr/chr
push edi ; PRESERVE EDI
mov eax,ebx ; eax = 0/0/chr/chr
shl ebx,10h ; ebx = chr/chr/0/0
push esi ; PRESERVE ESI
or ebx,eax ; ebx = all 4 bytes = [chr]
; in the main loop (below), we are looking for chr or for EOS (end of string)
; the same add/xor/mask sequence is run twice in parallel:
;   ecx/edi on (dword xor chr pattern) - a zero byte there means chr matched
;   eax/esi on the raw dword           - a zero byte there means EOS
main_loop:
mov ecx,[edx] ; read dword (4 bytes)
mov edi,7efefeffh ; work with edi & ecx for looking for chr
mov eax,ecx ; eax = dword
mov esi,edi ; work with esi & eax for looking for EOS
xor ecx,ebx ; ecx = dword xor chr/chr/chr/chr
add esi,eax
add edi,ecx
xor ecx,-1
xor eax,-1
xor ecx,edi
xor eax,esi
add edx,4 ; edx now points just past the dword under test
and ecx,81010100h ; test for chr
jnz short chr_is_found ; chr probably has been found
; chr was not found, check for EOS
and eax,81010100h ; is any flag set ??
jz short main_loop ; EOS was not found, go get another dword
and eax,01010100h ; is it in high byte?
jnz short retnull ; no, definitely found EOS, return failure
and esi,80000000h ; check was high byte 0 or 80h
jnz short main_loop ; it just was 80h in high byte, go get
; another dword
retnull:
pop esi
pop edi
retnull_bx:
pop ebx
xor eax,eax
ret ; _cdecl return
; byte-by-byte re-check of the dword at [edx - 4], lowest address first;
; an EOS met before a match means chr does not occur in the string
chr_is_found:
mov eax,[edx - 4] ; let's look one more time on this dword
cmp al,bl ; is chr in byte 0?
je short byte_0
test al,al ; test if low byte is 0
je retnull
cmp ah,bl ; is it byte 1
je short byte_1
test ah,ah ; found EOS ?
je retnull
shr eax,10h ; is it byte 2
cmp al,bl
je short byte_2
test al,al ; if in al some bits were set, bl!=bh
je retnull
cmp ah,bl
je short byte_3
test ah,ah
jz retnull
jmp short main_loop ; neither chr nor EOS found, go get
; another dword
; exits for a match in byte N of the dword: result = edx - (4 - N)
byte_3:
pop esi
pop edi
lea eax,[edx - 1]
pop ebx ; restore ebx
ret ; _cdecl return
byte_2:
lea eax,[edx - 2]
pop esi
pop edi
pop ebx
ret ; _cdecl return
byte_1:
lea eax,[edx - 3]
pop esi
pop edi
pop ebx
ret ; _cdecl return
byte_0:
lea eax,[edx - 4]
pop esi ; restore esi
pop edi ; restore edi
pop ebx ; restore ebx
ret ; _cdecl return
strchr endp
end
Quote from: ToutEnMasm on October 24, 2014, 05:05:06 PM
The intel strchr use a different method to find a char in a string
Oh really? Is it faster? Can you post timings?
@Hutch: GOTCHA!
lea eax, [eax*4+1] ; set pointer array length
...
lbl1:
add edx, 1
lbl1a:
movzx eax, BYTE PTR [edx] ; zero extend byte into EAX
..
jmp lbl1a ; long loop after writing pointer
Not really interested in that.I have my own routines to work with text.Perhaps in another post
Quote from: ToutEnMasm on October 24, 2014, 06:32:36 PM
I have my own routines to work with text.
Interesting. Maybe we can do a speed contest? :icon14:
The basic version with the identical code does not crash, It runs to the end of the source then crashes. I am burdened at the moment because my Win7 does not have a post mortem debugger.
A bit later: the extra label does it; redirecting the output produces an identical file.
Quote
Interesting. Maybe we can do a speed contest?
Ok,
The start point of my routines are here:
http://www.masmforum.com/board/index.php?topic=11061.0 (http://www.masmforum.com/board/index.php?topic=11061.0)
The counter of lines count the word 13,10 not only 13.
An html file is a text file and use 10 and 13,10
The SSE2 instructions are in use and it must be difficult to find faster.
I have made more codes using that but all comments are in french,some of them are lost in the forum in english.
Here is try 2: I unrolled part of the tokeniser and reduced the unroll in the line counter, and it runs at about 630 meg/sec as 486-compatible code. My test piece was a 315 meg text file and it kept timing at just under 500 ms, so its speed is OK for 486 code. You will need to supply your own test text file and insert the correct name in the example.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
include \masm32\include\masm32rt.inc
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
comment * -----------------------------------------------------
Build this template with
"CONSOLE ASSEMBLE AND LINK"
----------------------------------------------------- *
line_tokeniser PROTO :DWORD
get_lcnt PROTO :DWORD
.data
caesar \
db "Friends, Romans, countrymen, lend me your ears;",13,10
db "I come to bury Caesar, not to praise him.",13,10
db "The evil that men do lives after them;",13,10
db "The good is oft interred with their bones;",13,10,0
ptxt dd caesar
.code
; Program entry point (named by the "end start" directive at the bottom of
; the listing): run the benchmark in main, then hold the console open.
start:
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
call main
print chr$(13,10) ; emit a blank line (CR,LF) after the benchmark output
inkey "Thats all folks, press a key to exit..." ; prompt; presumably blocks until a key is pressed
exit ; masm32 macro; presumably terminates the process
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
main proc
  ; -----------------------------------------------------------
  ; Benchmark driver: load "big2.txt", time one call of
  ; line_tokeniser with GetTickCount and print the elapsed ms.
  ;
  ; Define SHOW_LINES (e.g. "SHOW_LINES equ 1" above this proc)
  ; to also display the file contents line by line.
  ; Don't do it on a BIG file or it will never finish. :)
  ;
  ; Both buffers are released on every path that allocated them.
  ; -----------------------------------------------------------
    LOCAL hMem :DWORD                   ; file contents loaded by InputFile
    LOCAL pMem :DWORD                   ; pointer array from the tokeniser
    LOCAL lcnt :DWORD                   ; line count from the tokeniser

    mov hMem, InputFile("big2.txt")     ; 315 meg test file

  ; a failed load leaves a zero source pointer, which the line
  ; counter would fault on, so report it and stop
    .if hMem == 0
      print "Cannot load big2.txt",13,10
      ret
    .endif

  ; ------------------
  ; time the tokeniser
  ; ------------------
    fn GetTickCount
    push eax                            ; keep the start time on the stack
    mov pMem, rv(line_tokeniser,hMem)   ; tokenise text
    mov lcnt, ecx                       ; save the line count
    fn GetTickCount
    pop ecx                             ; ECX = start time
    sub eax, ecx                        ; EAX = elapsed milliseconds

    print str$(eax)," ms",13,10

  IFDEF SHOW_LINES
    cmp lcnt, 0
    je nolines                          ; a zero counter would wrap around

    push esi
    push edi
    mov esi, pMem                       ; load array into ESI
    mov edi, lcnt                       ; use EDI as the counter
  @@:
    print [esi],13,10                   ; display each line of text
    add esi, 4                          ; increment to next pointer
    sub edi, 1                          ; dec the counter
    jnz @B                              ; loop back if not zero
    pop edi
    pop esi

  nolines:
  ENDIF

    free hMem                           ; release the file buffer
    free pMem                           ; release memory from tokeniser
    ret
main endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
line_tokeniser proc src:DWORD
  ; ----------------------------------------------------
  ; tokeniser for CRLF delimited text
  ; ----------------------------------------------------
  ; Overwrites every ascii 13 in the source with zero (the
  ; source buffer is modified in place) and writes a pointer
  ; to the start of each line into a newly allocated array
  ; of DWORD pointers. Empty lines are preserved.
  ;
  ; in:   src = address of zero terminated text
  ; out:  EAX = pointer array address (0 if allocation failed)
  ;       ECX = the line count (number of ascii 13 found)
  ;
  ; The array holds (line count + 1) slots: slot 0 is the
  ; start of the text and one further slot is written after
  ; every ascii 13. The final slot therefore addresses
  ; whatever follows the last line break (an empty string
  ; when the text ends with 13,10).
  ;
  ; The array address must be de-allocated using
  ; GlobalFree() or the macro "free".
  ; ----------------------------------------------------
    LOCAL lcnt :DWORD
    LOCAL pMem :DWORD
    LOCAL alen :DWORD

    push src
    call get_lcnt                       ; get the line count
    mov lcnt, eax                       ; store line count in variable

  ; one pointer is written for the start of the text plus one
  ; after each ascii 13, so (lcnt + 1) slots are required.
  ; Allocating only lcnt*4 bytes overruns the block by a DWORD.
    lea eax, [eax*4+4]                  ; pointer array length in bytes
    mov alen, eax                       ; store the array size in alen
    mov pMem, alloc(alen)               ; allocate the pointer array
    cmp pMem, 0
    je nomem                            ; do not write through a null pointer

    mov edx, src                        ; source address in EDX
    mov ecx, pMem                       ; pointer array address in ECX
    mov [ecx], edx                      ; 1st array member = start of the text
    add ecx, 4
    sub edx, 1                          ; compensate for the increment at lbl1

  ; -------------------------------------
  ; scan loop, unrolled by 2
  ; lbl1 = advance one byte, then test it
  ; nxt  = test the current byte as it is
  ; -------------------------------------
  lbl1:
    add edx, 1
  nxt:
    movzx eax, BYTE PTR [edx]           ; zero extend byte into EAX
    test eax, eax                       ; test for zero
    jz lbl2                             ; exit loop on zero
    cmp eax, 13                         ; test for ascii 13
    je wrtptr                           ; line break found, record next line

    add edx, 1
    movzx eax, BYTE PTR [edx]           ; zero extend byte into EAX
    test eax, eax                       ; test for zero
    jz lbl2                             ; exit loop on zero
    cmp eax, 13                         ; test for ascii 13
    jne lbl1                            ; short loop back if not 13

  wrtptr:
    mov BYTE PTR [edx], 0               ; write terminator at ascii 13 location
    add edx, 1                          ; step over the ascii 13
    cmp BYTE PTR [edx], 10              ; only step over an ascii 10 if it is
    jne @F                              ; really there, a bare 13 before the
    add edx, 1                          ; terminator must not be skipped past
  @@:
    mov [ecx], edx                      ; write the next line start to pointer
    add ecx, 4                          ; increment to next pointer
    jmp nxt                             ; re-test at EDX without incrementing
                                        ; so an empty line's 13 is not missed
  lbl2:
    mov ecx, lcnt                       ; return the line count in ECX
    mov eax, pMem                       ; the array pointer in EAX
    ret

  nomem:
    xor eax, eax                        ; allocation failed, no array
    xor ecx, ecx                        ; and no lines
    ret
line_tokeniser endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
get_lcnt proc src:DWORD
  ; --------------------------------------------------------
  ; Count the ascii 13 bytes in a zero terminated string.
  ;
  ; in:   [esp+4] = source address (no stack frame is built,
  ;                 the caller has switched the prologue off)
  ; out:  EAX     = number of ascii 13 bytes found
  ; uses: ECX, EDX as scratch; the argument is removed by RET 4
  ; --------------------------------------------------------
    mov edx, [esp+4]                    ; EDX = address of the current byte pair
    xor eax, eax                        ; EAX = running count
    jmp scan

  hit_second:                           ; ascii 13 was the 2nd byte of the pair
    add edx, 1
  hit_first:                            ; ascii 13 was the 1st byte of the pair
    add edx, 1                          ; EDX = byte following the ascii 13
    add eax, 1                          ; count it
  scan:
  ; --------------------------------------------------
  ; unroll by 2 : two bytes examined per pointer update
  ; --------------------------------------------------
    movzx ecx, BYTE PTR [edx]
    cmp ecx, 13
    je hit_first
    test ecx, ecx
    jz done                             ; terminator, 2nd byte is never read

    movzx ecx, BYTE PTR [edx+1]
    cmp ecx, 13
    je hit_second
    add edx, 2                          ; move on to the next pair
    test ecx, ecx
    jnz scan                            ; keep going until the terminator

  done:
    ret 4
get_lcnt endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end start
Allright, folks, here is the speed test, including CompteurLines - although I have a suspicion that it does really just count the lines:
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX)
Hutch Yves (http://www.masmforum.com/board/index.php?topic=11061.0) Recall (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1172)
2177 902 719 us
2184 1054 874 us
2136 894 1052 us
2115 943 895 us
2061 943 844 us
2094 933 856 us
3039 1015 1109 us
2129 945 853 us
2108 945 844 us
2128 954 858 us
3017 977 853 us
1939 865 991 us
2105 875 888 us
Results Hutch:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; It is not recomended that WINDOWS.INC be modified but if you need to add
; equates or structures to WINDOWS.INC, do not write anything after the
; following conditional assembly directive that display the duplicate
; warning or it will be duplicated if the file is included more than once.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
ELSE
echo ------------------------------------------
echo WARNING Duplicate include file windows.inc
echo ------------------------------------------
ENDIF
Thats all folks, press a key to exit...
Jochen,
the application doesn't work under Windows 7-64.
Gunther
Quote from: Gunther on October 25, 2014, 11:25:29 AMthe application doesn't work under Windows 7-64.
Interesting - it's written under Windows 7-64 ::)
Error messages? Where does it stop, if it starts at all?
Anybody else having problems?
No problems here
AMD A10-7850K APU with Radeon(TM) R7 Graphics (MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX)
Windows 7 x64 Ultimate
Hutch Yves Recall
2731 1530 1389 us
2473 1437 1684 us
2442 1282 1626 us
2329 1525 1600 us
2608 1778 1682 us
2438 1384 1558 us
2391 1357 1567 us
2390 1393 1538 us
2410 1349 1456 us
2329 1660 1821 us
2413 1374 1563 us
2403 1393 1523 us
2404 1339 1774 us
Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz (MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX)
Windows 8.1 x64 Pro
Hutch Yves Recall
1839 919 674 us
1716 829 776 us
1684 768 734 us
2048 775 789 us
1621 735 727 us
1644 734 699 us
1656 884 746 us
1603 736 695 us
1678 770 723 us
1647 746 704 us
1799 804 745 us
1623 745 705 us
1597 806 694 us
It worked OK on my Win7 64 so it may be a security setting. What I would be interested in is a long linear test rather than cycling through a much smaller file as you end up with cache thrashing rather than an accurate speed reading. I used a 315 meg text file to benchmark against so that cache thrashing was not a factor and it was running at about 630 meg/sec on my i7.
There is a very crude technique where you simply allocate a massive pointer array buffer and do not use the line count code. This will certainly up the speed, as it is only a single pass rather than a double pass, but to be safe you would have to cater for the full file length being nothing but 13,10 line delimiters, so it is very memory hungry.
Windows XP,access violation bad adress
Quote
(9b4.f5c): Access violation - code c0000005 (!!! second chance !!!)
eax=00000000 ebx=7ffd9000 ecx=00000000 edx=00000000 esi=7c91d96e edi=0099b678
eip=004014ae esp=0012ff8c ebp=0012ffa0 iopl=0 nv up ei pl zr ac pe cy
cs=001b ss=0023 ds=0023 es=0023 fs=003b gs=0000 efl=00000257
*** WARNING: Unable to verify checksum for C:\DOCUME~1\Luce\LOCALS~1\Temp\Répertoire temporaire 1 pour LineTokenisers (1).zip\LineTokeniserHutch.exe
*** ERROR: Module load completed but symbols could not be loaded for C:\DOCUME~1\Luce\LOCALS~1\Temp\Répertoire temporaire 1 pour LineTokenisers (1).zip\LineTokeniserHutch.exe
LineTokeniserHutch+0x14ae:
004014ae 0fb60a movzx ecx,byte ptr [edx] ds:0023:00000000=??
I have posted the text routines used in the translator (sufficiently improved).
They are a full solution for working with text and line numbers.
http://masm32.com/board/index.php?topic=3715.msg39270#msg39270 (http://masm32.com/board/index.php?topic=3715.msg39270#msg39270)
Win 8.1 64bit
Intel(R) Core(TM) i7-4930K CPU @ 3.40GHz (MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX)
Hutch Yves Recall
4155 847 828 us
1823 786 737 us
1772 756 769 us
1796 776 783 us
1767 782 733 us
1762 762 760 us
1764 775 728 us
1770 773 760 us
1771 774 734 us
1772 767 731 us
1788 773 734 us
1778 779 728 us
1794 804 740 us
Thanks, Marinus :icon14:
Quote from: ToutEnMasm on October 25, 2014, 03:56:30 PM
Windows XP,access violation bad adress
004014ae 0fb60a movzx ecx,byte ptr [edx] ds:0023:00000000=??
@Hutch:It's line 194ff:
add edx, 1
movzx ecx, BYTE PTR [edx]
cmp ecx, 13
... but no worries,
your code is ok. The dump says edx is zero, i.e. no source, and Yves launched it from a temp C: folder, while his Windows.inc is probably on another drive - see InputFile("\Masm32\include\Windows.inc"). My fault, my code should shout foul if the file isn't loaded.
@Yves:Re text routines: Do you seriously expect that somebody starts searching for the right one in that archive? Indicate clearly which is the line tokeniser (if you have one), or post it here.
New version attached - it will complain if there is no windows.inc in the current drive. Note that "Yves" represents the algo
CompteurLignes, which performs only what Hutch's subroutine
get_lcnt does, i.e. no array returned.
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX)
Testing 47 MB in tmp_test.txt
Hutch Yves Recall
117 53 45 ms
110 52 49 ms
109 52 49 ms
110 52 52 ms
115 52 49 ms
110 51 49 ms
110 51 49 ms
109 52 49 ms
110 52 51 ms
111 51 49 ms
111 52 49 ms
110 51 50 ms
109 52 49 ms
Results Recall:
echo ------------------------------------------
echo WARNING Duplicate include file windows.inc
echo ------------------------------------------
ENDIF
Results Hutch:
echo ------------------------------------------
echo WARNING Duplicate include file windows.inc
echo ------------------------------------------
ENDIF
(No results for Yves - line count only)
Thats all folks, press a key to exit...
Intel(R) Core(TM) i7-4930K CPU @ 3.40GHz (MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, S
SE4.2, AVX)
Testing 0 MB in tmp_test.txt
Hutch Yves Recall
113 44 49 ms
99 47 46 ms
96 45 45 ms
95 47 46 ms
96 45 46 ms
96 45 46 ms
96 46 46 ms
96 45 46 ms
96 47 48 ms
98 45 46 ms
96 45 46 ms
96 45 46 ms
96 45 46 ms
Results Recall:
echo ------------------------------------------
echo WARNING Duplicate include file windows.inc
echo ------------------------------------------
ENDIF
Results Hutch:
echo ------------------------------------------
echo WARNING Duplicate include file windows.inc
echo ------------------------------------------
ENDIF
(No results for Yves - line count only)
Thats all folks, press a key to exit...
Better result in the disk with masm32
Quote
Intel(R) Celeron(R) CPU 2.80GHz (MMX, SSE, SSE2, SSE3)
Hutch Yves Recall
85543 3136 3059 us
5276 3099 5293 us
5623 3126 3291 us
5328 3098 3514 us
5331 3057 3272 us
5571 3107 3231 us
5237 3141 3294 us
5314 3108 3365 us
5503 3060 3260 us
5250 3091 3228 us
5231 3168 3260 us
5274 3096 3300 us
5298 3048 3688 us
Results Hutch:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; It is not recomended that WINDOWS.INC be modified but if you need to add
; equates or structures to WINDOWS.INC, do not write anything after the
; following conditional assembly directive that display the duplicate
; warning or it will be duplicated if the file is included more than once.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
ELSE
echo ------------------------------------------
echo WARNING Duplicate include file windows.inc
echo ------------------------------------------
ENDIF
Thats all folks, press a key to exit...
The routines I have posted are not for test here (The line counter is in the test).
Jochen,
the new version works fine under Windows7-64. Here are the results:
Quote
Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz (MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SS
E4.2, AVX)
Testing 47 MB in tmp_test.txt
Hutch Yves Recall
88 41 37 ms
92 43 43 ms
85 39 40 ms
84 39 40 ms
84 39 40 ms
84 39 40 ms
84 39 40 ms
84 39 39 ms
84 39 40 ms
84 39 40 ms
84 39 40 ms
85 39 40 ms
84 39 40 ms
Results Recall:
echo ------------------------------------------
echo WARNING Duplicate include file windows.inc
echo ------------------------------------------
ENDIF
Results Hutch:
echo ------------------------------------------
echo WARNING Duplicate include file windows.inc
echo ------------------------------------------
ENDIF
(No results for Yves - line count only)
Thats all folks, press a key to exit...
Gunther
Thanks, Marinus & Gunther :t
Quote from: ToutEnMasm on October 25, 2014, 10:53:24 PMThe routines I have posted are not for test here
:P
For JJ,I see that you have not read the post,it's usefull sometimes:
Quote
The text routines have been tested in the translator since 2010
Which text routines? Which translator? And why should anybody wish to dig (fouiller) into your archives, which obviously do not contain a tokeniser (otherwise you would have proudly posted it here)?
Sorry that google don't know what is a tokeniser.
What do the routines i have posted is an array with:
A pointer on each line
The size of each line
This allow to get a line by his number,without any scrutation.
Quote from: ToutEnMasm on October 26, 2014, 01:04:15 AM
A pointer on each line
The size of each line
That sounds interesting. Now, if you reveal the name of the file and the name of the algo, I might give it a try.
It is here :Texte_Routines\lib\Texte_scrutTxt.asm
http://masm32.com/board/index.php?topic=3715.msg39270#msg39270 (http://masm32.com/board/index.php?topic=3715.msg39270#msg39270)
In the source file ,you find "Scrutationdeux" it save each pointer and size after the count of lines had been made and memory allocated.
Quote from: ToutEnMasm on October 26, 2014, 04:49:14 AM
It is here :Texte_Routines\lib\Texte_scrutTxt.asm
http://masm32.com/board/index.php?topic=3715.msg39270#msg39270 (http://masm32.com/board/index.php?topic=3715.msg39270#msg39270)
In the source file ,you find "Scrutationdeux" it save each pointer and size after the count of lines had been made and memory allocated.
found it. :t
Gunther
Quote from: ToutEnMasm on October 26, 2014, 04:49:14 AM
In the source file ,you find "Scrutationdeux"
Line 387:
sub eax,esi
.if eax == 0
jmp FindeMemoriseLigne
.endif
Ever heard of dead code and conditional jumps?
sub eax,esi
je FindeMemoriseLigne
It doesn't assemble, and since the whole thing looks a bit slow, I guess I'll just switch on the TV and have a nice evening :biggrin:
> I guess I'll just switch on the TV and have a nice evening
Sounds like you are doing it tough. :P
Use a batch is too much difficult :lol:
But as I said,they don't need to be tested.
deleted
Quote from: nidud on October 26, 2014, 10:51:19 PM
how many lines in the buffer?
An important test indeed.
include \masm32\MasmBasic\MasmBasic.inc ; download (http://masm32.com/board/index.php?topic=94.0)
; Single-line test: the string has no CR/LF at all, only the zero terminator.
.data
justOne db "This is one line",0
Init
; StringToArray (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1130) uses Recall (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1172) code but with a buffer instead of a file name
StringToArray offset justOne, My$()
; presumably eax holds the number of lines found at this point - the
; posted output reads "1 lines"
Print Str$("%i lines\n", eax)
Inkey "Line 0: [", My$(0), "]"
Exit
end start
Output:
1 lines
Line 0: [This is one line]
I have this problem with the style of testing being done, it is incomplete, unbuildable and unverifiable. The whole point of a code laboratory is to be able to compare and test different algorithms and with this style of testing this is not being done. I wonder what happened to posted testable algorithms ?
nidud,
The result with a string,
one db "one line",0
with the get_lcnt algo should be 0 as there are no ascii 13 characters to count in a single line that is not 13,10 delimited.
Quote from: hutch-- on November 05, 2014, 01:32:19 PMincomplete, unbuildable and unverifiable
This is indeed a problem. Snippets without headers etc are fine to have a quick look, but the complete buildable example should then be attached. Most of the time people do post buildable examples, but others post code that requires fumbling with environment variables, which is a bad habit because it may interfere with the setup of one's machine. There is a reason why practically all Masm32 code has
include \Masm32\include\somelib.inc on top, and not
include somelib.inc or, worse,
include C:\Masm32\include\somelib.inc
I must say that in this respect Masm32 is an excellent library:
All 300+ examples in \Masm32\examples build without a single error message. In my experience, in contrast 95% of all C/C++ examples on the web throw cryptic error messages when you drop them into the leading free C/C++ software, VC Express, and you spend a lot of time to decipher what this behemoth wants from you, and where to find the missing header files :(
Quote from: hutch-- on November 05, 2014, 01:32:19 PM
The result with a string,
one db "one line",0
with the get_lcnt algo should be 0 as there are no ascii 13 characters to count in a single line that is not 13,10 delimited.
IMHO, for the purpose of building an array, both
one db "one line",0
and
one db "one line", 13, 10, 0
should return one line. Other views? Practical examples supporting other views?
There is a very simple way to solve that problem if you wish to include lines that are not 13,10 delimited or a variation of either. As you almost exclusively know the length of the text, read the last byte before the terminator and if its not 13 or 10, write one there. You can set a flag for whether there is a trailing 13,10 or not, and if it matters, you can trim the tail end off the string by writing zero to the last line to truncate it back before the added byte.
Yes, this is indeed the strategy of Recall() (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1172). And qEditor also recognises both versions, of course - see attachment for a test. Textfiles with no CrLf at the end are relatively rare, but they do occur in real life, and not taking that into account may lead to bugs that are really difficult to chase.
What was the purpose of the 2 text files when most know the difference between a last line that is zero terminated and a last line the is 13,10,0 terminated ?
Quote from: hutch-- on November 05, 2014, 03:10:32 PM
What was the purpose of the 2 text files when most know the difference between a last line that is zero terminated and a last line the is 13,10,0 terminated ?
Just to encourage some testing. Btw I had swapped the names, attached the right ones, together with a demo:
Testing 0 MB in TestLastLineZero.txt
...
Lines found:
10 Hutch
10 Yves
11 JJ/Recall
Results Recall:
Line 7
Line 8
Line 9
Last line
----------
Results Hutch:
Line 7
Line 8
Line 9
----------
What is the desired result, for all practical purposes? qEditor.exe does show "Last line".
QE does not use a tokeniser at all, it simply streams file data into the rich edit control.
deleted
nidud,
> In my view a file begins at offset 0 and ends at EOF, and its size are measured between these two.
I have been looking at files in both text editors and in hex editors for a mighty long time and they either end at the OS stored byte count or with an ascii zero. I have not seen an ascii 26 (eof) terminating any file I have ever viewed.
deleted
Quote from: hutch-- on November 11, 2014, 02:39:59 AM
I have been looking at files in both text editors and in hex editors for a mighty long time and they either end at the OS stored byte count or with an ascii zero. I have not seen an ascii 26 (eof) terminating any file I have ever viewed.
Hi,
FWIW I have seen quite a few. Some text files from CPM systems.
And some from what appeared to be text editors ported from CPM.
And one strange OS/2 editor.
Quote from: nidud on November 11, 2014, 04:50:50 AM
I don't think the CTRL-Z character is actually used any more.
If you go to a command line and copy to a file from the CON:
device, a Ctrl-Z is used to terminate input. Something like the
following copied from a command prompt.
C:\>COPY CON FileName
test text
line 2^Z
1 file(s) copied.
C:\>TYPE FileName
test text
line 2
C:\>
Cheers,
Steve N.
deleted
I can give you an example of text files with data beyond the text
when you save a registry (.REG) file from RegEdit, you will see binary data following an EOF (1Ah)
Never seen an embedded null (00) in a text file.
Creative Labs (the Sound Blaster people) had an audio file .VOC which started with a header
sig db 'Creative Voice File',1ah
Legend has it that people would use 'type' to try and listen to the sound.
Who knows, in the good old dos5 days commands sometimes did weird things...
you could use
copy/b file.txt con
similar to type, but treats it as a binary file
not that it will create sounds - lol