News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

One pass hash table replace of multiple words

Started by hutch--, January 27, 2023, 12:02:37 PM

Previous topic - Next topic

hutch--

You will need to download the two libraries from this web site to build this code if you have not already got them. This example has very small files to keep the example file size down but it is capable of handling large counts of word replacement pairs. It can be done from an external file as per the example or the hash table can be loaded directly in an executable.

Its virtue is it can handle large counts of word replacement pairs in a single pass through the input file that is being modified.

Note that the example does not check that its input files exist or that their contents are valid; this is to keep the example simple. Run "test.bat" to see how it works.

' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    #compile exe "MulRep.exe"
    #compiler PBCC

    #include "\basic\include\win32api.inc"

    #link "hashcode.lib"
    #link "pbgplib.lib

    GLOBAL tbl1() as STRING                             ' GLOBAL scope
    GLOBAL tbl2() as STRING
    GLOBAL indx   as DWORD
    GLOBAL retn   as DWORD

' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

FUNCTION PBmain as LONG

    ' Replace multiple words in a file in a single pass.
    '
    ' Command line:  MulRep wordpairfile inputfile outputfile
    '
    '   wordpairfile  text file, one "word replacement" pair per line
    '   inputfile     file whose contents are to be modified
    '   outputfile    file that receives the modified text
    '
    ' The word pairs are loaded into the hash table, the input file is read
    ' into a single string, hash_replace performs the substitution and the
    ' result is written to the output file.

    LOCAL flen as DWORD                                 ' byte length of the input file

    cmd$ = command$                                     ' get the command line
    cmd$ = block_monospace$(cmd$)                       ' clean up command line
    wpStr$ = parse$(cmd$,chr$(32),1)                    ' word pair file name
    ipStr$ = parse$(cmd$,chr$(32),2)                    ' input file name to modify
    opStr$ = parse$(cmd$,chr$(32),3)                    ' output file name for result

    If len(opStr$) = 0 Then                             ' fewer than three arguments were supplied
      StdOut "Usage: MulRep wordpairfile inputfile outputfile"
      Exit FUNCTION
    End If

    indx = Hash_Init(1000,tbl1(),tbl2())                ' initialise the hash table

    Open wpStr$ for Input as #1                         ' open the word pair file
    Do Until EOF(1)                                     ' test BEFORE reading so an empty file is safe
      Line Input #1, a$                                 ' get the line of text
      a$ = block_monospace$(a$)                         ' clean up each line
      wd1$ = parse$(a$,chr$(32),1)                      ' get the 1st word
      wd2$ = parse$(a$,chr$(32),2)                      ' get the 2nd word
      If len(wd1$) > 0 Then                             ' skip blank lines, they have no word to replace
        retn = hash_write(wd1$,wd2$,tbl1(),tbl2(),indx) ' load pair into hash table
      End If
    Loop
    Close #1

    Open ipStr$ for Binary as #1                        ' load input file in one read
      flen = lof(1)                                     ' length of the file in bytes
      Get$ #1,flen,inpt$                                ' read the whole file into inpt$
    Close #1

    reslt$ = hash_replace(inpt$,tbl1(),tbl2(),indx)     ' replace all of the occurring words in one pass

    Open opStr$ for Output as #1                        ' write the output to disk
      Print #1, reslt$;                                 ' trailing semicolon, no extra CRLF is appended
    Close #1

End FUNCTION

' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

hutch--

This is a variant of the above. This version uses a fixed "wordpair.txt" file that the app locates by its own path, so the app can always find the word pair file. This requires the word pair file to always be in the same directory as the app, but in exchange the app can be called from anywhere on the computer.

It is a reasonably specialised app that will replace single word tags with other text set up in the word pair file. This can be single words or phrases in a single line.

To build it, you will need the two library files that are at the top of this sub forum.


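Each line of the word pair file is a single-word tag followed by its replacement text, for example:

tag1 Any text you like after the tag name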

Run a source file through the app and every instance of "tag1" will be replaced with the trailing text in the word pair file.

In the zip file is a word pair file that contains "The charge of the light brigade" with a tag for each line.

Feed the text file "textfile.txt" which is in the format,

tag1
tag2
tag3
tag4
tag5
tag6
tag7
tag8
tag9
tag10
etc ...

and the output file will display the Charge Of The Light Brigade.

The original purpose of the algos was for equate replacement in a scripting engine and it needed to be fast and work in a single pass.


' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

' This version requires a text file of word pairs "wordpair.txt" to be present in the same directory
' as this executable.

' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤


    #compile exe "mrep.exe"
    #compiler PBCC

    #include "\basic\include\win32api.inc"

    #link "hashcode.lib"
    #link "pbgplib.lib

    GLOBAL tbl1() as STRING                             ' GLOBAL scope
    GLOBAL tbl2() as STRING
    GLOBAL indx   as DWORD
    GLOBAL retn   as DWORD

' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

FUNCTION PBmain as LONG

    ' Replace single-word tags in a file with the text assigned to them in
    ' "wordpair.txt", which must sit in the same directory as this executable.
    '
    ' Command line:  mrep inputfile outputfile
    '
    ' Each line of wordpair.txt is "tag replacement text ...": the first
    ' space-delimited word is the tag, the remainder of the line is the text
    ' that replaces it. The elapsed time and output size are reported on the
    ' console when the output file has been written.

    LOCAL lcnt as DWORD                                 ' line count
    LOCAL bcnt as DWORD                                 ' byte count
    LOCAL tcnt as DWORD                                 ' tick count

    tcnt = GetTickCount                                 ' start the timing

    apath$ = app_path                                   ' get this app path
    tfile$ = apath$+"wordpair.txt"                      ' append text file name to it

    If exist(tfile$) = 0 Then                           ' test if the word pair file is present
      conout "Cannot find wordpair.txt"                 ' output error message to the console
      Exit FUNCTION
    End If

    cmd$ = command$                                     ' get the command line
    cmd$ = block_monospace$(cmd$)                       ' clean up command line

    ipStr$ = parse$(cmd$,chr$(32),1)                    ' input file name to modify
    opStr$ = parse$(cmd$,chr$(32),2)                    ' output file name for result

    If len(opStr$) = 0 Then                             ' fewer than two arguments were supplied
      conout "Usage: mrep inputfile outputfile"
      Exit FUNCTION
    End If

    If exist(ipStr$) = 0 Then                           ' same presence test as used for wordpair.txt
      conout "Cannot find "+ipStr$
      Exit FUNCTION
    End If

    txt$ = load_file(tfile$)                            ' get a copy of the word pair file
    lcnt = tally(txt$,chr$(13,10)) + 1                  ' CRLF count + 1, the last line may have no CRLF
    txt$ = ""                                           ' reduce memory usage

    indx = Hash_Init(lcnt,tbl1(),tbl2())                ' initialise the hash table, sized from the line count

    Open tfile$ for Input as #1                         ' open the word pair file
    Do Until EOF(1)                                     ' test BEFORE reading so an empty file is safe
      Line Input #1, a$                                 ' get the line of text
      a$   = ltrim$(a$)                                 ' trim any leading spaces
      wd1$ = parse$(a$,chr$(32),1)                      ' get the 1st word
      ' Take everything after the tag and its single separating space.
      ' remove$() is not used here because it strips EVERY occurrence of the
      ' tag from the line, which damages replacement text containing the tag.
      wd2$ = mid$(a$,len(wd1$)+2)                       ' get the rest of the string minus 1st word
      If len(wd1$) > 0 Then                             ' skip blank lines, they have no tag
        retn = hash_write(wd1$,wd2$,tbl1(),tbl2(),indx) ' load word pair into hash table
      End If
    Loop
    Close #1

    inpt$ = load_file(ipStr$)                           ' load the input file
    reslt$ = hash_replace(inpt$,tbl1(),tbl2(),indx)     ' replace all occurring words in one pass
    bcnt = save_file(opStr$,reslt$)                     ' save the output file

    tcnt = GetTickCount - tcnt                          ' calculate timing in milliseconds
    conout opStr$+" written to disk at"+str$(bcnt)+_
           " bytes in"+str$(tcnt)+" ms"+chr$(13,10)

End FUNCTION

' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

#IF 0  ' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    ----------------------------------------------------------------
    This is the character table used by the "hash_replace" algorithm
    ----------------------------------------------------------------

    ! db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    ! db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0     ' 31
    ! db 0,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0     ' 47
    ! db 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1     ' 63   ' numbers
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1     ' 79   ' upper case
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1     ' 95
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1     ' 111  ' lower case
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1     ' 127
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
    ! db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1

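    ' characters     ! # $ % & ? @ _
    ' numbers        0123456789
    ' upper case     ABCDEFGHIJKLMNOPQRSTUVWXYZ
    ' lower case     abcdefghijklmnopqrstuvwxyz
    ' high ansi set  128 to 255
    ' NOTE: as listed, the table also flags 91 to 94 ( [ \ ] ^ ), 96 ( ` )
    '       and 123 to 127 ( { | } ~ and DEL ), which are not named above.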

#ENDIF ' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤