News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

Create Unique Word List

Started by hutch--, December 30, 2022, 08:58:57 AM

Previous topic - Next topic

hutch--

The attached app is a prototype I need for a task I have in mind, but it's also useful as a general purpose tool for processing word lists supplied as one word per line, with CRLF-delimited lines. You will need to download the two libraries I have posted on this site (saves duplication) and it will then build OK.

It uses a hash table to test for duplicates, and in this case the comparison is case sensitive. The setup time is a bit slow because it creates a basic dynamic string array for the hash table, but it has massive capacity (if you have enough memory) and once the array is allocated, it is genuinely fast.

Usage is simple: pass the input file containing the word list, followed by the name of the output file.

uwords infile.ext outfile.ext

The "sauce".  :tongue:

' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    #compile exe "uWords.exe"
    #compiler PBWIN

    #link "hashcode.lib"                                    ' link the two (2) libraries
    #link "pbgplib.lib"

' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

FUNCTION PBmain as LONG

    ' -----------------------------------------------------------------------
    ' Entry point for:  uwords infile.ext outfile.ext
    '
    ' Reads infile one line at a time, passes each line to hash_write and
    ' writes every line that hash_write reports as new (return value 0) to
    ' outfile.  When finished, a message box shows how many unique lines
    ' were written.
    '
    ' Exits early with a message box if the source file does not exist or
    ' if no target file name was given on the command line.
    ' -----------------------------------------------------------------------

    LOCAL hash1() as STRING                                 ' tables must be used in 1 2 order
    LOCAL hash2() as STRING
    LOCAL indx    as DWORD                                  ' value returned by Hash_Init, handed to hash_write
    LOCAL rtn     as DWORD                                  ' hash_write result
    LOCAL cnt     as DWORD                                  ' number of unique words written
    LOCAL wcnt    as DWORD                                  ' number of lines in the source file

    cmd$ = command$                                         ' get the command line
    cmd$ = block_monospace$(cmd$)                           ' clean up the command line

    ifile$ = parse$(cmd$," ",1)                             ' get source file name

    If exist(ifile$) = 0 Then
      msgbox "File cannot be found",0,"No source file"
      Exit FUNCTION
    End If

    ofile$ = parse$(cmd$," ",2)                             ' get the target file name

    If ofile$ = "" Then
      msgbox "No target specified",0,"Missing target name"
      Exit FUNCTION
    End If

    txt$ = load_file(ifile$)                                ' load source file
    wcnt = tally(txt$,chr$(13,10))                          ' count CRLF terminated lines

    ' A final line with no trailing CRLF is still a line, tally() does not
    ' see it so count it here. Without this a one word file with no line
    ' terminator would report zero lines.
    If len(txt$) > 0 Then
      If right$(txt$,2) <> chr$(13,10) Then
        Incr wcnt
      End If
    End If

    txt$ = ""                                               ' deallocate string

    ' NOTE(review): Hash_Init is in hashcode.lib, presumably it sizes the
    ' two tables from the line count - confirm against the library source.
    indx = Hash_Init(wcnt,hash1(),hash2())

    Open ifile$ for Input as #1                             ' open both files for file IO
    Open ofile$ for Output as #2

    cnt = 0                                                 ' zero the word counter

    ' Test for end of file BEFORE each read so that an empty source file
    ' is never read past its end and never produces a blank "word".
    Do While IsFalse eof(1)
      Line Input #1, txt$
      rtn = hash_write(txt$,"?",hash1(),hash2(),indx)       ' 2nd text arg is not used
      If rtn = 0 Then                                       ' 0 is success, other is duplicate
        Incr cnt                                            ' increment counter of unique words
        Print #2, txt$                                      ' write unique word to disk file
      End If
    Loop

    Close #2
    Close #1

    erase hash1()
    erase hash2()

    ' MSGBOX argument order is text, style, title - the count is the
    ' message and the description is the caption, same layout as the
    ' two error boxes above.
    msgbox str$(cnt),0,"Unique word count"

End FUNCTION

' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤