# checker.py
# a happy python script that reads a file dict.txt for a spelling dictionary
# and uses it to check a file input.txt for spelling errors, printing a
# list of misspelled words and their frequency as output.

# import sundries for reading files and text searching
import xreadlines, re

# load the dictionary: every line of dict.txt, minus its newline, is
# passed to addWord
dictafile = open('dict.txt')
try:
    # iterate over the file object directly; this yields the same lines as
    # the deprecated xreadlines module without depending on it
    for line in dictafile:
        word = line.replace('\n','')
        addWord(word)
finally:
    # release the file handle even if addWord raises part-way through
    dictafile.close()

# load the file to be checked
# NOTE(review): tokenize is not defined in the visible part of this script.
# The checking loop further down calls words.extend() while it iterates over
# words, so the value returned here must support in-place extension
# (presumably a list of word strings -- TODO confirm against tokenize).
words = tokenize('input.txt')

# patterns for getting out all the nasty punctuation
# (raw strings keep the backslashes for the regex engine instead of relying
# on Python passing unknown string escapes through untouched)

# punctuation that is deleted from a word before it is looked up
mainpattern = re.compile(
    r'[.,"()$:?;]'  # any one of these characters, anywhere in the word
    r"|'s"          # the two-character sequence 's
    r"|^'|'$"       # an apostrophe at the very start or very end
)
# a dash marks a compound word that gets split into its parts
dashpattern = re.compile(r'-')
# digits only, optionally followed by "th" or "s" (e.g. 42, 5th, 1990s)
numberpattern = re.compile(r'^[0-9]+(?:th|s)?$')
# any single non-word character; not referenced in the visible part of the
# script
spacepattern = re.compile(r'\W')

# walk the word list and tally every word the dictionary does not recognise
misspelledwords = {}
for word in words:
    # punctuation clean-up, in this order:
    #   1) delete . , " ( ) $ : ? ; and 's, plus a ' that opens or closes
    #      the word
    #   2) break a dashed word at its dashes; the leading piece is checked
    #      right away and the remaining pieces are queued at the end of the
    #      word list, where this same loop will reach them later
    #   3) a word made only of digits, optionally ending in th or s, is not
    #      spell-checked
    #   4) a word that is an empty string after clean-up is skipped
    candidate = mainpattern.sub('', word)
    if dashpattern.search(candidate):
        pieces = candidate.split('-')
        candidate = pieces[0]
        words.extend(pieces[1:])
    if candidate == '' or numberpattern.search(candidate):
        continue

    # accept the word as written, or failing that its lower-case form
    if candidate in bestMatch(candidate):
        continue
    lowered = candidate.lower()
    if lowered in bestMatch(lowered):
        continue

    # unknown word: bump its count, starting from zero on first sight
    misspelledwords[candidate] = misspelledwords.get(candidate, 0) + 1

# print out the sorted list of misspelled words, one per line, each followed
# by its count in parentheses
# copy the keys into a real list first: on Python 3 keys() returns a view
# that has no sort() method
badwordlist = list(misspelledwords.keys())
badwordlist.sort()
for word in badwordlist:
    # a single parenthesised argument prints the same text whether print is
    # a statement (Python 2) or a function (Python 3)
    print(word + '(' + str(misspelledwords[word]) + ')')