# checker.py
# A happy Python script that reads the file dict.txt as a spelling dictionary
# and uses it to check the file input.txt for spelling errors, printing a
# sorted list of the misspelled words, each followed by its frequency in
# parentheses.
#
# NOTE(review): addWord, tokenize and bestMatch are neither defined nor
# imported in this file; presumably the environment that runs the script
# supplies them -- confirm before running it standalone.

# Only `re` is needed: the dictionary file is iterated directly instead of
# going through the obsolete `xreadlines` module (deprecated since Python 2.3
# and absent from current Python).
import re

# patterns for getting out all the nasty punctuation
# (raw strings, so the backslashes reach the regex engine untouched)
mainpattern = re.compile(r"[.,\"()$:?;]|'s|^'|'$")
dashpattern = re.compile(r'\-')
numberpattern = re.compile(r'^[0-9]+$|^[0-9]+th$|^[0-9]+s$')
# NOTE(review): spacepattern is not used anywhere in this script; it is kept
# only in case code outside this file reads it.
spacepattern = re.compile(r'\W')


def _load_dictionary(path):
    """Pass every line of the file at *path*, minus its newline, to addWord."""
    # `with` closes the file even if addWord raises part-way through.
    with open(path) as dictafile:
        for line in dictafile:
            addWord(line.rstrip('\n'))


def _candidate_words(token):
    """Return the words that must be looked up for one raw input token.

    When taking out the punctuation, the following things are done:
      1) remove . , " ( ) $ : ? ; 's and ' (at the start or end of a word)
      2) if the word contains any dashes, split it up into smaller words
         and check each of them
      3) if a word contains only numbers, optionally followed by th or s,
         skip it completely
      4) if a word is empty after stripping, skip it
    """
    stripped = mainpattern.sub('', token)
    if dashpattern.search(stripped):
        # An apostrophe that was interior to the hyphenated token can end up
        # at the edge of a fragment (rock-'n'-roll -> 'n'), so every fragment
        # is stripped again.  This includes the first one: boys'-club is
        # checked as "boys" and "club".
        fragments = [mainpattern.sub('', part)
                     for part in stripped.split('-')]
    else:
        fragments = [stripped]
    return [word for word in fragments
            if word and not numberpattern.search(word)]


def _is_known(word):
    """Return True if *word*, as written or lower-cased, is in the dictionary.

    A spelling counts as known when it is contained in the result of calling
    bestMatch on that same spelling.
    """
    lowered = word.lower()
    return word in bestMatch(word) or lowered in bestMatch(lowered)


# load the dictionary
_load_dictionary('dict.txt')

# load the file to be checked
words = tokenize('input.txt')

# check each word in the file, counting how often each unknown word occurs
misspelledwords = {}
for token in words:
    for word in _candidate_words(token):
        if not _is_known(word):
            misspelledwords[word] = misspelledwords.get(word, 0) + 1

# print out the sorted list of misspelled words
badwordlist = sorted(misspelledwords)
for word in badwordlist:
    # A single parenthesised argument prints the same text on Python 2 and 3.
    print(word + '(' + str(misspelledwords[word]) + ')')