| 1 | # checker.py |
| 2 | # a happy python script that reads a file dict.txt for a spelling dictionary |
| 3 | # and uses it to check a file input.txt for spelling errors, printing a |
| 4 | # list of misspelled words and their frequency as output. |
| 5 | |
| 6 | # import sundries for reading files and text searching |
| 7 | import xreadlines, re |
| 8 | |
# load the dictionary: dict.txt holds one word per line, and every line is
# handed to addWord with its line ending removed
dictafile = open('dict.txt')
try:
    # Iterate over the file object directly; the xreadlines module is
    # deprecated in favour of plain "for line in file".
    for line in dictafile:
        # Strip '\r' as well as '\n' so a dictionary saved with CRLF line
        # endings does not register every word with a trailing '\r'.
        addWord(line.rstrip('\r\n'))
finally:
    # Close the handle even if addWord raises part-way through.
    dictafile.close()

# load the file to be checked; the result is iterated over as a sequence of
# raw tokens by the checking loop
words = tokenize('input.txt')
| 18 | |
# patterns for getting out all the nasty punctuation
#
# All patterns are raw strings so that regex escapes reach the re module
# untouched and the interpreter does not warn about invalid string escapes.

# Punctuation removed from every token: . , " ( ) $ : ? ; anywhere in the
# token, the possessive 's anywhere, and a single ' at the very start or end.
mainpattern = re.compile(r"[.,\"()$:?;]|'s|^'|'$")
# A token containing a dash is treated as several words joined together.
dashpattern = re.compile(r'-')
# Pure numbers, optionally followed by "th" or "s" (e.g. 12, 5th, 1990s).
numberpattern = re.compile(r'^[0-9]+$|^[0-9]+th$|^[0-9]+s$')
# Any non-word character. Not referenced in the visible part of this script.
spacepattern = re.compile(r'\W')
| 24 | |
# check each word in the file
#
# misspelledwords maps each word that failed the dictionary lookup to the
# number of times it was seen.
#
# when taking out the punctuation, do the following things:
#   1) remove . , " ( ) $ : ? ; 's and ' (at the start or end of a word)
#   2) if the word contains any dashes, split it up into smaller words and
#      check each of them on its own
#   3) if a word contains only numbers, optionally followed by th or s,
#      skip the word completely
#   4) if a word is actually an empty string, skip the word
misspelledwords = {}
for word in words:
    strippedword = mainpattern.sub('', word)
    if dashpattern.search(strippedword):
        # Splitting can expose a quote at the edge of a fragment (e.g.
        # "foo'-bar" -> "foo'"), so every fragment is cleaned again. The
        # fragments are checked right here instead of being appended to
        # `words`, which keeps the list being iterated unmodified.
        fragments = [mainpattern.sub('', part)
                     for part in strippedword.split('-')]
    else:
        fragments = [strippedword]

    for fragment in fragments:
        if fragment == '' or numberpattern.search(fragment):
            continue

        # now see if the word is in the dictionary: accept it if either the
        # word as written or its lower-cased form is among the results of
        # bestMatch (presumably the dictionary's candidate matches for the
        # given word -- confirm against its definition)
        lowerfragment = fragment.lower()
        if (fragment in bestMatch(fragment)
                or lowerfragment in bestMatch(lowerfragment)):
            continue

        # Misspellings are counted under the spelling as written (case kept).
        misspelledwords[fragment] = misspelledwords.get(fragment, 0) + 1
| 57 | |
# print out the sorted list of misspelled words, one per line, formatted as
# word(count)
#
# keys() is copied into a real list before sorting, and print is called with
# a single parenthesised argument, so this section behaves the same under
# both Python 2 and Python 3.
badwordlist = list(misspelledwords.keys())
badwordlist.sort()
for word in badwordlist:
    print(word + '(' + str(misspelledwords[word]) + ')')