1 | # checker.py |
2 | # a happy python script that reads a file dict.txt for a spelling dictionary |
3 | # and uses it to check a file input.txt for spelling errors, printing a |
4 | # list of misspelled words and their frequency as output. |
5 | |
6 | # import sundries for reading files and text searching |
7 | import xreadlines, re |
8 | |
# load the dictionary: every line of dict.txt, minus its newline, is handed
# to addWord().
# NOTE(review): addWord() is not defined in this file; it is presumably
# provided by the host environment -- confirm before running standalone.
dictafile = open('dict.txt')
try:
    # Iterate over the file object directly: it yields the same lines as
    # xreadlines.xreadlines() without relying on the deprecated module.
    for line in dictafile:
        word = line.replace('\n', '')
        addWord(word)
finally:
    # Release the handle even if addWord() raises part-way through the file.
    dictafile.close()

# load the file to be checked
# NOTE(review): tokenize() is not defined in this file either; the checking
# loop treats its result as a sequence of word strings -- confirm against
# its definition.
words = tokenize('input.txt')
18 | |
# patterns for getting out all the nasty punctuation
#
# All patterns are raw strings so that the regex engine, not the Python
# string parser, interprets the backslashes.

# Punctuation to delete from a word. Inside a character class none of
# . ( ) $ ? need escaping.
mainpattern = re.compile(
    r'[.,"()$:?;]'  # any one of . , " ( ) $ : ? ;
    r"|'s"          # a possessive 's
    r"|^'|'$"       # an apostrophe at the very start or end of the word
)

# A literal dash, used to detect compound words.
dashpattern = re.compile(r'-')

# A word made only of digits, optionally followed by "th" or "s"
# (e.g. 1984, 5th, 1990s).
numberpattern = re.compile(r'^[0-9]+(?:th|s)?$')

# Any single non-word character.
spacepattern = re.compile(r'\W')
24 | |
# check each word in the file
# misspelledwords maps each unrecognised word (as written, after cleaning)
# to the number of times it was seen.
misspelledwords = {}
for word in words:
    # when cleaning up a token, do the following things:
    # 1) if the token contains any dashes, split it up into smaller words
    #    and check every one of them
    # 2) remove . , " ( ) $ : ? ; 's and ' (at the start or end of a word)
    # 3) if the word contains only numbers, optionally followed by th or s,
    #    skip the word completely
    # 4) if the word is actually a null string, skip the word
    #
    # The dash-separated parts are handled in an inner loop rather than
    # being appended to `words`, so the list is never modified while it is
    # being iterated. Every part, including the first, goes through the
    # same punctuation stripping.
    for part in word.split('-'):
        strippedword = mainpattern.sub('', part)
        if strippedword == '' or numberpattern.search(strippedword):
            continue

        # now see if the word is in the dictionary, first exactly as
        # written and then lowercased
        # NOTE(review): bestMatch() is not defined in this file; a word is
        # accepted when it is contained in bestMatch's result for it.
        lowerstrippedword = strippedword.lower()
        if strippedword in bestMatch(strippedword):
            continue
        if lowerstrippedword in bestMatch(lowerstrippedword):
            continue

        # unknown word: bump its count, starting from zero on first sight
        misspelledwords[strippedword] = misspelledwords.get(strippedword, 0) + 1
57 | |
# print out the sorted list of misspelled words, one "word(count)" per line
# Copy the keys into a real list before sorting: an in-place sort on the
# result of keys() only works where keys() returns a list.
badwordlist = list(misspelledwords.keys())
badwordlist.sort()
for word in badwordlist:
    # A single parenthesised argument prints identically whether print is
    # a statement (Python 2) or a function (Python 3).
    print(word + '(' + str(misspelledwords[word]) + ')')