1 | /* Created by bgalbs on Jan 30, 2003 at 11:45:25 PM */ |
---|
2 | package com.swabunga.spell.engine; |
---|
3 | |
---|
4 | import java.io.File; |
---|
5 | import java.io.IOException; |
---|
6 | import java.io.Reader; |
---|
7 | import java.security.InvalidParameterException; |
---|
8 | import java.util.*; |
---|
9 | |
---|
10 | /** |
---|
11 | * Container for various methods that any <code>SpellDictionary</code> will use. |
---|
12 | * Based on the original Jazzy <a href="http://aspell.net/">aspell</a> port. |
---|
13 | * <p/> |
---|
14 | * |
---|
15 | * |
---|
16 | */ |
---|
17 | public abstract class SpellDictionaryASpell implements SpellDictionary { |
---|
18 | |
---|
19 | |
---|
20 | /** The reference to a Transformator, used to transform a word into it's phonetic code. */ |
---|
21 | protected Transformator tf; |
---|
22 | |
---|
23 | public SpellDictionaryASpell(File phonetic) throws IOException { |
---|
24 | if (phonetic == null) |
---|
25 | tf = new DoubleMeta(); |
---|
26 | else |
---|
27 | tf = new GenericTransformator(phonetic); |
---|
28 | } |
---|
29 | |
---|
30 | public SpellDictionaryASpell(File phonetic, String encoding) throws IOException { |
---|
31 | if (phonetic == null) |
---|
32 | tf = new DoubleMeta(); |
---|
33 | else |
---|
34 | tf = new GenericTransformator(phonetic, encoding); |
---|
35 | } |
---|
36 | |
---|
37 | public SpellDictionaryASpell(Reader phonetic) throws IOException { |
---|
38 | if (phonetic == null) |
---|
39 | tf = new DoubleMeta(); |
---|
40 | else |
---|
41 | tf = new GenericTransformator(phonetic); |
---|
42 | } |
---|
43 | |
---|
44 | |
---|
45 | /** |
---|
46 | * Returns a list of Word objects that are the suggestions to an |
---|
47 | * incorrect word. |
---|
48 | * <p> |
---|
49 | * @param word Suggestions for given mispelt word |
---|
50 | * @param threshold The lower boundary of similarity to mispelt word |
---|
51 | * @return Vector a List of suggestions |
---|
52 | */ |
---|
53 | public List getSuggestions(String word, int threshold) { |
---|
54 | |
---|
55 | Hashtable nearmisscodes = new Hashtable(); |
---|
56 | String code = getCode(word); |
---|
57 | |
---|
58 | // add all words that have the same phonetics |
---|
59 | nearmisscodes.put(code, code); |
---|
60 | Vector phoneticList = getWordsFromCode(word, nearmisscodes); |
---|
61 | |
---|
62 | // do some tranformations to pick up more results |
---|
63 | //interchange |
---|
64 | nearmisscodes = new Hashtable(); |
---|
65 | char[] charArray = word.toCharArray(); |
---|
66 | for (int i = 0; i < word.length() - 1; i++) { |
---|
67 | char a = charArray[i]; |
---|
68 | char b = charArray[i + 1]; |
---|
69 | charArray[i] = b; |
---|
70 | charArray[i + 1] = a; |
---|
71 | String s = getCode(new String(charArray)); |
---|
72 | nearmisscodes.put(s, s); |
---|
73 | charArray[i] = a; |
---|
74 | charArray[i + 1] = b; |
---|
75 | } |
---|
76 | |
---|
77 | char[] replacelist = tf.getReplaceList(); |
---|
78 | |
---|
79 | //change |
---|
80 | charArray = word.toCharArray(); |
---|
81 | for (int i = 0; i < word.length(); i++) { |
---|
82 | char original = charArray[i]; |
---|
83 | for (int j = 0; j < replacelist.length; j++) { |
---|
84 | charArray[i] = replacelist[j]; |
---|
85 | String s = getCode(new String(charArray)); |
---|
86 | nearmisscodes.put(s, s); |
---|
87 | } |
---|
88 | charArray[i] = original; |
---|
89 | } |
---|
90 | |
---|
91 | //add |
---|
92 | charArray = (word += " ").toCharArray(); |
---|
93 | int iy = charArray.length - 1; |
---|
94 | while (true) { |
---|
95 | for (int j = 0; j < replacelist.length; j++) { |
---|
96 | charArray[iy] = replacelist[j]; |
---|
97 | String s = getCode(new String(charArray)); |
---|
98 | nearmisscodes.put(s, s); |
---|
99 | } |
---|
100 | if (iy == 0) |
---|
101 | break; |
---|
102 | charArray[iy] = charArray[iy - 1]; |
---|
103 | --iy; |
---|
104 | } |
---|
105 | |
---|
106 | //delete |
---|
107 | word = word.trim(); |
---|
108 | charArray = word.toCharArray(); |
---|
109 | char[] charArray2 = new char[charArray.length - 1]; |
---|
110 | for (int ix = 0; ix < charArray2.length; ix++) { |
---|
111 | charArray2[ix] = charArray[ix]; |
---|
112 | } |
---|
113 | char a, b; |
---|
114 | a = charArray[charArray.length - 1]; |
---|
115 | int ii = charArray2.length; |
---|
116 | while (true) { |
---|
117 | String s = getCode(new String(charArray)); |
---|
118 | nearmisscodes.put(s, s); |
---|
119 | if (ii == 0) |
---|
120 | break; |
---|
121 | b = a; |
---|
122 | a = charArray2[ii - 1]; |
---|
123 | charArray2[ii - 1] = b; |
---|
124 | --ii; |
---|
125 | } |
---|
126 | |
---|
127 | nearmisscodes.remove(code); //already accounted for in phoneticList |
---|
128 | |
---|
129 | Vector wordlist = getWordsFromCode(word, nearmisscodes); |
---|
130 | |
---|
131 | if (wordlist.size() == 0 && phoneticList.size() == 0) |
---|
132 | addBestGuess(word, phoneticList); |
---|
133 | |
---|
134 | |
---|
135 | // We sort a Vector at the end instead of maintaining a |
---|
136 | // continously sorted TreeSet because everytime you add a collection |
---|
137 | // to a treeset it has to be resorted. It's better to do this operation |
---|
138 | // once at the end. |
---|
139 | |
---|
140 | Collections.sort(phoneticList, new Word()); //always sort phonetic matches along the top |
---|
141 | Collections.sort(wordlist, new Word()); //the non-phonetic matches can be listed below |
---|
142 | |
---|
143 | phoneticList.addAll(wordlist); |
---|
144 | return phoneticList; |
---|
145 | } |
---|
146 | |
---|
147 | /** |
---|
148 | * When we don't come up with any suggestions (probably because the threshold was too strict), |
---|
149 | * then pick the best guesses from the those words that have the same phonetic code. |
---|
150 | * @param word - the word we are trying spell correct |
---|
151 | * @param wordList - the linked list that will get the best guess |
---|
152 | */ |
---|
153 | private void addBestGuess(String word, Vector wordList) { |
---|
154 | if (wordList.size() != 0) |
---|
155 | throw new InvalidParameterException("the wordList vector must be empty"); |
---|
156 | |
---|
157 | int bestScore = Integer.MAX_VALUE; |
---|
158 | |
---|
159 | String code = getCode(word); |
---|
160 | List simwordlist = getWords(code); |
---|
161 | |
---|
162 | LinkedList candidates = new LinkedList(); |
---|
163 | |
---|
164 | for (Iterator j = simwordlist.iterator(); j.hasNext();) { |
---|
165 | String similar = (String) j.next(); |
---|
166 | int distance = EditDistance.getDistance(word, similar); |
---|
167 | if (distance <= bestScore) { |
---|
168 | bestScore = distance; |
---|
169 | Word goodGuess = new Word(similar, distance); |
---|
170 | candidates.add(goodGuess); |
---|
171 | } |
---|
172 | } |
---|
173 | |
---|
174 | //now, only pull out the guesses that had the best score |
---|
175 | for (Iterator iter = candidates.iterator(); iter.hasNext();) { |
---|
176 | Word candidate = (Word) iter.next(); |
---|
177 | if (candidate.getCost() == bestScore) |
---|
178 | wordList.add(candidate); |
---|
179 | } |
---|
180 | |
---|
181 | } |
---|
182 | |
---|
183 | private Vector getWordsFromCode(String word, Hashtable codes) { |
---|
184 | Configuration config = Configuration.getConfiguration(); |
---|
185 | Vector result = new Vector(); |
---|
186 | final int configDistance = config.getInteger(Configuration.SPELL_THRESHOLD); |
---|
187 | |
---|
188 | for (Enumeration i = codes.keys(); i.hasMoreElements();) { |
---|
189 | String code = (String) i.nextElement(); |
---|
190 | |
---|
191 | List simwordlist = getWords(code); |
---|
192 | for (Iterator iter = simwordlist.iterator(); iter.hasNext();) { |
---|
193 | String similar = (String) iter.next(); |
---|
194 | int distance = EditDistance.getDistance(word, similar); |
---|
195 | if (distance < configDistance) { |
---|
196 | Word w = new Word(similar, distance); |
---|
197 | result.addElement(w); |
---|
198 | } |
---|
199 | } |
---|
200 | } |
---|
201 | return result; |
---|
202 | } |
---|
203 | |
---|
204 | /** |
---|
205 | * Returns the phonetic code representing the word. |
---|
206 | */ |
---|
207 | public String getCode(String word) { |
---|
208 | return tf.transform(word); |
---|
209 | } |
---|
210 | |
---|
211 | /** |
---|
212 | * Returns a list of words that have the same phonetic code. |
---|
213 | */ |
---|
214 | protected abstract List getWords(String phoneticCode); |
---|
215 | |
---|
216 | /** |
---|
217 | * Returns true if the word is correctly spelled against the current word list. |
---|
218 | */ |
---|
219 | public boolean isCorrect(String word) { |
---|
220 | List possible = getWords(getCode(word)); |
---|
221 | if (possible.contains(word)) |
---|
222 | return true; |
---|
223 | //JMH should we always try the lowercase version. If I dont then capitalised |
---|
224 | //words are always returned as incorrect. |
---|
225 | else if (possible.contains(word.toLowerCase())) |
---|
226 | return true; |
---|
227 | return false; |
---|
228 | } |
---|
229 | } |
---|