[1014] | 1 | /* Created by bgalbs on Jan 30, 2003 at 11:45:25 PM */ |
---|
| 2 | package com.swabunga.spell.engine; |
---|
| 3 | |
---|
| 4 | import java.io.File; |
---|
| 5 | import java.io.IOException; |
---|
| 6 | import java.io.Reader; |
---|
| 7 | import java.security.InvalidParameterException; |
---|
| 8 | import java.util.*; |
---|
| 9 | |
---|
| 10 | /** |
---|
| 11 | * Container for various methods that any <code>SpellDictionary</code> will use. |
---|
| 12 | * Based on the original Jazzy <a href="http://aspell.net/">aspell</a> port. |
---|
| 13 | * <p/> |
---|
| 14 | * |
---|
| 15 | * |
---|
| 16 | */ |
---|
| 17 | public abstract class SpellDictionaryASpell implements SpellDictionary { |
---|
| 18 | |
---|
| 19 | |
---|
| 20 | /** The reference to a Transformator, used to transform a word into it's phonetic code. */ |
---|
| 21 | protected Transformator tf; |
---|
| 22 | |
---|
| 23 | public SpellDictionaryASpell(File phonetic) throws IOException { |
---|
| 24 | if (phonetic == null) |
---|
| 25 | tf = new DoubleMeta(); |
---|
| 26 | else |
---|
| 27 | tf = new GenericTransformator(phonetic); |
---|
| 28 | } |
---|
| 29 | |
---|
| 30 | public SpellDictionaryASpell(File phonetic, String encoding) throws IOException { |
---|
| 31 | if (phonetic == null) |
---|
| 32 | tf = new DoubleMeta(); |
---|
| 33 | else |
---|
| 34 | tf = new GenericTransformator(phonetic, encoding); |
---|
| 35 | } |
---|
| 36 | |
---|
| 37 | public SpellDictionaryASpell(Reader phonetic) throws IOException { |
---|
| 38 | if (phonetic == null) |
---|
| 39 | tf = new DoubleMeta(); |
---|
| 40 | else |
---|
| 41 | tf = new GenericTransformator(phonetic); |
---|
| 42 | } |
---|
| 43 | |
---|
| 44 | |
---|
| 45 | /** |
---|
| 46 | * Returns a list of Word objects that are the suggestions to an |
---|
| 47 | * incorrect word. |
---|
| 48 | * <p> |
---|
| 49 | * @param word Suggestions for given mispelt word |
---|
| 50 | * @param threshold The lower boundary of similarity to mispelt word |
---|
| 51 | * @return Vector a List of suggestions |
---|
| 52 | */ |
---|
| 53 | public List getSuggestions(String word, int threshold) { |
---|
| 54 | |
---|
| 55 | Hashtable nearmisscodes = new Hashtable(); |
---|
| 56 | String code = getCode(word); |
---|
| 57 | |
---|
| 58 | // add all words that have the same phonetics |
---|
| 59 | nearmisscodes.put(code, code); |
---|
| 60 | Vector phoneticList = getWordsFromCode(word, nearmisscodes); |
---|
| 61 | |
---|
| 62 | // do some tranformations to pick up more results |
---|
| 63 | //interchange |
---|
| 64 | nearmisscodes = new Hashtable(); |
---|
| 65 | char[] charArray = word.toCharArray(); |
---|
| 66 | for (int i = 0; i < word.length() - 1; i++) { |
---|
| 67 | char a = charArray[i]; |
---|
| 68 | char b = charArray[i + 1]; |
---|
| 69 | charArray[i] = b; |
---|
| 70 | charArray[i + 1] = a; |
---|
| 71 | String s = getCode(new String(charArray)); |
---|
| 72 | nearmisscodes.put(s, s); |
---|
| 73 | charArray[i] = a; |
---|
| 74 | charArray[i + 1] = b; |
---|
| 75 | } |
---|
| 76 | |
---|
| 77 | char[] replacelist = tf.getReplaceList(); |
---|
| 78 | |
---|
| 79 | //change |
---|
| 80 | charArray = word.toCharArray(); |
---|
| 81 | for (int i = 0; i < word.length(); i++) { |
---|
| 82 | char original = charArray[i]; |
---|
| 83 | for (int j = 0; j < replacelist.length; j++) { |
---|
| 84 | charArray[i] = replacelist[j]; |
---|
| 85 | String s = getCode(new String(charArray)); |
---|
| 86 | nearmisscodes.put(s, s); |
---|
| 87 | } |
---|
| 88 | charArray[i] = original; |
---|
| 89 | } |
---|
| 90 | |
---|
| 91 | //add |
---|
| 92 | charArray = (word += " ").toCharArray(); |
---|
| 93 | int iy = charArray.length - 1; |
---|
| 94 | while (true) { |
---|
| 95 | for (int j = 0; j < replacelist.length; j++) { |
---|
| 96 | charArray[iy] = replacelist[j]; |
---|
| 97 | String s = getCode(new String(charArray)); |
---|
| 98 | nearmisscodes.put(s, s); |
---|
| 99 | } |
---|
| 100 | if (iy == 0) |
---|
| 101 | break; |
---|
| 102 | charArray[iy] = charArray[iy - 1]; |
---|
| 103 | --iy; |
---|
| 104 | } |
---|
| 105 | |
---|
| 106 | //delete |
---|
| 107 | word = word.trim(); |
---|
| 108 | charArray = word.toCharArray(); |
---|
| 109 | char[] charArray2 = new char[charArray.length - 1]; |
---|
| 110 | for (int ix = 0; ix < charArray2.length; ix++) { |
---|
| 111 | charArray2[ix] = charArray[ix]; |
---|
| 112 | } |
---|
| 113 | char a, b; |
---|
| 114 | a = charArray[charArray.length - 1]; |
---|
| 115 | int ii = charArray2.length; |
---|
| 116 | while (true) { |
---|
| 117 | String s = getCode(new String(charArray)); |
---|
| 118 | nearmisscodes.put(s, s); |
---|
| 119 | if (ii == 0) |
---|
| 120 | break; |
---|
| 121 | b = a; |
---|
| 122 | a = charArray2[ii - 1]; |
---|
| 123 | charArray2[ii - 1] = b; |
---|
| 124 | --ii; |
---|
| 125 | } |
---|
| 126 | |
---|
| 127 | nearmisscodes.remove(code); //already accounted for in phoneticList |
---|
| 128 | |
---|
| 129 | Vector wordlist = getWordsFromCode(word, nearmisscodes); |
---|
| 130 | |
---|
| 131 | if (wordlist.size() == 0 && phoneticList.size() == 0) |
---|
| 132 | addBestGuess(word, phoneticList); |
---|
| 133 | |
---|
| 134 | |
---|
| 135 | // We sort a Vector at the end instead of maintaining a |
---|
| 136 | // continously sorted TreeSet because everytime you add a collection |
---|
| 137 | // to a treeset it has to be resorted. It's better to do this operation |
---|
| 138 | // once at the end. |
---|
| 139 | |
---|
| 140 | Collections.sort(phoneticList, new Word()); //always sort phonetic matches along the top |
---|
| 141 | Collections.sort(wordlist, new Word()); //the non-phonetic matches can be listed below |
---|
| 142 | |
---|
| 143 | phoneticList.addAll(wordlist); |
---|
| 144 | return phoneticList; |
---|
| 145 | } |
---|
| 146 | |
---|
| 147 | /** |
---|
| 148 | * When we don't come up with any suggestions (probably because the threshold was too strict), |
---|
| 149 | * then pick the best guesses from the those words that have the same phonetic code. |
---|
| 150 | * @param word - the word we are trying spell correct |
---|
| 151 | * @param wordList - the linked list that will get the best guess |
---|
| 152 | */ |
---|
| 153 | private void addBestGuess(String word, Vector wordList) { |
---|
| 154 | if (wordList.size() != 0) |
---|
| 155 | throw new InvalidParameterException("the wordList vector must be empty"); |
---|
| 156 | |
---|
| 157 | int bestScore = Integer.MAX_VALUE; |
---|
| 158 | |
---|
| 159 | String code = getCode(word); |
---|
| 160 | List simwordlist = getWords(code); |
---|
| 161 | |
---|
| 162 | LinkedList candidates = new LinkedList(); |
---|
| 163 | |
---|
| 164 | for (Iterator j = simwordlist.iterator(); j.hasNext();) { |
---|
| 165 | String similar = (String) j.next(); |
---|
| 166 | int distance = EditDistance.getDistance(word, similar); |
---|
| 167 | if (distance <= bestScore) { |
---|
| 168 | bestScore = distance; |
---|
| 169 | Word goodGuess = new Word(similar, distance); |
---|
| 170 | candidates.add(goodGuess); |
---|
| 171 | } |
---|
| 172 | } |
---|
| 173 | |
---|
| 174 | //now, only pull out the guesses that had the best score |
---|
| 175 | for (Iterator iter = candidates.iterator(); iter.hasNext();) { |
---|
| 176 | Word candidate = (Word) iter.next(); |
---|
| 177 | if (candidate.getCost() == bestScore) |
---|
| 178 | wordList.add(candidate); |
---|
| 179 | } |
---|
| 180 | |
---|
| 181 | } |
---|
| 182 | |
---|
| 183 | private Vector getWordsFromCode(String word, Hashtable codes) { |
---|
| 184 | Configuration config = Configuration.getConfiguration(); |
---|
| 185 | Vector result = new Vector(); |
---|
| 186 | final int configDistance = config.getInteger(Configuration.SPELL_THRESHOLD); |
---|
| 187 | |
---|
| 188 | for (Enumeration i = codes.keys(); i.hasMoreElements();) { |
---|
| 189 | String code = (String) i.nextElement(); |
---|
| 190 | |
---|
| 191 | List simwordlist = getWords(code); |
---|
| 192 | for (Iterator iter = simwordlist.iterator(); iter.hasNext();) { |
---|
| 193 | String similar = (String) iter.next(); |
---|
| 194 | int distance = EditDistance.getDistance(word, similar); |
---|
| 195 | if (distance < configDistance) { |
---|
| 196 | Word w = new Word(similar, distance); |
---|
| 197 | result.addElement(w); |
---|
| 198 | } |
---|
| 199 | } |
---|
| 200 | } |
---|
| 201 | return result; |
---|
| 202 | } |
---|
| 203 | |
---|
| 204 | /** |
---|
| 205 | * Returns the phonetic code representing the word. |
---|
| 206 | */ |
---|
| 207 | public String getCode(String word) { |
---|
| 208 | return tf.transform(word); |
---|
| 209 | } |
---|
| 210 | |
---|
| 211 | /** |
---|
| 212 | * Returns a list of words that have the same phonetic code. |
---|
| 213 | */ |
---|
| 214 | protected abstract List getWords(String phoneticCode); |
---|
| 215 | |
---|
| 216 | /** |
---|
| 217 | * Returns true if the word is correctly spelled against the current word list. |
---|
| 218 | */ |
---|
| 219 | public boolean isCorrect(String word) { |
---|
| 220 | List possible = getWords(getCode(word)); |
---|
| 221 | if (possible.contains(word)) |
---|
| 222 | return true; |
---|
| 223 | //JMH should we always try the lowercase version. If I dont then capitalised |
---|
| 224 | //words are always returned as incorrect. |
---|
| 225 | else if (possible.contains(word.toLowerCase())) |
---|
| 226 | return true; |
---|
| 227 | return false; |
---|
| 228 | } |
---|
| 229 | } |
---|