[1014] | 1 | package com.swabunga.spell.engine; |
---|
| 2 | |
---|
| 3 | import com.swabunga.util.StringUtility; |
---|
| 4 | |
---|
| 5 | import java.io.*; |
---|
| 6 | import java.util.HashMap; |
---|
| 7 | import java.util.Vector; |
---|
| 8 | |
---|
| 9 | /** |
---|
| 10 | * A Generic implementation of a transformator takes an aspell phonetics file and constructs |
---|
| 11 | * some sort of transformation table using the inner class Rule. |
---|
| 12 | * |
---|
| 13 | * @author Robert Gustavsson (robert@lindesign.se) |
---|
| 14 | */ |
---|
| 15 | public class GenericTransformator implements Transformator { |
---|
| 16 | |
---|
| 17 | |
---|
| 18 | /** |
---|
| 19 | * This replace list is used if no phonetic file is supplied or it doesn't |
---|
| 20 | * contain the alphabet. |
---|
| 21 | */ |
---|
| 22 | private static final char[] defaultEnglishAlphabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'}; |
---|
| 23 | |
---|
| 24 | |
---|
| 25 | public static final char ALPHABET_START = '['; |
---|
| 26 | public static final char ALPHABET_END = ']'; |
---|
| 27 | public static final String KEYWORD_ALPHBET = "alphabet"; |
---|
| 28 | public static final String[] IGNORED_KEYWORDS = {"version", "followup", "collapse_result"}; |
---|
| 29 | |
---|
| 30 | public static final char STARTMULTI = '('; |
---|
| 31 | public static final char ENDMULTI = ')'; |
---|
| 32 | public static final String DIGITCODE = "0"; |
---|
| 33 | public static final String REPLACEVOID = "_"; |
---|
| 34 | |
---|
| 35 | private Object[] ruleArray = null; |
---|
| 36 | private char[] alphabetString = defaultEnglishAlphabet; |
---|
| 37 | |
---|
| 38 | public GenericTransformator(File phonetic) throws IOException { |
---|
| 39 | buildRules(new BufferedReader(new FileReader(phonetic))); |
---|
| 40 | alphabetString = washAlphabetIntoReplaceList(getReplaceList()); |
---|
| 41 | |
---|
| 42 | } |
---|
| 43 | |
---|
| 44 | public GenericTransformator(File phonetic, String encoding) throws IOException { |
---|
| 45 | buildRules(new BufferedReader(new InputStreamReader(new FileInputStream(phonetic), encoding))); |
---|
| 46 | alphabetString = washAlphabetIntoReplaceList(getReplaceList()); |
---|
| 47 | } |
---|
| 48 | |
---|
| 49 | public GenericTransformator(Reader phonetic) throws IOException { |
---|
| 50 | buildRules(new BufferedReader(phonetic)); |
---|
| 51 | alphabetString = washAlphabetIntoReplaceList(getReplaceList()); |
---|
| 52 | } |
---|
| 53 | |
---|
| 54 | /** |
---|
| 55 | * Goes through an alphabet and makes sure that only one of those letters |
---|
| 56 | * that are coded equally will be in the replace list. |
---|
| 57 | * In other words, it removes any letters in the alphabet |
---|
| 58 | * that are redundant phonetically. |
---|
| 59 | * |
---|
| 60 | * This is done to improve speed in the getSuggestion method. |
---|
| 61 | * |
---|
| 62 | * @param alphabet The complete alphabet to wash. |
---|
| 63 | * @return The washed alphabet to be used as replace list. |
---|
| 64 | */ |
---|
| 65 | private char[] washAlphabetIntoReplaceList(char[] alphabet) { |
---|
| 66 | |
---|
| 67 | HashMap letters = new HashMap(alphabet.length); |
---|
| 68 | |
---|
| 69 | for (int i = 0; i < alphabet.length; i++) { |
---|
| 70 | String tmp = String.valueOf(alphabet[i]); |
---|
| 71 | String code = transform(tmp); |
---|
| 72 | if (!letters.containsKey(code)) { |
---|
| 73 | letters.put(code, new Character(alphabet[i])); |
---|
| 74 | } |
---|
| 75 | } |
---|
| 76 | |
---|
| 77 | Object[] tmpCharacters = letters.values().toArray(); |
---|
| 78 | char[] washedArray = new char[tmpCharacters.length]; |
---|
| 79 | |
---|
| 80 | for (int i = 0; i < tmpCharacters.length; i++) { |
---|
| 81 | washedArray[i] = ((Character) tmpCharacters[i]).charValue(); |
---|
| 82 | } |
---|
| 83 | |
---|
| 84 | return washedArray; |
---|
| 85 | } |
---|
| 86 | |
---|
| 87 | |
---|
| 88 | /** |
---|
| 89 | * Takes out all single character replacements and put them in a char array. |
---|
| 90 | * This array can later be used for adding or changing letters in getSuggestion(). |
---|
| 91 | * @return char[] An array of chars with replacements characters |
---|
| 92 | */ |
---|
| 93 | public char[] getCodeReplaceList() { |
---|
| 94 | char[] replacements; |
---|
| 95 | TransformationRule rule; |
---|
| 96 | Vector tmp = new Vector(); |
---|
| 97 | |
---|
| 98 | if (ruleArray == null) |
---|
| 99 | return null; |
---|
| 100 | for (int i = 0; i < ruleArray.length; i++) { |
---|
| 101 | rule = (TransformationRule) ruleArray[i]; |
---|
| 102 | if (rule.getReplaceExp().length() == 1) |
---|
| 103 | tmp.addElement(rule.getReplaceExp()); |
---|
| 104 | } |
---|
| 105 | replacements = new char[tmp.size()]; |
---|
| 106 | for (int i = 0; i < tmp.size(); i++) { |
---|
| 107 | replacements[i] = ((String) tmp.elementAt(i)).charAt(0); |
---|
| 108 | } |
---|
| 109 | return replacements; |
---|
| 110 | } |
---|
| 111 | |
---|
| 112 | /** |
---|
| 113 | * Builds up an char array with the chars in the alphabet of the language as it was read from the |
---|
| 114 | * alphabet tag in the phonetic file. |
---|
| 115 | * @return char[] An array of chars representing the alphabet or null if no alphabet was available. |
---|
| 116 | */ |
---|
| 117 | public char[] getReplaceList() { |
---|
| 118 | return alphabetString; |
---|
| 119 | } |
---|
| 120 | |
---|
| 121 | /** |
---|
| 122 | * Returns the phonetic code of the word. |
---|
| 123 | */ |
---|
| 124 | public String transform(String word) { |
---|
| 125 | |
---|
| 126 | if (ruleArray == null) |
---|
| 127 | return null; |
---|
| 128 | |
---|
| 129 | TransformationRule rule; |
---|
| 130 | StringBuffer str = new StringBuffer(word.toUpperCase()); |
---|
| 131 | int strLength = str.length(); |
---|
| 132 | int startPos = 0, add = 1; |
---|
| 133 | |
---|
| 134 | while (startPos < strLength) { |
---|
| 135 | |
---|
| 136 | add = 1; |
---|
| 137 | if (Character.isDigit(str.charAt(startPos))) { |
---|
| 138 | StringUtility.replace(str, startPos, startPos + DIGITCODE.length(), DIGITCODE); |
---|
| 139 | startPos += add; |
---|
| 140 | continue; |
---|
| 141 | } |
---|
| 142 | |
---|
| 143 | for (int i = 0; i < ruleArray.length; i++) { |
---|
| 144 | //System.out.println("Testing rule#:"+i); |
---|
| 145 | rule = (TransformationRule) ruleArray[i]; |
---|
| 146 | if (rule.startsWithExp() && startPos > 0) |
---|
| 147 | continue; |
---|
| 148 | if (startPos + rule.lengthOfMatch() > strLength) { |
---|
| 149 | continue; |
---|
| 150 | } |
---|
| 151 | if (rule.isMatching(str, startPos)) { |
---|
| 152 | String replaceExp = rule.getReplaceExp(); |
---|
| 153 | |
---|
| 154 | add = replaceExp.length(); |
---|
| 155 | StringUtility.replace(str, startPos, startPos + rule.getTakeOut(), replaceExp); |
---|
| 156 | strLength -= rule.getTakeOut(); |
---|
| 157 | strLength += add; |
---|
| 158 | //System.out.println("Replacing with rule#:"+i+" add="+add); |
---|
| 159 | break; |
---|
| 160 | } |
---|
| 161 | } |
---|
| 162 | startPos += add; |
---|
| 163 | } |
---|
| 164 | //System.out.println(word); |
---|
| 165 | //System.out.println(str.toString()); |
---|
| 166 | return str.toString(); |
---|
| 167 | } |
---|
| 168 | |
---|
| 169 | // Used to build up the transformastion table. |
---|
| 170 | private void buildRules(BufferedReader in) throws IOException { |
---|
| 171 | String read = null; |
---|
| 172 | Vector ruleList = new Vector(); |
---|
| 173 | while ((read = in.readLine()) != null) { |
---|
| 174 | buildRule(realTrimmer(read), ruleList); |
---|
| 175 | } |
---|
| 176 | ruleArray = new TransformationRule[ruleList.size()]; |
---|
| 177 | ruleList.copyInto(ruleArray); |
---|
| 178 | } |
---|
| 179 | |
---|
| 180 | // Here is where the real work of reading the phonetics file is done. |
---|
| 181 | private void buildRule(String str, Vector ruleList) { |
---|
| 182 | if (str.length() < 1) |
---|
| 183 | return; |
---|
| 184 | for (int i = 0; i < IGNORED_KEYWORDS.length; i++) { |
---|
| 185 | if (str.startsWith(IGNORED_KEYWORDS[i])) |
---|
| 186 | return; |
---|
| 187 | } |
---|
| 188 | |
---|
| 189 | // A different alphabet is used for this language, will be read into |
---|
| 190 | // the alphabetString variable. |
---|
| 191 | if (str.startsWith(KEYWORD_ALPHBET)) { |
---|
| 192 | int start = str.indexOf(ALPHABET_START); |
---|
| 193 | int end = str.lastIndexOf(ALPHABET_END); |
---|
| 194 | if (end != -1 && start != -1) { |
---|
| 195 | alphabetString = str.substring(++start, end).toCharArray(); |
---|
| 196 | } |
---|
| 197 | return; |
---|
| 198 | } |
---|
| 199 | |
---|
| 200 | TransformationRule rule = null; |
---|
| 201 | StringBuffer matchExp = new StringBuffer(); |
---|
| 202 | StringBuffer replaceExp = new StringBuffer(); |
---|
| 203 | boolean start = false, |
---|
| 204 | end = false; |
---|
| 205 | int takeOutPart = 0, |
---|
| 206 | matchLength = 0; |
---|
| 207 | boolean match = true, |
---|
| 208 | inMulti = false; |
---|
| 209 | for (int i = 0; i < str.length(); i++) { |
---|
| 210 | if (Character.isWhitespace(str.charAt(i))) { |
---|
| 211 | match = false; |
---|
| 212 | } else { |
---|
| 213 | if (match) { |
---|
| 214 | if (!isReservedChar(str.charAt(i))) { |
---|
| 215 | matchExp.append(str.charAt(i)); |
---|
| 216 | if (!inMulti) { |
---|
| 217 | takeOutPart++; |
---|
| 218 | matchLength++; |
---|
| 219 | } |
---|
| 220 | if (str.charAt(i) == STARTMULTI || str.charAt(i) == ENDMULTI) |
---|
| 221 | inMulti = !inMulti; |
---|
| 222 | } |
---|
| 223 | if (str.charAt(i) == '-') |
---|
| 224 | takeOutPart--; |
---|
| 225 | if (str.charAt(i) == '^') |
---|
| 226 | start = true; |
---|
| 227 | if (str.charAt(i) == '$') |
---|
| 228 | end = true; |
---|
| 229 | } else { |
---|
| 230 | replaceExp.append(str.charAt(i)); |
---|
| 231 | } |
---|
| 232 | } |
---|
| 233 | } |
---|
| 234 | if (replaceExp.toString().equals(REPLACEVOID)) { |
---|
| 235 | replaceExp = new StringBuffer(""); |
---|
| 236 | //System.out.println("Changing _ to \"\" for "+matchExp.toString()); |
---|
| 237 | } |
---|
| 238 | rule = new TransformationRule(matchExp.toString(), replaceExp.toString(), takeOutPart, matchLength, start, end); |
---|
| 239 | //System.out.println(rule.toString()); |
---|
| 240 | ruleList.addElement(rule); |
---|
| 241 | } |
---|
| 242 | |
---|
| 243 | // Chars with special meaning to aspell. Not everyone is implemented here. |
---|
| 244 | private boolean isReservedChar(char ch) { |
---|
| 245 | if (ch == '<' || ch == '>' || ch == '^' || ch == '$' || ch == '-' || Character.isDigit(ch)) |
---|
| 246 | return true; |
---|
| 247 | return false; |
---|
| 248 | } |
---|
| 249 | |
---|
| 250 | // Trims off everything we don't care about. |
---|
| 251 | private String realTrimmer(String row) { |
---|
| 252 | int pos = row.indexOf('#'); |
---|
| 253 | if (pos != -1) { |
---|
| 254 | row = row.substring(0, pos); |
---|
| 255 | } |
---|
| 256 | return row.trim(); |
---|
| 257 | } |
---|
| 258 | |
---|
| 259 | // Inner Classes |
---|
| 260 | /* |
---|
| 261 | * Holds the match string and the replace string and all the rule attributes. |
---|
| 262 | * Is responsible for indicating matches. |
---|
| 263 | */ |
---|
| 264 | private class TransformationRule { |
---|
| 265 | |
---|
| 266 | private String replace; |
---|
| 267 | private char[] match; |
---|
| 268 | // takeOut=number of chars to replace; |
---|
| 269 | // matchLength=length of matching string counting multies as one. |
---|
| 270 | private int takeOut, matchLength; |
---|
| 271 | private boolean start, end; |
---|
| 272 | |
---|
| 273 | // Construktor |
---|
| 274 | public TransformationRule(String match, String replace, int takeout, int matchLength, boolean start, boolean end) { |
---|
| 275 | this.match = match.toCharArray(); |
---|
| 276 | this.replace = replace; |
---|
| 277 | this.takeOut = takeout; |
---|
| 278 | this.matchLength = matchLength; |
---|
| 279 | this.start = start; |
---|
| 280 | this.end = end; |
---|
| 281 | } |
---|
| 282 | |
---|
| 283 | /* |
---|
| 284 | * Returns true if word from pos and forward matches the match string. |
---|
| 285 | * Precondition: wordPos+matchLength<word.length() |
---|
| 286 | */ |
---|
| 287 | public boolean isMatching(StringBuffer word, int wordPos) { |
---|
| 288 | boolean matching = true, inMulti = false, multiMatch = false; |
---|
| 289 | char matchCh; |
---|
| 290 | |
---|
| 291 | for (int matchPos = 0; matchPos < match.length; matchPos++) { |
---|
| 292 | matchCh = match[matchPos]; |
---|
| 293 | if (matchCh == STARTMULTI || matchCh == ENDMULTI) { |
---|
| 294 | inMulti = !inMulti; |
---|
| 295 | if (!inMulti) |
---|
| 296 | matching = matching & multiMatch; |
---|
| 297 | else |
---|
| 298 | multiMatch = false; |
---|
| 299 | } else { |
---|
| 300 | if (matchCh != word.charAt(wordPos)) { |
---|
| 301 | if (inMulti) |
---|
| 302 | multiMatch = multiMatch | false; |
---|
| 303 | else |
---|
| 304 | matching = false; |
---|
| 305 | } else { |
---|
| 306 | if (inMulti) |
---|
| 307 | multiMatch = multiMatch | true; |
---|
| 308 | else |
---|
| 309 | matching = true; |
---|
| 310 | } |
---|
| 311 | if (!inMulti) |
---|
| 312 | wordPos++; |
---|
| 313 | if (!matching) |
---|
| 314 | break; |
---|
| 315 | } |
---|
| 316 | } |
---|
| 317 | if (end && wordPos != word.length()) |
---|
| 318 | matching = false; |
---|
| 319 | return matching; |
---|
| 320 | } |
---|
| 321 | |
---|
| 322 | public String getReplaceExp() { |
---|
| 323 | return replace; |
---|
| 324 | } |
---|
| 325 | |
---|
| 326 | public int getTakeOut() { |
---|
| 327 | return takeOut; |
---|
| 328 | } |
---|
| 329 | |
---|
| 330 | public boolean startsWithExp() { |
---|
| 331 | return start; |
---|
| 332 | } |
---|
| 333 | |
---|
| 334 | public int lengthOfMatch() { |
---|
| 335 | return matchLength; |
---|
| 336 | } |
---|
| 337 | |
---|
| 338 | // Just for debugging purposes. |
---|
| 339 | public String toString() { |
---|
| 340 | return "Match:" + String.valueOf(match) + " Replace:" + replace + " TakeOut:" + takeOut + " MatchLength:" + matchLength + " Start:" + start + " End:" + end; |
---|
| 341 | } |
---|
| 342 | |
---|
| 343 | } |
---|
| 344 | } |
---|