1 | package com.swabunga.spell.engine; |
---|
2 | |
---|
3 | import com.swabunga.util.StringUtility; |
---|
4 | |
---|
5 | import java.io.*; |
---|
6 | import java.util.HashMap; |
---|
7 | import java.util.Vector; |
---|
8 | |
---|
9 | /** |
---|
10 | * A Generic implementation of a transformator takes an aspell phonetics file and constructs |
---|
11 | * some sort of transformation table using the inner class Rule. |
---|
12 | * |
---|
13 | * @author Robert Gustavsson (robert@lindesign.se) |
---|
14 | */ |
---|
15 | public class GenericTransformator implements Transformator { |
---|
16 | |
---|
17 | |
---|
18 | /** |
---|
19 | * This replace list is used if no phonetic file is supplied or it doesn't |
---|
20 | * contain the alphabet. |
---|
21 | */ |
---|
22 | private static final char[] defaultEnglishAlphabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'}; |
---|
23 | |
---|
24 | |
---|
25 | public static final char ALPHABET_START = '['; |
---|
26 | public static final char ALPHABET_END = ']'; |
---|
27 | public static final String KEYWORD_ALPHBET = "alphabet"; |
---|
28 | public static final String[] IGNORED_KEYWORDS = {"version", "followup", "collapse_result"}; |
---|
29 | |
---|
30 | public static final char STARTMULTI = '('; |
---|
31 | public static final char ENDMULTI = ')'; |
---|
32 | public static final String DIGITCODE = "0"; |
---|
33 | public static final String REPLACEVOID = "_"; |
---|
34 | |
---|
35 | private Object[] ruleArray = null; |
---|
36 | private char[] alphabetString = defaultEnglishAlphabet; |
---|
37 | |
---|
38 | public GenericTransformator(File phonetic) throws IOException { |
---|
39 | buildRules(new BufferedReader(new FileReader(phonetic))); |
---|
40 | alphabetString = washAlphabetIntoReplaceList(getReplaceList()); |
---|
41 | |
---|
42 | } |
---|
43 | |
---|
44 | public GenericTransformator(File phonetic, String encoding) throws IOException { |
---|
45 | buildRules(new BufferedReader(new InputStreamReader(new FileInputStream(phonetic), encoding))); |
---|
46 | alphabetString = washAlphabetIntoReplaceList(getReplaceList()); |
---|
47 | } |
---|
48 | |
---|
49 | public GenericTransformator(Reader phonetic) throws IOException { |
---|
50 | buildRules(new BufferedReader(phonetic)); |
---|
51 | alphabetString = washAlphabetIntoReplaceList(getReplaceList()); |
---|
52 | } |
---|
53 | |
---|
54 | /** |
---|
55 | * Goes through an alphabet and makes sure that only one of those letters |
---|
56 | * that are coded equally will be in the replace list. |
---|
57 | * In other words, it removes any letters in the alphabet |
---|
58 | * that are redundant phonetically. |
---|
59 | * |
---|
60 | * This is done to improve speed in the getSuggestion method. |
---|
61 | * |
---|
62 | * @param alphabet The complete alphabet to wash. |
---|
63 | * @return The washed alphabet to be used as replace list. |
---|
64 | */ |
---|
65 | private char[] washAlphabetIntoReplaceList(char[] alphabet) { |
---|
66 | |
---|
67 | HashMap letters = new HashMap(alphabet.length); |
---|
68 | |
---|
69 | for (int i = 0; i < alphabet.length; i++) { |
---|
70 | String tmp = String.valueOf(alphabet[i]); |
---|
71 | String code = transform(tmp); |
---|
72 | if (!letters.containsKey(code)) { |
---|
73 | letters.put(code, new Character(alphabet[i])); |
---|
74 | } |
---|
75 | } |
---|
76 | |
---|
77 | Object[] tmpCharacters = letters.values().toArray(); |
---|
78 | char[] washedArray = new char[tmpCharacters.length]; |
---|
79 | |
---|
80 | for (int i = 0; i < tmpCharacters.length; i++) { |
---|
81 | washedArray[i] = ((Character) tmpCharacters[i]).charValue(); |
---|
82 | } |
---|
83 | |
---|
84 | return washedArray; |
---|
85 | } |
---|
86 | |
---|
87 | |
---|
88 | /** |
---|
89 | * Takes out all single character replacements and put them in a char array. |
---|
90 | * This array can later be used for adding or changing letters in getSuggestion(). |
---|
91 | * @return char[] An array of chars with replacements characters |
---|
92 | */ |
---|
93 | public char[] getCodeReplaceList() { |
---|
94 | char[] replacements; |
---|
95 | TransformationRule rule; |
---|
96 | Vector tmp = new Vector(); |
---|
97 | |
---|
98 | if (ruleArray == null) |
---|
99 | return null; |
---|
100 | for (int i = 0; i < ruleArray.length; i++) { |
---|
101 | rule = (TransformationRule) ruleArray[i]; |
---|
102 | if (rule.getReplaceExp().length() == 1) |
---|
103 | tmp.addElement(rule.getReplaceExp()); |
---|
104 | } |
---|
105 | replacements = new char[tmp.size()]; |
---|
106 | for (int i = 0; i < tmp.size(); i++) { |
---|
107 | replacements[i] = ((String) tmp.elementAt(i)).charAt(0); |
---|
108 | } |
---|
109 | return replacements; |
---|
110 | } |
---|
111 | |
---|
112 | /** |
---|
113 | * Builds up an char array with the chars in the alphabet of the language as it was read from the |
---|
114 | * alphabet tag in the phonetic file. |
---|
115 | * @return char[] An array of chars representing the alphabet or null if no alphabet was available. |
---|
116 | */ |
---|
117 | public char[] getReplaceList() { |
---|
118 | return alphabetString; |
---|
119 | } |
---|
120 | |
---|
121 | /** |
---|
122 | * Returns the phonetic code of the word. |
---|
123 | */ |
---|
124 | public String transform(String word) { |
---|
125 | |
---|
126 | if (ruleArray == null) |
---|
127 | return null; |
---|
128 | |
---|
129 | TransformationRule rule; |
---|
130 | StringBuffer str = new StringBuffer(word.toUpperCase()); |
---|
131 | int strLength = str.length(); |
---|
132 | int startPos = 0, add = 1; |
---|
133 | |
---|
134 | while (startPos < strLength) { |
---|
135 | |
---|
136 | add = 1; |
---|
137 | if (Character.isDigit(str.charAt(startPos))) { |
---|
138 | StringUtility.replace(str, startPos, startPos + DIGITCODE.length(), DIGITCODE); |
---|
139 | startPos += add; |
---|
140 | continue; |
---|
141 | } |
---|
142 | |
---|
143 | for (int i = 0; i < ruleArray.length; i++) { |
---|
144 | //System.out.println("Testing rule#:"+i); |
---|
145 | rule = (TransformationRule) ruleArray[i]; |
---|
146 | if (rule.startsWithExp() && startPos > 0) |
---|
147 | continue; |
---|
148 | if (startPos + rule.lengthOfMatch() > strLength) { |
---|
149 | continue; |
---|
150 | } |
---|
151 | if (rule.isMatching(str, startPos)) { |
---|
152 | String replaceExp = rule.getReplaceExp(); |
---|
153 | |
---|
154 | add = replaceExp.length(); |
---|
155 | StringUtility.replace(str, startPos, startPos + rule.getTakeOut(), replaceExp); |
---|
156 | strLength -= rule.getTakeOut(); |
---|
157 | strLength += add; |
---|
158 | //System.out.println("Replacing with rule#:"+i+" add="+add); |
---|
159 | break; |
---|
160 | } |
---|
161 | } |
---|
162 | startPos += add; |
---|
163 | } |
---|
164 | //System.out.println(word); |
---|
165 | //System.out.println(str.toString()); |
---|
166 | return str.toString(); |
---|
167 | } |
---|
168 | |
---|
169 | // Used to build up the transformastion table. |
---|
170 | private void buildRules(BufferedReader in) throws IOException { |
---|
171 | String read = null; |
---|
172 | Vector ruleList = new Vector(); |
---|
173 | while ((read = in.readLine()) != null) { |
---|
174 | buildRule(realTrimmer(read), ruleList); |
---|
175 | } |
---|
176 | ruleArray = new TransformationRule[ruleList.size()]; |
---|
177 | ruleList.copyInto(ruleArray); |
---|
178 | } |
---|
179 | |
---|
180 | // Here is where the real work of reading the phonetics file is done. |
---|
181 | private void buildRule(String str, Vector ruleList) { |
---|
182 | if (str.length() < 1) |
---|
183 | return; |
---|
184 | for (int i = 0; i < IGNORED_KEYWORDS.length; i++) { |
---|
185 | if (str.startsWith(IGNORED_KEYWORDS[i])) |
---|
186 | return; |
---|
187 | } |
---|
188 | |
---|
189 | // A different alphabet is used for this language, will be read into |
---|
190 | // the alphabetString variable. |
---|
191 | if (str.startsWith(KEYWORD_ALPHBET)) { |
---|
192 | int start = str.indexOf(ALPHABET_START); |
---|
193 | int end = str.lastIndexOf(ALPHABET_END); |
---|
194 | if (end != -1 && start != -1) { |
---|
195 | alphabetString = str.substring(++start, end).toCharArray(); |
---|
196 | } |
---|
197 | return; |
---|
198 | } |
---|
199 | |
---|
200 | TransformationRule rule = null; |
---|
201 | StringBuffer matchExp = new StringBuffer(); |
---|
202 | StringBuffer replaceExp = new StringBuffer(); |
---|
203 | boolean start = false, |
---|
204 | end = false; |
---|
205 | int takeOutPart = 0, |
---|
206 | matchLength = 0; |
---|
207 | boolean match = true, |
---|
208 | inMulti = false; |
---|
209 | for (int i = 0; i < str.length(); i++) { |
---|
210 | if (Character.isWhitespace(str.charAt(i))) { |
---|
211 | match = false; |
---|
212 | } else { |
---|
213 | if (match) { |
---|
214 | if (!isReservedChar(str.charAt(i))) { |
---|
215 | matchExp.append(str.charAt(i)); |
---|
216 | if (!inMulti) { |
---|
217 | takeOutPart++; |
---|
218 | matchLength++; |
---|
219 | } |
---|
220 | if (str.charAt(i) == STARTMULTI || str.charAt(i) == ENDMULTI) |
---|
221 | inMulti = !inMulti; |
---|
222 | } |
---|
223 | if (str.charAt(i) == '-') |
---|
224 | takeOutPart--; |
---|
225 | if (str.charAt(i) == '^') |
---|
226 | start = true; |
---|
227 | if (str.charAt(i) == '$') |
---|
228 | end = true; |
---|
229 | } else { |
---|
230 | replaceExp.append(str.charAt(i)); |
---|
231 | } |
---|
232 | } |
---|
233 | } |
---|
234 | if (replaceExp.toString().equals(REPLACEVOID)) { |
---|
235 | replaceExp = new StringBuffer(""); |
---|
236 | //System.out.println("Changing _ to \"\" for "+matchExp.toString()); |
---|
237 | } |
---|
238 | rule = new TransformationRule(matchExp.toString(), replaceExp.toString(), takeOutPart, matchLength, start, end); |
---|
239 | //System.out.println(rule.toString()); |
---|
240 | ruleList.addElement(rule); |
---|
241 | } |
---|
242 | |
---|
243 | // Chars with special meaning to aspell. Not everyone is implemented here. |
---|
244 | private boolean isReservedChar(char ch) { |
---|
245 | if (ch == '<' || ch == '>' || ch == '^' || ch == '$' || ch == '-' || Character.isDigit(ch)) |
---|
246 | return true; |
---|
247 | return false; |
---|
248 | } |
---|
249 | |
---|
250 | // Trims off everything we don't care about. |
---|
251 | private String realTrimmer(String row) { |
---|
252 | int pos = row.indexOf('#'); |
---|
253 | if (pos != -1) { |
---|
254 | row = row.substring(0, pos); |
---|
255 | } |
---|
256 | return row.trim(); |
---|
257 | } |
---|
258 | |
---|
259 | // Inner Classes |
---|
260 | /* |
---|
261 | * Holds the match string and the replace string and all the rule attributes. |
---|
262 | * Is responsible for indicating matches. |
---|
263 | */ |
---|
264 | private class TransformationRule { |
---|
265 | |
---|
266 | private String replace; |
---|
267 | private char[] match; |
---|
268 | // takeOut=number of chars to replace; |
---|
269 | // matchLength=length of matching string counting multies as one. |
---|
270 | private int takeOut, matchLength; |
---|
271 | private boolean start, end; |
---|
272 | |
---|
273 | // Construktor |
---|
274 | public TransformationRule(String match, String replace, int takeout, int matchLength, boolean start, boolean end) { |
---|
275 | this.match = match.toCharArray(); |
---|
276 | this.replace = replace; |
---|
277 | this.takeOut = takeout; |
---|
278 | this.matchLength = matchLength; |
---|
279 | this.start = start; |
---|
280 | this.end = end; |
---|
281 | } |
---|
282 | |
---|
283 | /* |
---|
284 | * Returns true if word from pos and forward matches the match string. |
---|
285 | * Precondition: wordPos+matchLength<word.length() |
---|
286 | */ |
---|
287 | public boolean isMatching(StringBuffer word, int wordPos) { |
---|
288 | boolean matching = true, inMulti = false, multiMatch = false; |
---|
289 | char matchCh; |
---|
290 | |
---|
291 | for (int matchPos = 0; matchPos < match.length; matchPos++) { |
---|
292 | matchCh = match[matchPos]; |
---|
293 | if (matchCh == STARTMULTI || matchCh == ENDMULTI) { |
---|
294 | inMulti = !inMulti; |
---|
295 | if (!inMulti) |
---|
296 | matching = matching & multiMatch; |
---|
297 | else |
---|
298 | multiMatch = false; |
---|
299 | } else { |
---|
300 | if (matchCh != word.charAt(wordPos)) { |
---|
301 | if (inMulti) |
---|
302 | multiMatch = multiMatch | false; |
---|
303 | else |
---|
304 | matching = false; |
---|
305 | } else { |
---|
306 | if (inMulti) |
---|
307 | multiMatch = multiMatch | true; |
---|
308 | else |
---|
309 | matching = true; |
---|
310 | } |
---|
311 | if (!inMulti) |
---|
312 | wordPos++; |
---|
313 | if (!matching) |
---|
314 | break; |
---|
315 | } |
---|
316 | } |
---|
317 | if (end && wordPos != word.length()) |
---|
318 | matching = false; |
---|
319 | return matching; |
---|
320 | } |
---|
321 | |
---|
322 | public String getReplaceExp() { |
---|
323 | return replace; |
---|
324 | } |
---|
325 | |
---|
326 | public int getTakeOut() { |
---|
327 | return takeOut; |
---|
328 | } |
---|
329 | |
---|
330 | public boolean startsWithExp() { |
---|
331 | return start; |
---|
332 | } |
---|
333 | |
---|
334 | public int lengthOfMatch() { |
---|
335 | return matchLength; |
---|
336 | } |
---|
337 | |
---|
338 | // Just for debugging purposes. |
---|
339 | public String toString() { |
---|
340 | return "Match:" + String.valueOf(match) + " Replace:" + replace + " TakeOut:" + takeOut + " MatchLength:" + matchLength + " Start:" + start + " End:" + end; |
---|
341 | } |
---|
342 | |
---|
343 | } |
---|
344 | } |
---|