source: branches/2.2/jabberit_messenger/java_source/src/com/swabunga/spell/engine/GenericTransformator.java @ 3102

Revision 3102, 10.5 KB checked in by amuller, 14 years ago (diff)

Ticket #986 - Efetuado merge para o Branch 2.2( atualizacao do modulo)

  • Property svn:executable set to *
Line 
1package com.swabunga.spell.engine;
2
3import com.swabunga.util.StringUtility;
4
5import java.io.*;
6import java.util.HashMap;
7import java.util.Vector;
8
/**
 * A generic implementation of a transformator. It takes an aspell phonetics file and constructs
 * a transformation table using the inner class TransformationRule.
 *
 * @author Robert Gustavsson (robert@lindesign.se)
 */
15public class GenericTransformator implements Transformator {
16
17
18  /**
19   * This replace list is used if no phonetic file is supplied or it doesn't
20   * contain the alphabet.
21   */
22  private static final char[] defaultEnglishAlphabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'};
23
24
25  public static final char ALPHABET_START = '[';
26  public static final char ALPHABET_END = ']';
27  public static final String KEYWORD_ALPHBET = "alphabet";
28  public static final String[] IGNORED_KEYWORDS = {"version", "followup", "collapse_result"};
29
30  public static final char STARTMULTI = '(';
31  public static final char ENDMULTI = ')';
32  public static final String DIGITCODE = "0";
33  public static final String REPLACEVOID = "_";
34
35  private Object[] ruleArray = null;
36  private char[] alphabetString = defaultEnglishAlphabet;
37
/**
 * Builds a transformator from an aspell phonetic file, read with the
 * platform default character encoding.
 *
 * The file is closed before this constructor returns, whether or not
 * parsing succeeded.
 *
 * @param phonetic the phonetic file to read the rules from.
 * @throws IOException if the file cannot be opened or read.
 */
public GenericTransformator(File phonetic) throws IOException {
  BufferedReader in = new BufferedReader(new FileReader(phonetic));
  try {
    buildRules(in);
  } finally {
    // The reader was opened here, so it must be released here as well.
    in.close();
  }
  alphabetString = washAlphabetIntoReplaceList(getReplaceList());
}
43
/**
 * Builds a transformator from an aspell phonetic file, decoding it with the
 * given character encoding.
 *
 * The file is closed before this constructor returns, also when the
 * encoding name is not supported or parsing fails.
 *
 * @param phonetic the phonetic file to read the rules from.
 * @param encoding name of the character encoding of the file.
 * @throws IOException if the file cannot be opened or read, or if the
 *         encoding is not supported.
 */
public GenericTransformator(File phonetic, String encoding) throws IOException {
  // Open the stream on its own so that it can be closed even if creating
  // the InputStreamReader fails because of an unsupported encoding.
  InputStream stream = new FileInputStream(phonetic);
  try {
    buildRules(new BufferedReader(new InputStreamReader(stream, encoding)));
  } finally {
    stream.close();
  }
  alphabetString = washAlphabetIntoReplaceList(getReplaceList());
}
48
49  public GenericTransformator(Reader phonetic) throws IOException {
50    buildRules(new BufferedReader(phonetic));
51    alphabetString = washAlphabetIntoReplaceList(getReplaceList());
52  }
53
54  /**
55   * Goes through an alphabet and makes sure that only one of those letters
56   * that are coded equally will be in the replace list.
57   * In other words, it removes any letters in the alphabet
58   * that are redundant phonetically.
59   *
60   * This is done to improve speed in the getSuggestion method.
61   *
62   * @param alphabet The complete alphabet to wash.
63   * @return The washed alphabet to be used as replace list.
64   */
65  private char[] washAlphabetIntoReplaceList(char[] alphabet) {
66
67    HashMap letters = new HashMap(alphabet.length);
68
69    for (int i = 0; i < alphabet.length; i++) {
70      String tmp = String.valueOf(alphabet[i]);
71      String code = transform(tmp);
72      if (!letters.containsKey(code)) {
73        letters.put(code, new Character(alphabet[i]));
74      }
75    }
76
77    Object[] tmpCharacters = letters.values().toArray();
78    char[] washedArray = new char[tmpCharacters.length];
79
80    for (int i = 0; i < tmpCharacters.length; i++) {
81      washedArray[i] = ((Character) tmpCharacters[i]).charValue();
82    }
83
84    return washedArray;
85  }
86
87
88  /**
89   * Takes out all single character replacements and put them in a char array.
90   * This array can later be used for adding or changing letters in getSuggestion().
91   * @return char[] An array of chars with replacements characters
92   */
93  public char[] getCodeReplaceList() {
94    char[] replacements;
95    TransformationRule rule;
96    Vector tmp = new Vector();
97
98    if (ruleArray == null)
99      return null;
100    for (int i = 0; i < ruleArray.length; i++) {
101      rule = (TransformationRule) ruleArray[i];
102      if (rule.getReplaceExp().length() == 1)
103        tmp.addElement(rule.getReplaceExp());
104    }
105    replacements = new char[tmp.size()];
106    for (int i = 0; i < tmp.size(); i++) {
107      replacements[i] = ((String) tmp.elementAt(i)).charAt(0);
108    }
109    return replacements;
110  }
111
112  /**
113   * Builds up an char array with the chars in the alphabet of the language as it was read from the
114   * alphabet tag in the phonetic file.
115   * @return char[] An array of chars representing the alphabet or null if no alphabet was available.
116   */
117  public char[] getReplaceList() {
118    return alphabetString;
119  }
120
121  /**
122   * Returns the phonetic code of the word.
123   */
124  public String transform(String word) {
125
126    if (ruleArray == null)
127      return null;
128
129    TransformationRule rule;
130    StringBuffer str = new StringBuffer(word.toUpperCase());
131    int strLength = str.length();
132    int startPos = 0, add = 1;
133
134    while (startPos < strLength) {
135
136      add = 1;
137      if (Character.isDigit(str.charAt(startPos))) {
138        StringUtility.replace(str, startPos, startPos + DIGITCODE.length(), DIGITCODE);
139        startPos += add;
140        continue;
141      }
142
143      for (int i = 0; i < ruleArray.length; i++) {
144        //System.out.println("Testing rule#:"+i);
145        rule = (TransformationRule) ruleArray[i];
146        if (rule.startsWithExp() && startPos > 0)
147          continue;
148        if (startPos + rule.lengthOfMatch() > strLength) {
149          continue;
150        }
151        if (rule.isMatching(str, startPos)) {
152          String replaceExp = rule.getReplaceExp();
153
154          add = replaceExp.length();
155          StringUtility.replace(str, startPos, startPos + rule.getTakeOut(), replaceExp);
156          strLength -= rule.getTakeOut();
157          strLength += add;
158          //System.out.println("Replacing with rule#:"+i+" add="+add);
159          break;
160        }
161      }
162      startPos += add;
163    }
164    //System.out.println(word);
165    //System.out.println(str.toString());
166    return str.toString();
167  }
168
169  // Used to build up the transformastion table.
170  private void buildRules(BufferedReader in) throws IOException {
171    String read = null;
172    Vector ruleList = new Vector();
173    while ((read = in.readLine()) != null) {
174      buildRule(realTrimmer(read), ruleList);
175    }
176    ruleArray = new TransformationRule[ruleList.size()];
177    ruleList.copyInto(ruleArray);
178  }
179
180  // Here is where the real work of reading the phonetics file is done.
181  private void buildRule(String str, Vector ruleList) {
182    if (str.length() < 1)
183      return;
184    for (int i = 0; i < IGNORED_KEYWORDS.length; i++) {
185      if (str.startsWith(IGNORED_KEYWORDS[i]))
186        return;
187    }
188
189    // A different alphabet is used for this language, will be read into
190    // the alphabetString variable.
191    if (str.startsWith(KEYWORD_ALPHBET)) {
192      int start = str.indexOf(ALPHABET_START);
193      int end = str.lastIndexOf(ALPHABET_END);
194      if (end != -1 && start != -1) {
195        alphabetString = str.substring(++start, end).toCharArray();
196      }
197      return;
198    }
199
200    TransformationRule rule = null;
201    StringBuffer matchExp = new StringBuffer();
202    StringBuffer replaceExp = new StringBuffer();
203    boolean start = false,
204        end = false;
205    int takeOutPart = 0,
206        matchLength = 0;
207    boolean match = true,
208        inMulti = false;
209    for (int i = 0; i < str.length(); i++) {
210      if (Character.isWhitespace(str.charAt(i))) {
211        match = false;
212      } else {
213        if (match) {
214          if (!isReservedChar(str.charAt(i))) {
215            matchExp.append(str.charAt(i));
216            if (!inMulti) {
217              takeOutPart++;
218              matchLength++;
219            }
220            if (str.charAt(i) == STARTMULTI || str.charAt(i) == ENDMULTI)
221              inMulti = !inMulti;
222          }
223          if (str.charAt(i) == '-')
224            takeOutPart--;
225          if (str.charAt(i) == '^')
226            start = true;
227          if (str.charAt(i) == '$')
228            end = true;
229        } else {
230          replaceExp.append(str.charAt(i));
231        }
232      }
233    }
234    if (replaceExp.toString().equals(REPLACEVOID)) {
235      replaceExp = new StringBuffer("");
236      //System.out.println("Changing _ to \"\" for "+matchExp.toString());
237    }
238    rule = new TransformationRule(matchExp.toString(), replaceExp.toString(), takeOutPart, matchLength, start, end);
239    //System.out.println(rule.toString());
240    ruleList.addElement(rule);
241  }
242
243  // Chars with special meaning to aspell. Not everyone is implemented here.
244  private boolean isReservedChar(char ch) {
245    if (ch == '<' || ch == '>' || ch == '^' || ch == '$' || ch == '-' || Character.isDigit(ch))
246      return true;
247    return false;
248  }
249
250  // Trims off everything we don't care about.
251  private String realTrimmer(String row) {
252    int pos = row.indexOf('#');
253    if (pos != -1) {
254      row = row.substring(0, pos);
255    }
256    return row.trim();
257  }
258
259  // Inner Classes
260  /*
261  * Holds the match string and the replace string and all the rule attributes.
262  * Is responsible for indicating matches.
263  */
264  private class TransformationRule {
265
266    private String replace;
267    private char[] match;
268    // takeOut=number of chars to replace;
269    // matchLength=length of matching string counting multies as one.
270    private int takeOut, matchLength;
271    private boolean start, end;
272
273    // Construktor
274    public TransformationRule(String match, String replace, int takeout, int matchLength, boolean start, boolean end) {
275      this.match = match.toCharArray();
276      this.replace = replace;
277      this.takeOut = takeout;
278      this.matchLength = matchLength;
279      this.start = start;
280      this.end = end;
281    }
282
283    /*
284    * Returns true if word from pos and forward matches the match string.
285    * Precondition: wordPos+matchLength<word.length()
286    */
287    public boolean isMatching(StringBuffer word, int wordPos) {
288      boolean matching = true, inMulti = false, multiMatch = false;
289      char matchCh;
290
291      for (int matchPos = 0; matchPos < match.length; matchPos++) {
292        matchCh = match[matchPos];
293        if (matchCh == STARTMULTI || matchCh == ENDMULTI) {
294          inMulti = !inMulti;
295          if (!inMulti)
296            matching = matching & multiMatch;
297          else
298            multiMatch = false;
299        } else {
300          if (matchCh != word.charAt(wordPos)) {
301            if (inMulti)
302              multiMatch = multiMatch | false;
303            else
304              matching = false;
305          } else {
306            if (inMulti)
307              multiMatch = multiMatch | true;
308            else
309              matching = true;
310          }
311          if (!inMulti)
312            wordPos++;
313          if (!matching)
314            break;
315        }
316      }
317      if (end && wordPos != word.length())
318        matching = false;
319      return matching;
320    }
321
322    public String getReplaceExp() {
323      return replace;
324    }
325
326    public int getTakeOut() {
327      return takeOut;
328    }
329
330    public boolean startsWithExp() {
331      return start;
332    }
333
334    public int lengthOfMatch() {
335      return matchLength;
336    }
337
338    // Just for debugging purposes.
339    public String toString() {
340      return "Match:" + String.valueOf(match) + " Replace:" + replace + " TakeOut:" + takeOut + " MatchLength:" + matchLength + " Start:" + start + " End:" + end;
341    }
342
343  }
344}
Note: See TracBrowser for help on using the repository browser.