source: 3thparty/jmessenger/src/com/swabunga/spell/engine/SpellDictionaryASpell.java @ 3952

Revision 3952, 6.8 KB checked in by alexandrecorreia, 13 years ago (diff)

Ticket #1710 - Adicao do codigo fonte java do componente jmessenger(jabberit_messenger)

  • Property svn:executable set to *
Line 
1/* Created by bgalbs on Jan 30, 2003 at 11:45:25 PM */
2package com.swabunga.spell.engine;
3
4import java.io.File;
5import java.io.IOException;
6import java.io.Reader;
7import java.security.InvalidParameterException;
8import java.util.*;
9
10/**
11 * Container for various methods that any <code>SpellDictionary</code> will use.
12 * Based on the original Jazzy <a href="http://aspell.net/">aspell</a> port.
13 * <p/>
14 *
15 *
16 */
17public abstract class SpellDictionaryASpell implements SpellDictionary {
18
19
20  /** The reference to a Transformator, used to transform a word into it's phonetic code. */
21  protected Transformator tf;
22
23  public SpellDictionaryASpell(File phonetic) throws IOException {
24    if (phonetic == null)
25      tf = new DoubleMeta();
26    else
27      tf = new GenericTransformator(phonetic);
28  }
29
30  public SpellDictionaryASpell(File phonetic, String encoding) throws IOException {
31    if (phonetic == null)
32      tf = new DoubleMeta();
33    else
34      tf = new GenericTransformator(phonetic, encoding);
35  }
36
37  public SpellDictionaryASpell(Reader phonetic) throws IOException {
38    if (phonetic == null)
39      tf = new DoubleMeta();
40    else
41      tf = new GenericTransformator(phonetic);
42  }
43
44
45  /**
46   * Returns a list of Word objects that are the suggestions to an
47   * incorrect word.
48   * <p>
49   * @param word Suggestions for given mispelt word
50   * @param threshold The lower boundary of similarity to mispelt word
51   * @return Vector a List of suggestions
52   */
53  public List getSuggestions(String word, int threshold) {
54
55    Hashtable nearmisscodes = new Hashtable();
56    String code = getCode(word);
57
58    // add all words that have the same phonetics
59    nearmisscodes.put(code, code);
60    Vector phoneticList = getWordsFromCode(word, nearmisscodes);
61
62    // do some tranformations to pick up more results
63    //interchange
64    nearmisscodes = new Hashtable();
65    char[] charArray = word.toCharArray();
66    for (int i = 0; i < word.length() - 1; i++) {
67      char a = charArray[i];
68      char b = charArray[i + 1];
69      charArray[i] = b;
70      charArray[i + 1] = a;
71      String s = getCode(new String(charArray));
72      nearmisscodes.put(s, s);
73      charArray[i] = a;
74      charArray[i + 1] = b;
75    }
76
77    char[] replacelist = tf.getReplaceList();
78
79    //change
80    charArray = word.toCharArray();
81    for (int i = 0; i < word.length(); i++) {
82      char original = charArray[i];
83      for (int j = 0; j < replacelist.length; j++) {
84        charArray[i] = replacelist[j];
85        String s = getCode(new String(charArray));
86        nearmisscodes.put(s, s);
87      }
88      charArray[i] = original;
89    }
90
91    //add
92    charArray = (word += " ").toCharArray();
93    int iy = charArray.length - 1;
94    while (true) {
95      for (int j = 0; j < replacelist.length; j++) {
96        charArray[iy] = replacelist[j];
97        String s = getCode(new String(charArray));
98        nearmisscodes.put(s, s);
99      }
100      if (iy == 0)
101        break;
102      charArray[iy] = charArray[iy - 1];
103      --iy;
104    }
105
106    //delete
107    word = word.trim();
108    charArray = word.toCharArray();
109    char[] charArray2 = new char[charArray.length - 1];
110    for (int ix = 0; ix < charArray2.length; ix++) {
111      charArray2[ix] = charArray[ix];
112    }
113    char a, b;
114    a = charArray[charArray.length - 1];
115    int ii = charArray2.length;
116    while (true) {
117      String s = getCode(new String(charArray));
118      nearmisscodes.put(s, s);
119      if (ii == 0)
120        break;
121      b = a;
122      a = charArray2[ii - 1];
123      charArray2[ii - 1] = b;
124      --ii;
125    }
126
127    nearmisscodes.remove(code); //already accounted for in phoneticList
128
129    Vector wordlist = getWordsFromCode(word, nearmisscodes);
130
131    if (wordlist.size() == 0 && phoneticList.size() == 0)
132      addBestGuess(word, phoneticList);
133
134
135    // We sort a Vector at the end instead of maintaining a
136    // continously sorted TreeSet because everytime you add a collection
137    // to a treeset it has to be resorted. It's better to do this operation
138    // once at the end.
139
140    Collections.sort(phoneticList, new Word()); //always sort phonetic matches along the top
141    Collections.sort(wordlist, new Word()); //the non-phonetic matches can be listed below
142
143    phoneticList.addAll(wordlist);
144    return phoneticList;
145  }
146
147  /**
148   * When we don't come up with any suggestions (probably because the threshold was too strict),
149   * then pick the best guesses from the those words that have the same phonetic code.
150   * @param word - the word we are trying spell correct
151   * @param wordList - the linked list that will get the best guess
152   */
153  private void addBestGuess(String word, Vector wordList) {
154    if (wordList.size() != 0)
155      throw new InvalidParameterException("the wordList vector must be empty");
156
157    int bestScore = Integer.MAX_VALUE;
158
159    String code = getCode(word);
160    List simwordlist = getWords(code);
161
162    LinkedList candidates = new LinkedList();
163
164    for (Iterator j = simwordlist.iterator(); j.hasNext();) {
165      String similar = (String) j.next();
166      int distance = EditDistance.getDistance(word, similar);
167      if (distance <= bestScore) {
168        bestScore = distance;
169        Word goodGuess = new Word(similar, distance);
170        candidates.add(goodGuess);
171      }
172    }
173
174    //now, only pull out the guesses that had the best score
175    for (Iterator iter = candidates.iterator(); iter.hasNext();) {
176      Word candidate = (Word) iter.next();
177      if (candidate.getCost() == bestScore)
178        wordList.add(candidate);
179    }
180
181  }
182
183  private Vector getWordsFromCode(String word, Hashtable codes) {
184    Configuration config = Configuration.getConfiguration();
185    Vector result = new Vector();
186    final int configDistance = config.getInteger(Configuration.SPELL_THRESHOLD);
187
188    for (Enumeration i = codes.keys(); i.hasMoreElements();) {
189      String code = (String) i.nextElement();
190
191      List simwordlist = getWords(code);
192      for (Iterator iter = simwordlist.iterator(); iter.hasNext();) {
193        String similar = (String) iter.next();
194        int distance = EditDistance.getDistance(word, similar);
195        if (distance < configDistance) {
196          Word w = new Word(similar, distance);
197          result.addElement(w);
198        }
199      }
200    }
201    return result;
202  }
203
204  /**
205   * Returns the phonetic code representing the word.
206   */
207  public String getCode(String word) {
208    return tf.transform(word);
209  }
210
211  /**
212   * Returns a list of words that have the same phonetic code.
213   */
214  protected abstract List getWords(String phoneticCode);
215
216  /**
217   * Returns true if the word is correctly spelled against the current word list.
218   */
219  public boolean isCorrect(String word) {
220    List possible = getWords(getCode(word));
221    if (possible.contains(word))
222      return true;
223    //JMH should we always try the lowercase version. If I dont then capitalised
224    //words are always returned as incorrect.
225    else if (possible.contains(word.toLowerCase()))
226      return true;
227    return false;
228  }
229}
Note: See TracBrowser for help on using the repository browser.