source: 3thparty/jmessenger/src/com/swabunga/spell/engine/EditDistance.java @ 3952

Revision 3952, 6.4 KB checked in by alexandrecorreia, 13 years ago (diff)

Ticket #1710 - Adicao do codigo fonte java do componente jmessenger(jabberit_messenger)

  • Property svn:executable set to *
Line 
1package com.swabunga.spell.engine;
2
3import java.io.BufferedReader;
4import java.io.InputStreamReader;
5
6/**
7 * This class is based on Levenshtein Distance algorithms, and it calculates how similar two words are.
8 * If the words are identitical, then the distance is 0. The more that the words have in common, the lower the distance value.
9 * The distance value is based on how many operations it takes to get from one word to the other. Possible operations are
10 * swapping characters, adding a character, deleting a character, and substituting a character.
11 * The resulting distance is the sum of these operations weighted by their cost, which can be set in the Configuration object.
12 * When there are multiple ways to convert one word into the other, the lowest cost distance is returned.
13 * <br/>
14 * Another way to think about this: what are the cheapest operations that would have to be done on the "original" word to end up
15 * with the "similar" word? Each operation has a cost, and these are added up to get the distance.
16 * <br/>
17 *
18 * @see com.swabunga.spell.engine.Configuration#COST_REMOVE_CHAR
19 * @see com.swabunga.spell.engine.Configuration#COST_INSERT_CHAR
20 * @see com.swabunga.spell.engine.Configuration#COST_SUBST_CHARS
21 * @see com.swabunga.spell.engine.Configuration#COST_SWAP_CHARS
22 *
23 */
24
25public class EditDistance {
26
27  /**
28   * JMH Again, there is no need to have a global class matrix variable
29   *  in this class. I have removed it and made the getDistance static final
30   *
31   * DMV: I refactored this method to make it more efficient, more readable, and simpler.
32   * I also fixed a bug with how the distance was being calculated. You could get wrong
33   * distances if you compared ("abc" to "ab") depending on what you had setup your
34   * COST_REMOVE_CHAR and EDIT_INSERTION_COST values to - that is now fixed.
35   *
36   * WRS: I added a distance for case comparison, so a misspelling of "i" would be closer to "I" than
37   * to "a".
38   */
39
40  public static Configuration config = Configuration.getConfiguration();
41
42  public static final int getDistance(String word, String similar) {
43
44    //get the weights for each possible operation
45    final int costOfDeletingSourceCharacter = config.getInteger(Configuration.COST_REMOVE_CHAR);
46    final int costOfInsertingSourceCharacter = config.getInteger(Configuration.COST_INSERT_CHAR);
47    final int costOfSubstitutingLetters = config.getInteger(Configuration.COST_SUBST_CHARS);
48    final int costOfSwappingLetters = config.getInteger(Configuration.COST_SWAP_CHARS);
49    final int costOfChangingCase = config.getInteger(Configuration.COST_CHANGE_CASE);
50
51    int a_size = word.length() + 1;
52    int b_size = similar.length() + 1;
53    int[][] matrix = new int[a_size][b_size];
54    matrix[0][0] = 0;
55
56    for (int i = 1; i != a_size; ++i)
57      matrix[i][0] = matrix[i - 1][0] + costOfInsertingSourceCharacter; //initialize the first column
58
59    for (int j = 1; j != b_size; ++j)
60      matrix[0][j] = matrix[0][j - 1] + costOfDeletingSourceCharacter; //initalize the first row
61
62    word = " " + word;
63    similar = " " + similar;
64
65    for (int i = 1; i != a_size; ++i) {
66      char sourceChar = word.charAt(i);
67      for (int j = 1; j != b_size; ++j) {
68
69        char otherChar = similar.charAt(j);
70        if (sourceChar == otherChar) {
71          matrix[i][j] = matrix[i - 1][j - 1]; //no change required, so just carry the current cost up
72          continue;
73        }
74
75        int costOfSubst = costOfSubstitutingLetters + matrix[i - 1][j - 1];
76        //if needed, add up the cost of doing a swap
77        int costOfSwap = Integer.MAX_VALUE;
78        boolean isSwap = (i != 1) && (j != 1) && sourceChar == similar.charAt(j - 1) && word.charAt(i - 1) == otherChar;
79        if (isSwap)
80          costOfSwap = costOfSwappingLetters + matrix[i - 2][j - 2];
81
82        int costOfDelete = costOfDeletingSourceCharacter + matrix[i][j - 1];
83        int costOfInsertion = costOfInsertingSourceCharacter + matrix[i - 1][j];
84
85        int costOfCaseChange = Integer.MAX_VALUE;
86        String strSrcChar = "" + sourceChar;
87        String strOtherChar = "" + otherChar;
88
89        if (strSrcChar.compareToIgnoreCase(strOtherChar) == 0)
90          costOfCaseChange = costOfChangingCase + matrix[i - 1][j - 1];
91
92        matrix[i][j] = minimum(costOfSubst, costOfSwap, costOfDelete, costOfInsertion, costOfCaseChange);
93      }
94    }
95    int cost = matrix[a_size - 1][b_size - 1];
96
97    if (false)
98      System.out.println(dumpMatrix(word, similar, matrix));
99
100    return cost;
101  }
102
103  /**
104   * For debugging, this creates a string that represents the matrix. To read the matrix, look at any square. That is the cost to get from
105   * the partial letters along the top to the partial letters along the side.
106   * @param src - the source string that the matrix columns are based on
107   * @param dest - the dest string that the matrix rows are based on
108   * @param matrix - a two dimensional array of costs (distances)
109   * @return String
110   */
111  static private String dumpMatrix(String src, String dest, int matrix[][]) {
112    StringBuffer s = new StringBuffer("");
113
114    int cols = matrix.length;
115    int rows = matrix[0].length;
116
117    for (int i = 0; i < cols + 1; i++) {
118      for (int j = 0; j < rows + 1; j++) {
119        if (i == 0 && j == 0) {
120          s.append("\n ");
121          continue;
122
123        }
124        if (i == 0) {
125          s.append("|   ");
126          s.append(dest.charAt(j - 1));
127          continue;
128        }
129        if (j == 0) {
130          s.append(src.charAt(i - 1));
131          continue;
132        }
133        String num = Integer.toString(matrix[i - 1][j - 1]);
134        int padding = 4 - num.length();
135        s.append("|");
136        for (int k = 0; k < padding; k++)
137          s.append(' ');
138        s.append(num);
139      }
140      s.append('\n');
141    }
142    return s.toString();
143
144  }
145
146
147  static private int minimum(int a, int b, int c, int d, int e) {
148    int mi = a;
149    if (b < mi)
150      mi = b;
151    if (c < mi)
152      mi = c;
153    if (d < mi)
154      mi = d;
155    if (e < mi)
156      mi = e;
157
158    return mi;
159  }
160
161
162  public static void main(String[] args) throws Exception {
163    BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
164
165    while (true) {
166
167      String input1 = stdin.readLine();
168      if (input1 == null || input1.length() == 0)
169        break;
170
171      String input2 = stdin.readLine();
172      if (input2 == null || input2.length() == 0)
173        break;
174
175      System.out.println(EditDistance.getDistance(input1, input2));
176    }
177    System.out.println("done");
178  }
179}
180
181
Note: See TracBrowser for help on using the repository browser.