1 | package com.swabunga.spell.engine; |
---|
2 | |
---|
3 | import java.io.BufferedReader; |
---|
4 | import java.io.InputStreamReader; |
---|
5 | |
---|
6 | /** |
---|
7 | * This class is based on Levenshtein Distance algorithms, and it calculates how similar two words are. |
---|
8 | * If the words are identical, then the distance is 0. The more that the words have in common, the lower the distance value. |
---|
9 | * The distance value is based on how many operations it takes to get from one word to the other. Possible operations are |
---|
10 | * swapping characters, adding a character, deleting a character, substituting a character, and changing the case of a character. |
---|
11 | * The resulting distance is the sum of these operations weighted by their cost, which can be set in the Configuration object. |
---|
12 | * When there are multiple ways to convert one word into the other, the lowest cost distance is returned. |
---|
13 | * <br/> |
---|
14 | * Another way to think about this: what are the cheapest operations that would have to be done on the "original" word to end up |
---|
15 | * with the "similar" word? Each operation has a cost, and these are added up to get the distance. |
---|
16 | * <br/> |
---|
17 | * |
---|
18 | * @see com.swabunga.spell.engine.Configuration#COST_REMOVE_CHAR |
---|
19 | * @see com.swabunga.spell.engine.Configuration#COST_INSERT_CHAR |
---|
20 | * @see com.swabunga.spell.engine.Configuration#COST_SUBST_CHARS |
---|
21 | * @see com.swabunga.spell.engine.Configuration#COST_SWAP_CHARS |
---|
22 | * |
---|
23 | */ |
---|
24 | |
---|
25 | public class EditDistance { |
---|
26 | |
---|
27 | /** |
---|
28 | * JMH Again, there is no need to have a global class matrix variable |
---|
29 | * in this class. I have removed it and made the getDistance static final |
---|
30 | * |
---|
31 | * DMV: I refactored this method to make it more efficient, more readable, and simpler. |
---|
32 | * I also fixed a bug with how the distance was being calculated. You could get wrong |
---|
33 | * distances if you compared ("abc" to "ab") depending on what you had setup your |
---|
34 | * COST_REMOVE_CHAR and EDIT_INSERTION_COST values to - that is now fixed. |
---|
35 | * |
---|
36 | * WRS: I added a distance for case comparison, so a misspelling of "i" would be closer to "I" than |
---|
37 | * to "a". |
---|
38 | */ |
---|
39 | |
---|
40 | public static Configuration config = Configuration.getConfiguration(); |
---|
41 | |
---|
42 | public static final int getDistance(String word, String similar) { |
---|
43 | |
---|
44 | //get the weights for each possible operation |
---|
45 | final int costOfDeletingSourceCharacter = config.getInteger(Configuration.COST_REMOVE_CHAR); |
---|
46 | final int costOfInsertingSourceCharacter = config.getInteger(Configuration.COST_INSERT_CHAR); |
---|
47 | final int costOfSubstitutingLetters = config.getInteger(Configuration.COST_SUBST_CHARS); |
---|
48 | final int costOfSwappingLetters = config.getInteger(Configuration.COST_SWAP_CHARS); |
---|
49 | final int costOfChangingCase = config.getInteger(Configuration.COST_CHANGE_CASE); |
---|
50 | |
---|
51 | int a_size = word.length() + 1; |
---|
52 | int b_size = similar.length() + 1; |
---|
53 | int[][] matrix = new int[a_size][b_size]; |
---|
54 | matrix[0][0] = 0; |
---|
55 | |
---|
56 | for (int i = 1; i != a_size; ++i) |
---|
57 | matrix[i][0] = matrix[i - 1][0] + costOfInsertingSourceCharacter; //initialize the first column |
---|
58 | |
---|
59 | for (int j = 1; j != b_size; ++j) |
---|
60 | matrix[0][j] = matrix[0][j - 1] + costOfDeletingSourceCharacter; //initalize the first row |
---|
61 | |
---|
62 | word = " " + word; |
---|
63 | similar = " " + similar; |
---|
64 | |
---|
65 | for (int i = 1; i != a_size; ++i) { |
---|
66 | char sourceChar = word.charAt(i); |
---|
67 | for (int j = 1; j != b_size; ++j) { |
---|
68 | |
---|
69 | char otherChar = similar.charAt(j); |
---|
70 | if (sourceChar == otherChar) { |
---|
71 | matrix[i][j] = matrix[i - 1][j - 1]; //no change required, so just carry the current cost up |
---|
72 | continue; |
---|
73 | } |
---|
74 | |
---|
75 | int costOfSubst = costOfSubstitutingLetters + matrix[i - 1][j - 1]; |
---|
76 | //if needed, add up the cost of doing a swap |
---|
77 | int costOfSwap = Integer.MAX_VALUE; |
---|
78 | boolean isSwap = (i != 1) && (j != 1) && sourceChar == similar.charAt(j - 1) && word.charAt(i - 1) == otherChar; |
---|
79 | if (isSwap) |
---|
80 | costOfSwap = costOfSwappingLetters + matrix[i - 2][j - 2]; |
---|
81 | |
---|
82 | int costOfDelete = costOfDeletingSourceCharacter + matrix[i][j - 1]; |
---|
83 | int costOfInsertion = costOfInsertingSourceCharacter + matrix[i - 1][j]; |
---|
84 | |
---|
85 | int costOfCaseChange = Integer.MAX_VALUE; |
---|
86 | String strSrcChar = "" + sourceChar; |
---|
87 | String strOtherChar = "" + otherChar; |
---|
88 | |
---|
89 | if (strSrcChar.compareToIgnoreCase(strOtherChar) == 0) |
---|
90 | costOfCaseChange = costOfChangingCase + matrix[i - 1][j - 1]; |
---|
91 | |
---|
92 | matrix[i][j] = minimum(costOfSubst, costOfSwap, costOfDelete, costOfInsertion, costOfCaseChange); |
---|
93 | } |
---|
94 | } |
---|
95 | int cost = matrix[a_size - 1][b_size - 1]; |
---|
96 | |
---|
97 | if (false) |
---|
98 | System.out.println(dumpMatrix(word, similar, matrix)); |
---|
99 | |
---|
100 | return cost; |
---|
101 | } |
---|
102 | |
---|
103 | /** |
---|
104 | * For debugging, this creates a string that represents the matrix. To read the matrix, look at any square. That is the cost to get from |
---|
105 | * the partial letters along the top to the partial letters along the side. |
---|
106 | * @param src - the source string that the matrix columns are based on |
---|
107 | * @param dest - the dest string that the matrix rows are based on |
---|
108 | * @param matrix - a two dimensional array of costs (distances) |
---|
109 | * @return String |
---|
110 | */ |
---|
111 | static private String dumpMatrix(String src, String dest, int matrix[][]) { |
---|
112 | StringBuffer s = new StringBuffer(""); |
---|
113 | |
---|
114 | int cols = matrix.length; |
---|
115 | int rows = matrix[0].length; |
---|
116 | |
---|
117 | for (int i = 0; i < cols + 1; i++) { |
---|
118 | for (int j = 0; j < rows + 1; j++) { |
---|
119 | if (i == 0 && j == 0) { |
---|
120 | s.append("\n "); |
---|
121 | continue; |
---|
122 | |
---|
123 | } |
---|
124 | if (i == 0) { |
---|
125 | s.append("| "); |
---|
126 | s.append(dest.charAt(j - 1)); |
---|
127 | continue; |
---|
128 | } |
---|
129 | if (j == 0) { |
---|
130 | s.append(src.charAt(i - 1)); |
---|
131 | continue; |
---|
132 | } |
---|
133 | String num = Integer.toString(matrix[i - 1][j - 1]); |
---|
134 | int padding = 4 - num.length(); |
---|
135 | s.append("|"); |
---|
136 | for (int k = 0; k < padding; k++) |
---|
137 | s.append(' '); |
---|
138 | s.append(num); |
---|
139 | } |
---|
140 | s.append('\n'); |
---|
141 | } |
---|
142 | return s.toString(); |
---|
143 | |
---|
144 | } |
---|
145 | |
---|
146 | |
---|
147 | static private int minimum(int a, int b, int c, int d, int e) { |
---|
148 | int mi = a; |
---|
149 | if (b < mi) |
---|
150 | mi = b; |
---|
151 | if (c < mi) |
---|
152 | mi = c; |
---|
153 | if (d < mi) |
---|
154 | mi = d; |
---|
155 | if (e < mi) |
---|
156 | mi = e; |
---|
157 | |
---|
158 | return mi; |
---|
159 | } |
---|
160 | |
---|
161 | |
---|
162 | public static void main(String[] args) throws Exception { |
---|
163 | BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in)); |
---|
164 | |
---|
165 | while (true) { |
---|
166 | |
---|
167 | String input1 = stdin.readLine(); |
---|
168 | if (input1 == null || input1.length() == 0) |
---|
169 | break; |
---|
170 | |
---|
171 | String input2 = stdin.readLine(); |
---|
172 | if (input2 == null || input2.length() == 0) |
---|
173 | break; |
---|
174 | |
---|
175 | System.out.println(EditDistance.getDistance(input1, input2)); |
---|
176 | } |
---|
177 | System.out.println("done"); |
---|
178 | } |
---|
179 | } |
---|
180 | |
---|
181 | |
---|