[3952] | 1 | package com.swabunga.spell.engine; |
---|
| 2 | |
---|
| 3 | import java.io.BufferedReader; |
---|
| 4 | import java.io.InputStreamReader; |
---|
| 5 | |
---|
| 6 | /** |
---|
| 7 | * This class is based on Levenshtein Distance algorithms, and it calculates how similar two words are. |
---|
| 8 | * If the words are identitical, then the distance is 0. The more that the words have in common, the lower the distance value. |
---|
| 9 | * The distance value is based on how many operations it takes to get from one word to the other. Possible operations are |
---|
| 10 | * swapping characters, adding a character, deleting a character, and substituting a character. |
---|
| 11 | * The resulting distance is the sum of these operations weighted by their cost, which can be set in the Configuration object. |
---|
| 12 | * When there are multiple ways to convert one word into the other, the lowest cost distance is returned. |
---|
| 13 | * <br/> |
---|
| 14 | * Another way to think about this: what are the cheapest operations that would have to be done on the "original" word to end up |
---|
| 15 | * with the "similar" word? Each operation has a cost, and these are added up to get the distance. |
---|
| 16 | * <br/> |
---|
| 17 | * |
---|
| 18 | * @see com.swabunga.spell.engine.Configuration#COST_REMOVE_CHAR |
---|
| 19 | * @see com.swabunga.spell.engine.Configuration#COST_INSERT_CHAR |
---|
| 20 | * @see com.swabunga.spell.engine.Configuration#COST_SUBST_CHARS |
---|
| 21 | * @see com.swabunga.spell.engine.Configuration#COST_SWAP_CHARS |
---|
| 22 | * |
---|
| 23 | */ |
---|
| 24 | |
---|
| 25 | public class EditDistance { |
---|
| 26 | |
---|
| 27 | /** |
---|
| 28 | * JMH Again, there is no need to have a global class matrix variable |
---|
| 29 | * in this class. I have removed it and made the getDistance static final |
---|
| 30 | * |
---|
| 31 | * DMV: I refactored this method to make it more efficient, more readable, and simpler. |
---|
| 32 | * I also fixed a bug with how the distance was being calculated. You could get wrong |
---|
| 33 | * distances if you compared ("abc" to "ab") depending on what you had setup your |
---|
| 34 | * COST_REMOVE_CHAR and EDIT_INSERTION_COST values to - that is now fixed. |
---|
| 35 | * |
---|
| 36 | * WRS: I added a distance for case comparison, so a misspelling of "i" would be closer to "I" than |
---|
| 37 | * to "a". |
---|
| 38 | */ |
---|
| 39 | |
---|
| 40 | public static Configuration config = Configuration.getConfiguration(); |
---|
| 41 | |
---|
| 42 | public static final int getDistance(String word, String similar) { |
---|
| 43 | |
---|
| 44 | //get the weights for each possible operation |
---|
| 45 | final int costOfDeletingSourceCharacter = config.getInteger(Configuration.COST_REMOVE_CHAR); |
---|
| 46 | final int costOfInsertingSourceCharacter = config.getInteger(Configuration.COST_INSERT_CHAR); |
---|
| 47 | final int costOfSubstitutingLetters = config.getInteger(Configuration.COST_SUBST_CHARS); |
---|
| 48 | final int costOfSwappingLetters = config.getInteger(Configuration.COST_SWAP_CHARS); |
---|
| 49 | final int costOfChangingCase = config.getInteger(Configuration.COST_CHANGE_CASE); |
---|
| 50 | |
---|
| 51 | int a_size = word.length() + 1; |
---|
| 52 | int b_size = similar.length() + 1; |
---|
| 53 | int[][] matrix = new int[a_size][b_size]; |
---|
| 54 | matrix[0][0] = 0; |
---|
| 55 | |
---|
| 56 | for (int i = 1; i != a_size; ++i) |
---|
| 57 | matrix[i][0] = matrix[i - 1][0] + costOfInsertingSourceCharacter; //initialize the first column |
---|
| 58 | |
---|
| 59 | for (int j = 1; j != b_size; ++j) |
---|
| 60 | matrix[0][j] = matrix[0][j - 1] + costOfDeletingSourceCharacter; //initalize the first row |
---|
| 61 | |
---|
| 62 | word = " " + word; |
---|
| 63 | similar = " " + similar; |
---|
| 64 | |
---|
| 65 | for (int i = 1; i != a_size; ++i) { |
---|
| 66 | char sourceChar = word.charAt(i); |
---|
| 67 | for (int j = 1; j != b_size; ++j) { |
---|
| 68 | |
---|
| 69 | char otherChar = similar.charAt(j); |
---|
| 70 | if (sourceChar == otherChar) { |
---|
| 71 | matrix[i][j] = matrix[i - 1][j - 1]; //no change required, so just carry the current cost up |
---|
| 72 | continue; |
---|
| 73 | } |
---|
| 74 | |
---|
| 75 | int costOfSubst = costOfSubstitutingLetters + matrix[i - 1][j - 1]; |
---|
| 76 | //if needed, add up the cost of doing a swap |
---|
| 77 | int costOfSwap = Integer.MAX_VALUE; |
---|
| 78 | boolean isSwap = (i != 1) && (j != 1) && sourceChar == similar.charAt(j - 1) && word.charAt(i - 1) == otherChar; |
---|
| 79 | if (isSwap) |
---|
| 80 | costOfSwap = costOfSwappingLetters + matrix[i - 2][j - 2]; |
---|
| 81 | |
---|
| 82 | int costOfDelete = costOfDeletingSourceCharacter + matrix[i][j - 1]; |
---|
| 83 | int costOfInsertion = costOfInsertingSourceCharacter + matrix[i - 1][j]; |
---|
| 84 | |
---|
| 85 | int costOfCaseChange = Integer.MAX_VALUE; |
---|
| 86 | String strSrcChar = "" + sourceChar; |
---|
| 87 | String strOtherChar = "" + otherChar; |
---|
| 88 | |
---|
| 89 | if (strSrcChar.compareToIgnoreCase(strOtherChar) == 0) |
---|
| 90 | costOfCaseChange = costOfChangingCase + matrix[i - 1][j - 1]; |
---|
| 91 | |
---|
| 92 | matrix[i][j] = minimum(costOfSubst, costOfSwap, costOfDelete, costOfInsertion, costOfCaseChange); |
---|
| 93 | } |
---|
| 94 | } |
---|
| 95 | int cost = matrix[a_size - 1][b_size - 1]; |
---|
| 96 | |
---|
| 97 | if (false) |
---|
| 98 | System.out.println(dumpMatrix(word, similar, matrix)); |
---|
| 99 | |
---|
| 100 | return cost; |
---|
| 101 | } |
---|
| 102 | |
---|
| 103 | /** |
---|
| 104 | * For debugging, this creates a string that represents the matrix. To read the matrix, look at any square. That is the cost to get from |
---|
| 105 | * the partial letters along the top to the partial letters along the side. |
---|
| 106 | * @param src - the source string that the matrix columns are based on |
---|
| 107 | * @param dest - the dest string that the matrix rows are based on |
---|
| 108 | * @param matrix - a two dimensional array of costs (distances) |
---|
| 109 | * @return String |
---|
| 110 | */ |
---|
| 111 | static private String dumpMatrix(String src, String dest, int matrix[][]) { |
---|
| 112 | StringBuffer s = new StringBuffer(""); |
---|
| 113 | |
---|
| 114 | int cols = matrix.length; |
---|
| 115 | int rows = matrix[0].length; |
---|
| 116 | |
---|
| 117 | for (int i = 0; i < cols + 1; i++) { |
---|
| 118 | for (int j = 0; j < rows + 1; j++) { |
---|
| 119 | if (i == 0 && j == 0) { |
---|
| 120 | s.append("\n "); |
---|
| 121 | continue; |
---|
| 122 | |
---|
| 123 | } |
---|
| 124 | if (i == 0) { |
---|
| 125 | s.append("| "); |
---|
| 126 | s.append(dest.charAt(j - 1)); |
---|
| 127 | continue; |
---|
| 128 | } |
---|
| 129 | if (j == 0) { |
---|
| 130 | s.append(src.charAt(i - 1)); |
---|
| 131 | continue; |
---|
| 132 | } |
---|
| 133 | String num = Integer.toString(matrix[i - 1][j - 1]); |
---|
| 134 | int padding = 4 - num.length(); |
---|
| 135 | s.append("|"); |
---|
| 136 | for (int k = 0; k < padding; k++) |
---|
| 137 | s.append(' '); |
---|
| 138 | s.append(num); |
---|
| 139 | } |
---|
| 140 | s.append('\n'); |
---|
| 141 | } |
---|
| 142 | return s.toString(); |
---|
| 143 | |
---|
| 144 | } |
---|
| 145 | |
---|
| 146 | |
---|
| 147 | static private int minimum(int a, int b, int c, int d, int e) { |
---|
| 148 | int mi = a; |
---|
| 149 | if (b < mi) |
---|
| 150 | mi = b; |
---|
| 151 | if (c < mi) |
---|
| 152 | mi = c; |
---|
| 153 | if (d < mi) |
---|
| 154 | mi = d; |
---|
| 155 | if (e < mi) |
---|
| 156 | mi = e; |
---|
| 157 | |
---|
| 158 | return mi; |
---|
| 159 | } |
---|
| 160 | |
---|
| 161 | |
---|
| 162 | public static void main(String[] args) throws Exception { |
---|
| 163 | BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in)); |
---|
| 164 | |
---|
| 165 | while (true) { |
---|
| 166 | |
---|
| 167 | String input1 = stdin.readLine(); |
---|
| 168 | if (input1 == null || input1.length() == 0) |
---|
| 169 | break; |
---|
| 170 | |
---|
| 171 | String input2 = stdin.readLine(); |
---|
| 172 | if (input2 == null || input2.length() == 0) |
---|
| 173 | break; |
---|
| 174 | |
---|
| 175 | System.out.println(EditDistance.getDistance(input1, input2)); |
---|
| 176 | } |
---|
| 177 | System.out.println("done"); |
---|
| 178 | } |
---|
| 179 | } |
---|
| 180 | |
---|
| 181 | |
---|