package com.swabunga.spell.engine; import java.io.BufferedReader; import java.io.InputStreamReader; /** * This class is based on Levenshtein Distance algorithms, and it calculates how similar two words are. * If the words are identitical, then the distance is 0. The more that the words have in common, the lower the distance value. * The distance value is based on how many operations it takes to get from one word to the other. Possible operations are * swapping characters, adding a character, deleting a character, and substituting a character. * The resulting distance is the sum of these operations weighted by their cost, which can be set in the Configuration object. * When there are multiple ways to convert one word into the other, the lowest cost distance is returned. *
* Another way to think about this: what are the cheapest operations that would have to be done on the "original" word to end up * with the "similar" word? Each operation has a cost, and these are added up to get the distance. *
* * @see com.swabunga.spell.engine.Configuration#COST_REMOVE_CHAR * @see com.swabunga.spell.engine.Configuration#COST_INSERT_CHAR * @see com.swabunga.spell.engine.Configuration#COST_SUBST_CHARS * @see com.swabunga.spell.engine.Configuration#COST_SWAP_CHARS * */ public class EditDistance { /** * JMH Again, there is no need to have a global class matrix variable * in this class. I have removed it and made the getDistance static final * * DMV: I refactored this method to make it more efficient, more readable, and simpler. * I also fixed a bug with how the distance was being calculated. You could get wrong * distances if you compared ("abc" to "ab") depending on what you had setup your * COST_REMOVE_CHAR and EDIT_INSERTION_COST values to - that is now fixed. * * WRS: I added a distance for case comparison, so a misspelling of "i" would be closer to "I" than * to "a". */ public static Configuration config = Configuration.getConfiguration(); public static final int getDistance(String word, String similar) { //get the weights for each possible operation final int costOfDeletingSourceCharacter = config.getInteger(Configuration.COST_REMOVE_CHAR); final int costOfInsertingSourceCharacter = config.getInteger(Configuration.COST_INSERT_CHAR); final int costOfSubstitutingLetters = config.getInteger(Configuration.COST_SUBST_CHARS); final int costOfSwappingLetters = config.getInteger(Configuration.COST_SWAP_CHARS); final int costOfChangingCase = config.getInteger(Configuration.COST_CHANGE_CASE); int a_size = word.length() + 1; int b_size = similar.length() + 1; int[][] matrix = new int[a_size][b_size]; matrix[0][0] = 0; for (int i = 1; i != a_size; ++i) matrix[i][0] = matrix[i - 1][0] + costOfInsertingSourceCharacter; //initialize the first column for (int j = 1; j != b_size; ++j) matrix[0][j] = matrix[0][j - 1] + costOfDeletingSourceCharacter; //initalize the first row word = " " + word; similar = " " + similar; for (int i = 1; i != a_size; ++i) { char sourceChar = word.charAt(i); for (int j = 1; j != b_size; ++j) { char otherChar = similar.charAt(j); if (sourceChar == otherChar) { matrix[i][j] = matrix[i - 1][j - 1]; //no change required, so just carry the current cost up continue; } int costOfSubst = costOfSubstitutingLetters + matrix[i - 1][j - 1]; //if needed, add up the cost of doing a swap int costOfSwap = Integer.MAX_VALUE; boolean isSwap = (i != 1) && (j != 1) && sourceChar == similar.charAt(j - 1) && word.charAt(i - 1) == otherChar; if (isSwap) costOfSwap = costOfSwappingLetters + matrix[i - 2][j - 2]; int costOfDelete = costOfDeletingSourceCharacter + matrix[i][j - 1]; int costOfInsertion = costOfInsertingSourceCharacter + matrix[i - 1][j]; int costOfCaseChange = Integer.MAX_VALUE; String strSrcChar = "" + sourceChar; String strOtherChar = "" + otherChar; if (strSrcChar.compareToIgnoreCase(strOtherChar) == 0) costOfCaseChange = costOfChangingCase + matrix[i - 1][j - 1]; matrix[i][j] = minimum(costOfSubst, costOfSwap, costOfDelete, costOfInsertion, costOfCaseChange); } } int cost = matrix[a_size - 1][b_size - 1]; if (false) System.out.println(dumpMatrix(word, similar, matrix)); return cost; } /** * For debugging, this creates a string that represents the matrix. To read the matrix, look at any square. That is the cost to get from * the partial letters along the top to the partial letters along the side. * @param src - the source string that the matrix columns are based on * @param dest - the dest string that the matrix rows are based on * @param matrix - a two dimensional array of costs (distances) * @return String */ static private String dumpMatrix(String src, String dest, int matrix[][]) { StringBuffer s = new StringBuffer(""); int cols = matrix.length; int rows = matrix[0].length; for (int i = 0; i < cols + 1; i++) { for (int j = 0; j < rows + 1; j++) { if (i == 0 && j == 0) { s.append("\n "); continue; } if (i == 0) { s.append("| "); s.append(dest.charAt(j - 1)); continue; } if (j == 0) { s.append(src.charAt(i - 1)); continue; } String num = Integer.toString(matrix[i - 1][j - 1]); int padding = 4 - num.length(); s.append("|"); for (int k = 0; k < padding; k++) s.append(' '); s.append(num); } s.append('\n'); } return s.toString(); } static private int minimum(int a, int b, int c, int d, int e) { int mi = a; if (b < mi) mi = b; if (c < mi) mi = c; if (d < mi) mi = d; if (e < mi) mi = e; return mi; } public static void main(String[] args) throws Exception { BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in)); while (true) { String input1 = stdin.readLine(); if (input1 == null || input1.length() == 0) break; String input2 = stdin.readLine(); if (input2 == null || input2.length() == 0) break; System.out.println(EditDistance.getDistance(input1, input2)); } System.out.println("done"); } }