Context Navigation

source: trunk/jabberit_messenger/java_source/src/com/swabunga/spell/engine/GenericTransformator.java @ 1014

Revision 1014, 10.5 KB checked in by alexandrecorreia, 15 years ago (diff)
Ticket #552 - Inclusão do projeto Java referente ao applet do módulo.

Rev	Line
[1014]	1	package com.swabunga.spell.engine;
	2
	3	import com.swabunga.util.StringUtility;
	4
	5	import java.io.*;
	6	import java.util.HashMap;
	7	import java.util.Vector;
	8
	9	/**
	10	* A generic implementation of a transformator that takes an aspell phonetics file and constructs
	11	* a transformation table using the inner class TransformationRule.
	12	*
	13	* @author Robert Gustavsson (robert@lindesign.se)
	14	*/
	15	public class GenericTransformator implements Transformator {
	16
	17
	18	/**
	19	* This replace list is used if no phonetic file is supplied or it doesn't
	20	* contain the alphabet.
	21	*/
	22	private static final char[] defaultEnglishAlphabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'};
	23
	24
	25	public static final char ALPHABET_START = '[';
	26	public static final char ALPHABET_END = ']';
	27	public static final String KEYWORD_ALPHBET = "alphabet";
	28	public static final String[] IGNORED_KEYWORDS = {"version", "followup", "collapse_result"};
	29
	30	public static final char STARTMULTI = '(';
	31	public static final char ENDMULTI = ')';
	32	public static final String DIGITCODE = "0";
	33	public static final String REPLACEVOID = "_";
	34
	35	private Object[] ruleArray = null;
	36	private char[] alphabetString = defaultEnglishAlphabet;
	37
	/**
	 * Builds the transformation rules from a phonetics file, reading it with
	 * the default character encoding of {@link FileReader}, and then reduces
	 * the alphabet to the replace list.
	 *
	 * @param phonetic the phonetics file to read the rules from.
	 * @throws IOException if the file cannot be opened or read.
	 */
	public GenericTransformator(File phonetic) throws IOException {
		BufferedReader in = new BufferedReader(new FileReader(phonetic));
		try {
			buildRules(in);
		} finally {
			// The reader is opened by this constructor, so nobody else can
			// release the underlying file handle.
			in.close();
		}
		alphabetString = washAlphabetIntoReplaceList(getReplaceList());
	}
	43
	/**
	 * Builds the transformation rules from a phonetics file that is decoded
	 * with the given character encoding, and then reduces the alphabet to the
	 * replace list.
	 *
	 * @param phonetic the phonetics file to read the rules from.
	 * @param encoding name of the character encoding of the file.
	 * @throws IOException if the file cannot be opened or read, or if the
	 *         encoding name is not supported.
	 */
	public GenericTransformator(File phonetic, String encoding) throws IOException {
		// Open the stream on its own so it can be closed even when the
		// InputStreamReader constructor rejects the encoding name.
		InputStream stream = new FileInputStream(phonetic);
		try {
			buildRules(new BufferedReader(new InputStreamReader(stream, encoding)));
		} finally {
			stream.close();
		}
		alphabetString = washAlphabetIntoReplaceList(getReplaceList());
	}
	48
	49	public GenericTransformator(Reader phonetic) throws IOException {
	50	buildRules(new BufferedReader(phonetic));
	51	alphabetString = washAlphabetIntoReplaceList(getReplaceList());
	52	}
	53
	54	/**
	55	* Goes through an alphabet and makes sure that only one of those letters
	56	* that are coded equally will be in the replace list.
	57	* In other words, it removes any letters in the alphabet
	58	* that are redundant phonetically.
	59	*
	60	* This is done to improve speed in the getSuggestion method.
	61	*
	62	* @param alphabet The complete alphabet to wash.
	63	* @return The washed alphabet to be used as replace list.
	64	*/
	65	private char[] washAlphabetIntoReplaceList(char[] alphabet) {
	66
	67	HashMap letters = new HashMap(alphabet.length);
	68
	69	for (int i = 0; i < alphabet.length; i++) {
	70	String tmp = String.valueOf(alphabet[i]);
	71	String code = transform(tmp);
	72	if (!letters.containsKey(code)) {
	73	letters.put(code, new Character(alphabet[i]));
	74	}
	75	}
	76
	77	Object[] tmpCharacters = letters.values().toArray();
	78	char[] washedArray = new char[tmpCharacters.length];
	79
	80	for (int i = 0; i < tmpCharacters.length; i++) {
	81	washedArray[i] = ((Character) tmpCharacters[i]).charValue();
	82	}
	83
	84	return washedArray;
	85	}
	86
	87
	88	/**
	89	* Takes out all single character replacements and put them in a char array.
	90	* This array can later be used for adding or changing letters in getSuggestion().
	91	* @return char[] An array of chars with replacements characters
	92	*/
	93	public char[] getCodeReplaceList() {
	94	char[] replacements;
	95	TransformationRule rule;
	96	Vector tmp = new Vector();
	97
	98	if (ruleArray == null)
	99	return null;
	100	for (int i = 0; i < ruleArray.length; i++) {
	101	rule = (TransformationRule) ruleArray[i];
	102	if (rule.getReplaceExp().length() == 1)
	103	tmp.addElement(rule.getReplaceExp());
	104	}
	105	replacements = new char[tmp.size()];
	106	for (int i = 0; i < tmp.size(); i++) {
	107	replacements[i] = ((String) tmp.elementAt(i)).charAt(0);
	108	}
	109	return replacements;
	110	}
	111
	112	/**
	113	* Builds up an char array with the chars in the alphabet of the language as it was read from the
	114	* alphabet tag in the phonetic file.
	115	* @return char[] An array of chars representing the alphabet or null if no alphabet was available.
	116	*/
	117	public char[] getReplaceList() {
	118	return alphabetString;
	119	}
	120
	121	/**
	122	* Returns the phonetic code of the word.
	123	*/
	124	public String transform(String word) {
	125
	126	if (ruleArray == null)
	127	return null;
	128
	129	TransformationRule rule;
	130	StringBuffer str = new StringBuffer(word.toUpperCase());
	131	int strLength = str.length();
	132	int startPos = 0, add = 1;
	133
	134	while (startPos < strLength) {
	135
	136	add = 1;
	137	if (Character.isDigit(str.charAt(startPos))) {
	138	StringUtility.replace(str, startPos, startPos + DIGITCODE.length(), DIGITCODE);
	139	startPos += add;
	140	continue;
	141	}
	142
	143	for (int i = 0; i < ruleArray.length; i++) {
	144	//System.out.println("Testing rule#:"+i);
	145	rule = (TransformationRule) ruleArray[i];
	146	if (rule.startsWithExp() && startPos > 0)
	147	continue;
	148	if (startPos + rule.lengthOfMatch() > strLength) {
	149	continue;
	150	}
	151	if (rule.isMatching(str, startPos)) {
	152	String replaceExp = rule.getReplaceExp();
	153
	154	add = replaceExp.length();
	155	StringUtility.replace(str, startPos, startPos + rule.getTakeOut(), replaceExp);
	156	strLength -= rule.getTakeOut();
	157	strLength += add;
	158	//System.out.println("Replacing with rule#:"+i+" add="+add);
	159	break;
	160	}
	161	}
	162	startPos += add;
	163	}
	164	//System.out.println(word);
	165	//System.out.println(str.toString());
	166	return str.toString();
	167	}
	168
	169	// Used to build up the transformastion table.
	170	private void buildRules(BufferedReader in) throws IOException {
	171	String read = null;
	172	Vector ruleList = new Vector();
	173	while ((read = in.readLine()) != null) {
	174	buildRule(realTrimmer(read), ruleList);
	175	}
	176	ruleArray = new TransformationRule[ruleList.size()];
	177	ruleList.copyInto(ruleArray);
	178	}
	179
	180	// Here is where the real work of reading the phonetics file is done.
	181	private void buildRule(String str, Vector ruleList) {
	182	if (str.length() < 1)
	183	return;
	184	for (int i = 0; i < IGNORED_KEYWORDS.length; i++) {
	185	if (str.startsWith(IGNORED_KEYWORDS[i]))
	186	return;
	187	}
	188
	189	// A different alphabet is used for this language, will be read into
	190	// the alphabetString variable.
	191	if (str.startsWith(KEYWORD_ALPHBET)) {
	192	int start = str.indexOf(ALPHABET_START);
	193	int end = str.lastIndexOf(ALPHABET_END);
	194	if (end != -1 && start != -1) {
	195	alphabetString = str.substring(++start, end).toCharArray();
	196	}
	197	return;
	198	}
	199
	200	TransformationRule rule = null;
	201	StringBuffer matchExp = new StringBuffer();
	202	StringBuffer replaceExp = new StringBuffer();
	203	boolean start = false,
	204	end = false;
	205	int takeOutPart = 0,
	206	matchLength = 0;
	207	boolean match = true,
	208	inMulti = false;
	209	for (int i = 0; i < str.length(); i++) {
	210	if (Character.isWhitespace(str.charAt(i))) {
	211	match = false;
	212	} else {
	213	if (match) {
	214	if (!isReservedChar(str.charAt(i))) {
	215	matchExp.append(str.charAt(i));
	216	if (!inMulti) {
	217	takeOutPart++;
	218	matchLength++;
	219	}
	220	if (str.charAt(i) == STARTMULTI \|\| str.charAt(i) == ENDMULTI)
	221	inMulti = !inMulti;
	222	}
	223	if (str.charAt(i) == '-')
	224	takeOutPart--;
	225	if (str.charAt(i) == '^')
	226	start = true;
	227	if (str.charAt(i) == '$')
	228	end = true;
	229	} else {
	230	replaceExp.append(str.charAt(i));
	231	}
	232	}
	233	}
	234	if (replaceExp.toString().equals(REPLACEVOID)) {
	235	replaceExp = new StringBuffer("");
	236	//System.out.println("Changing _ to \"\" for "+matchExp.toString());
	237	}
	238	rule = new TransformationRule(matchExp.toString(), replaceExp.toString(), takeOutPart, matchLength, start, end);
	239	//System.out.println(rule.toString());
	240	ruleList.addElement(rule);
	241	}
	242
	243	// Chars with special meaning to aspell. Not everyone is implemented here.
	244	private boolean isReservedChar(char ch) {
	245	if (ch == '<' \|\| ch == '>' \|\| ch == '^' \|\| ch == '$' \|\| ch == '-' \|\| Character.isDigit(ch))
	246	return true;
	247	return false;
	248	}
	249
	250	// Trims off everything we don't care about.
	251	private String realTrimmer(String row) {
	252	int pos = row.indexOf('#');
	253	if (pos != -1) {
	254	row = row.substring(0, pos);
	255	}
	256	return row.trim();
	257	}
	258
	259	// Inner Classes
	260	/*
	261	* Holds the match string and the replace string and all the rule attributes.
	262	* Is responsible for indicating matches.
	263	*/
	264	private class TransformationRule {
	265
	266	private String replace;
	267	private char[] match;
	268	// takeOut=number of chars to replace;
	269	// matchLength=length of matching string counting multies as one.
	270	private int takeOut, matchLength;
	271	private boolean start, end;
	272
	273	// Construktor
	274	public TransformationRule(String match, String replace, int takeout, int matchLength, boolean start, boolean end) {
	275	this.match = match.toCharArray();
	276	this.replace = replace;
	277	this.takeOut = takeout;
	278	this.matchLength = matchLength;
	279	this.start = start;
	280	this.end = end;
	281	}
	282
	283	/*
	284	* Returns true if word from pos and forward matches the match string.
	285	* Precondition: wordPos+matchLength<word.length()
	286	*/
	287	public boolean isMatching(StringBuffer word, int wordPos) {
	288	boolean matching = true, inMulti = false, multiMatch = false;
	289	char matchCh;
	290
	291	for (int matchPos = 0; matchPos < match.length; matchPos++) {
	292	matchCh = match[matchPos];
	293	if (matchCh == STARTMULTI \|\| matchCh == ENDMULTI) {
	294	inMulti = !inMulti;
	295	if (!inMulti)
	296	matching = matching & multiMatch;
	297	else
	298	multiMatch = false;
	299	} else {
	300	if (matchCh != word.charAt(wordPos)) {
	301	if (inMulti)
	302	multiMatch = multiMatch \| false;
	303	else
	304	matching = false;
	305	} else {
	306	if (inMulti)
	307	multiMatch = multiMatch \| true;
	308	else
	309	matching = true;
	310	}
	311	if (!inMulti)
	312	wordPos++;
	313	if (!matching)
	314	break;
	315	}
	316	}
	317	if (end && wordPos != word.length())
	318	matching = false;
	319	return matching;
	320	}
	321
	322	public String getReplaceExp() {
	323	return replace;
	324	}
	325
	326	public int getTakeOut() {
	327	return takeOut;
	328	}
	329
	330	public boolean startsWithExp() {
	331	return start;
	332	}
	333
	334	public int lengthOfMatch() {
	335	return matchLength;
	336	}
	337
	338	// Just for debugging purposes.
	339	public String toString() {
	340	return "Match:" + String.valueOf(match) + " Replace:" + replace + " TakeOut:" + takeOut + " MatchLength:" + matchLength + " Start:" + start + " End:" + end;
	341	}
	342
	343	}
	344	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: