Context Navigation

source: branches/2.2/jabberit_messenger/java_source/src/com/swabunga/spell/engine/GenericTransformator.java @ 3102

Revision 3102, 10.5 KB checked in by amuller, 14 years ago (diff)
Ticket #986 - Efetuado merge para o Branch 2.2( atualizacao do modulo)
Property svn:executable set to `*`

Line
1	package com.swabunga.spell.engine;
2
3	import com.swabunga.util.StringUtility;
4
5	import java.io.*;
6	import java.util.HashMap;
7	import java.util.Vector;
8
9	/**
10	* A Generic implementation of a transformator takes an aspell phonetics file and constructs
11	* some sort of transformation table using the inner class Rule.
12	*
13	* @author Robert Gustavsson (robert@lindesign.se)
14	*/
15	public class GenericTransformator implements Transformator {
16
17
18	/**
19	* This replace list is used if no phonetic file is supplied or it doesn't
20	* contain the alphabet.
21	*/
22	private static final char[] defaultEnglishAlphabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'};
23
24
25	public static final char ALPHABET_START = '[';
26	public static final char ALPHABET_END = ']';
27	public static final String KEYWORD_ALPHBET = "alphabet";
28	public static final String[] IGNORED_KEYWORDS = {"version", "followup", "collapse_result"};
29
30	public static final char STARTMULTI = '(';
31	public static final char ENDMULTI = ')';
32	public static final String DIGITCODE = "0";
33	public static final String REPLACEVOID = "_";
34
35	private Object[] ruleArray = null;
36	private char[] alphabetString = defaultEnglishAlphabet;
37
38	public GenericTransformator(File phonetic) throws IOException {
39	buildRules(new BufferedReader(new FileReader(phonetic)));
40	alphabetString = washAlphabetIntoReplaceList(getReplaceList());
41
42	}
43
44	public GenericTransformator(File phonetic, String encoding) throws IOException {
45	buildRules(new BufferedReader(new InputStreamReader(new FileInputStream(phonetic), encoding)));
46	alphabetString = washAlphabetIntoReplaceList(getReplaceList());
47	}
48
49	public GenericTransformator(Reader phonetic) throws IOException {
50	buildRules(new BufferedReader(phonetic));
51	alphabetString = washAlphabetIntoReplaceList(getReplaceList());
52	}
53
54	/**
55	* Goes through an alphabet and makes sure that only one of those letters
56	* that are coded equally will be in the replace list.
57	* In other words, it removes any letters in the alphabet
58	* that are redundant phonetically.
59	*
60	* This is done to improve speed in the getSuggestion method.
61	*
62	* @param alphabet The complete alphabet to wash.
63	* @return The washed alphabet to be used as replace list.
64	*/
65	private char[] washAlphabetIntoReplaceList(char[] alphabet) {
66
67	HashMap letters = new HashMap(alphabet.length);
68
69	for (int i = 0; i < alphabet.length; i++) {
70	String tmp = String.valueOf(alphabet[i]);
71	String code = transform(tmp);
72	if (!letters.containsKey(code)) {
73	letters.put(code, new Character(alphabet[i]));
74	}
75	}
76
77	Object[] tmpCharacters = letters.values().toArray();
78	char[] washedArray = new char[tmpCharacters.length];
79
80	for (int i = 0; i < tmpCharacters.length; i++) {
81	washedArray[i] = ((Character) tmpCharacters[i]).charValue();
82	}
83
84	return washedArray;
85	}
86
87
88	/**
89	* Takes out all single character replacements and put them in a char array.
90	* This array can later be used for adding or changing letters in getSuggestion().
91	* @return char[] An array of chars with replacements characters
92	*/
93	public char[] getCodeReplaceList() {
94	char[] replacements;
95	TransformationRule rule;
96	Vector tmp = new Vector();
97
98	if (ruleArray == null)
99	return null;
100	for (int i = 0; i < ruleArray.length; i++) {
101	rule = (TransformationRule) ruleArray[i];
102	if (rule.getReplaceExp().length() == 1)
103	tmp.addElement(rule.getReplaceExp());
104	}
105	replacements = new char[tmp.size()];
106	for (int i = 0; i < tmp.size(); i++) {
107	replacements[i] = ((String) tmp.elementAt(i)).charAt(0);
108	}
109	return replacements;
110	}
111
112	/**
113	* Builds up an char array with the chars in the alphabet of the language as it was read from the
114	* alphabet tag in the phonetic file.
115	* @return char[] An array of chars representing the alphabet or null if no alphabet was available.
116	*/
117	public char[] getReplaceList() {
118	return alphabetString;
119	}
120
121	/**
122	* Returns the phonetic code of the word.
123	*/
124	public String transform(String word) {
125
126	if (ruleArray == null)
127	return null;
128
129	TransformationRule rule;
130	StringBuffer str = new StringBuffer(word.toUpperCase());
131	int strLength = str.length();
132	int startPos = 0, add = 1;
133
134	while (startPos < strLength) {
135
136	add = 1;
137	if (Character.isDigit(str.charAt(startPos))) {
138	StringUtility.replace(str, startPos, startPos + DIGITCODE.length(), DIGITCODE);
139	startPos += add;
140	continue;
141	}
142
143	for (int i = 0; i < ruleArray.length; i++) {
144	//System.out.println("Testing rule#:"+i);
145	rule = (TransformationRule) ruleArray[i];
146	if (rule.startsWithExp() && startPos > 0)
147	continue;
148	if (startPos + rule.lengthOfMatch() > strLength) {
149	continue;
150	}
151	if (rule.isMatching(str, startPos)) {
152	String replaceExp = rule.getReplaceExp();
153
154	add = replaceExp.length();
155	StringUtility.replace(str, startPos, startPos + rule.getTakeOut(), replaceExp);
156	strLength -= rule.getTakeOut();
157	strLength += add;
158	//System.out.println("Replacing with rule#:"+i+" add="+add);
159	break;
160	}
161	}
162	startPos += add;
163	}
164	//System.out.println(word);
165	//System.out.println(str.toString());
166	return str.toString();
167	}
168
169	// Used to build up the transformastion table.
170	private void buildRules(BufferedReader in) throws IOException {
171	String read = null;
172	Vector ruleList = new Vector();
173	while ((read = in.readLine()) != null) {
174	buildRule(realTrimmer(read), ruleList);
175	}
176	ruleArray = new TransformationRule[ruleList.size()];
177	ruleList.copyInto(ruleArray);
178	}
179
180	// Here is where the real work of reading the phonetics file is done.
181	private void buildRule(String str, Vector ruleList) {
182	if (str.length() < 1)
183	return;
184	for (int i = 0; i < IGNORED_KEYWORDS.length; i++) {
185	if (str.startsWith(IGNORED_KEYWORDS[i]))
186	return;
187	}
188
189	// A different alphabet is used for this language, will be read into
190	// the alphabetString variable.
191	if (str.startsWith(KEYWORD_ALPHBET)) {
192	int start = str.indexOf(ALPHABET_START);
193	int end = str.lastIndexOf(ALPHABET_END);
194	if (end != -1 && start != -1) {
195	alphabetString = str.substring(++start, end).toCharArray();
196	}
197	return;
198	}
199
200	TransformationRule rule = null;
201	StringBuffer matchExp = new StringBuffer();
202	StringBuffer replaceExp = new StringBuffer();
203	boolean start = false,
204	end = false;
205	int takeOutPart = 0,
206	matchLength = 0;
207	boolean match = true,
208	inMulti = false;
209	for (int i = 0; i < str.length(); i++) {
210	if (Character.isWhitespace(str.charAt(i))) {
211	match = false;
212	} else {
213	if (match) {
214	if (!isReservedChar(str.charAt(i))) {
215	matchExp.append(str.charAt(i));
216	if (!inMulti) {
217	takeOutPart++;
218	matchLength++;
219	}
220	if (str.charAt(i) == STARTMULTI \|\| str.charAt(i) == ENDMULTI)
221	inMulti = !inMulti;
222	}
223	if (str.charAt(i) == '-')
224	takeOutPart--;
225	if (str.charAt(i) == '^')
226	start = true;
227	if (str.charAt(i) == '$')
228	end = true;
229	} else {
230	replaceExp.append(str.charAt(i));
231	}
232	}
233	}
234	if (replaceExp.toString().equals(REPLACEVOID)) {
235	replaceExp = new StringBuffer("");
236	//System.out.println("Changing _ to \"\" for "+matchExp.toString());
237	}
238	rule = new TransformationRule(matchExp.toString(), replaceExp.toString(), takeOutPart, matchLength, start, end);
239	//System.out.println(rule.toString());
240	ruleList.addElement(rule);
241	}
242
243	// Chars with special meaning to aspell. Not everyone is implemented here.
244	private boolean isReservedChar(char ch) {
245	if (ch == '<' \|\| ch == '>' \|\| ch == '^' \|\| ch == '$' \|\| ch == '-' \|\| Character.isDigit(ch))
246	return true;
247	return false;
248	}
249
250	// Trims off everything we don't care about.
251	private String realTrimmer(String row) {
252	int pos = row.indexOf('#');
253	if (pos != -1) {
254	row = row.substring(0, pos);
255	}
256	return row.trim();
257	}
258
259	// Inner Classes
260	/*
261	* Holds the match string and the replace string and all the rule attributes.
262	* Is responsible for indicating matches.
263	*/
264	private class TransformationRule {
265
266	private String replace;
267	private char[] match;
268	// takeOut=number of chars to replace;
269	// matchLength=length of matching string counting multies as one.
270	private int takeOut, matchLength;
271	private boolean start, end;
272
273	// Construktor
274	public TransformationRule(String match, String replace, int takeout, int matchLength, boolean start, boolean end) {
275	this.match = match.toCharArray();
276	this.replace = replace;
277	this.takeOut = takeout;
278	this.matchLength = matchLength;
279	this.start = start;
280	this.end = end;
281	}
282
283	/*
284	* Returns true if word from pos and forward matches the match string.
285	* Precondition: wordPos+matchLength<word.length()
286	*/
287	public boolean isMatching(StringBuffer word, int wordPos) {
288	boolean matching = true, inMulti = false, multiMatch = false;
289	char matchCh;
290
291	for (int matchPos = 0; matchPos < match.length; matchPos++) {
292	matchCh = match[matchPos];
293	if (matchCh == STARTMULTI \|\| matchCh == ENDMULTI) {
294	inMulti = !inMulti;
295	if (!inMulti)
296	matching = matching & multiMatch;
297	else
298	multiMatch = false;
299	} else {
300	if (matchCh != word.charAt(wordPos)) {
301	if (inMulti)
302	multiMatch = multiMatch \| false;
303	else
304	matching = false;
305	} else {
306	if (inMulti)
307	multiMatch = multiMatch \| true;
308	else
309	matching = true;
310	}
311	if (!inMulti)
312	wordPos++;
313	if (!matching)
314	break;
315	}
316	}
317	if (end && wordPos != word.length())
318	matching = false;
319	return matching;
320	}
321
322	public String getReplaceExp() {
323	return replace;
324	}
325
326	public int getTakeOut() {
327	return takeOut;
328	}
329
330	public boolean startsWithExp() {
331	return start;
332	}
333
334	public int lengthOfMatch() {
335	return matchLength;
336	}
337
338	// Just for debugging purposes.
339	public String toString() {
340	return "Match:" + String.valueOf(match) + " Replace:" + replace + " TakeOut:" + takeOut + " MatchLength:" + matchLength + " Start:" + start + " End:" + end;
341	}
342
343	}
344	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: