package com.swabunga.spell.event;

import java.text.BreakIterator;

import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.Segment;

10 | /** This class tokenizes a swing document model. It also allows for the |
---|
11 | * document model to be changed when corrections occur. |
---|
12 | * |
---|
13 | * @author Jason Height (jheight@chariot.net.au) |
---|
14 | */ |
---|
15 | public class DocumentWordTokenizer implements WordTokenizer { |
---|
16 | /** Holds the start character position of the current word*/ |
---|
17 | private int currentWordPos = 0; |
---|
18 | /** Holds the end character position of the current word*/ |
---|
19 | private int currentWordEnd = 0; |
---|
20 | /** Holds the start character position of the next word*/ |
---|
21 | private int nextWordPos = -1; |
---|
22 | /** The actual text that is being tokenized*/ |
---|
23 | private Document document; |
---|
24 | /** The character iterator over the document*/ |
---|
25 | private Segment text; |
---|
26 | /** The cumulative word count that have been processed*/ |
---|
27 | private int wordCount = 0; |
---|
28 | /** Flag indicating if there are any more tokens (words) left*/ |
---|
29 | private boolean moreTokens = true; |
---|
30 | /** Is this a special case where the currentWordStart, currntWordEnd and |
---|
31 | * nextWordPos have already been calculated. (see nextWord) |
---|
32 | */ |
---|
33 | private boolean first = true; |
---|
34 | private BreakIterator sentenceIterator; |
---|
35 | private boolean startsSentence = true; |
---|
36 | |
---|
37 | public DocumentWordTokenizer(Document document) { |
---|
38 | this.document = document; |
---|
39 | //Create a text segment over the etire document |
---|
40 | text = new Segment(); |
---|
41 | sentenceIterator = BreakIterator.getSentenceInstance(); |
---|
42 | try { |
---|
43 | document.getText(0, document.getLength(), text); |
---|
44 | sentenceIterator.setText(text); |
---|
45 | currentWordPos = getNextWordStart(text, 0); |
---|
46 | //If the current word pos is -1 then the string was all white space |
---|
47 | if (currentWordPos != -1) { |
---|
48 | currentWordEnd = getNextWordEnd(text, currentWordPos); |
---|
49 | nextWordPos = getNextWordStart(text, currentWordEnd); |
---|
50 | } else { |
---|
51 | moreTokens = false; |
---|
52 | } |
---|
53 | } catch (BadLocationException ex) { |
---|
54 | moreTokens = false; |
---|
55 | } |
---|
56 | } |
---|
57 | |
---|
58 | /** This helper method will return the start character of the next |
---|
59 | * word in the buffer from the start position |
---|
60 | */ |
---|
61 | private static int getNextWordStart(Segment text, int startPos) { |
---|
62 | if (startPos <= text.getEndIndex()) |
---|
63 | for (char ch = text.setIndex(startPos); ch != Segment.DONE; ch = text.next()) { |
---|
64 | if (Character.isLetterOrDigit(ch)) { |
---|
65 | return text.getIndex(); |
---|
66 | } |
---|
67 | } |
---|
68 | return -1; |
---|
69 | } |
---|
70 | |
---|
71 | /** This helper method will return the end of the next word in the buffer. |
---|
72 | * |
---|
73 | */ |
---|
74 | private static int getNextWordEnd(Segment text, int startPos) { |
---|
75 | for (char ch = text.setIndex(startPos); ch != Segment.DONE; ch = text.next()) { |
---|
76 | if (!Character.isLetterOrDigit(ch)) { |
---|
77 | if (ch == '-' || ch == '\'') { // handle ' and - inside words |
---|
78 | char ch2 = text.next(); |
---|
79 | text.previous(); |
---|
80 | if (ch2 != Segment.DONE && Character.isLetterOrDigit(ch2)) |
---|
81 | continue; |
---|
82 | } |
---|
83 | return text.getIndex(); |
---|
84 | } |
---|
85 | } |
---|
86 | return text.getEndIndex(); |
---|
87 | } |
---|
88 | |
---|
89 | /** Returns true if there are more words that can be processed in the string |
---|
90 | * |
---|
91 | */ |
---|
92 | public boolean hasMoreWords() { |
---|
93 | return moreTokens; |
---|
94 | } |
---|
95 | |
---|
96 | /** Returns the current character position in the text |
---|
97 | * |
---|
98 | */ |
---|
99 | public int getCurrentWordPosition() { |
---|
100 | return currentWordPos; |
---|
101 | } |
---|
102 | |
---|
103 | /** Returns the current end word position in the text |
---|
104 | * |
---|
105 | */ |
---|
106 | public int getCurrentWordEnd() { |
---|
107 | return currentWordEnd; |
---|
108 | } |
---|
109 | |
---|
110 | /** Returns the next word in the text |
---|
111 | * |
---|
112 | */ |
---|
113 | public String nextWord() { |
---|
114 | if (!first) { |
---|
115 | currentWordPos = nextWordPos; |
---|
116 | currentWordEnd = getNextWordEnd(text, currentWordPos); |
---|
117 | nextWordPos = getNextWordStart(text, currentWordEnd + 1); |
---|
118 | } |
---|
119 | int current = sentenceIterator.current(); |
---|
120 | if (current == currentWordPos) |
---|
121 | startsSentence = true; |
---|
122 | else { |
---|
123 | startsSentence = false; |
---|
124 | if (currentWordEnd > current) |
---|
125 | sentenceIterator.next(); |
---|
126 | } |
---|
127 | //The nextWordPos has already been populated |
---|
128 | String word = null; |
---|
129 | try { |
---|
130 | word = document.getText(currentWordPos, currentWordEnd - currentWordPos); |
---|
131 | } catch (BadLocationException ex) { |
---|
132 | moreTokens = false; |
---|
133 | } |
---|
134 | wordCount++; |
---|
135 | first = false; |
---|
136 | if (nextWordPos == -1) |
---|
137 | moreTokens = false; |
---|
138 | return word; |
---|
139 | } |
---|
140 | |
---|
141 | /** Returns the current number of words that have been processed |
---|
142 | * |
---|
143 | */ |
---|
144 | public int getCurrentWordCount() { |
---|
145 | return wordCount; |
---|
146 | } |
---|
147 | |
---|
148 | /** Replaces the current word token*/ |
---|
149 | public void replaceWord(String newWord) { |
---|
150 | if (currentWordPos != -1) { |
---|
151 | try { |
---|
152 | document.remove(currentWordPos, currentWordEnd - currentWordPos); |
---|
153 | document.insertString(currentWordPos, newWord, null); |
---|
154 | //Need to reset the segment |
---|
155 | document.getText(0, document.getLength(), text); |
---|
156 | } catch (BadLocationException ex) { |
---|
157 | throw new RuntimeException(ex.getMessage()); |
---|
158 | } |
---|
159 | //Position after the newly replaced word(s) |
---|
160 | first = true; |
---|
161 | currentWordPos = getNextWordStart(text, currentWordPos + newWord.length()); |
---|
162 | if (currentWordPos != -1) { |
---|
163 | currentWordEnd = getNextWordEnd(text, currentWordPos); |
---|
164 | nextWordPos = getNextWordStart(text, currentWordEnd); |
---|
165 | sentenceIterator.setText(text); |
---|
166 | sentenceIterator.following(currentWordPos); |
---|
167 | } else |
---|
168 | moreTokens = false; |
---|
169 | } |
---|
170 | } |
---|
171 | |
---|
172 | /** Returns the current text that is being tokenized (includes any changes |
---|
173 | * that have been made) |
---|
174 | */ |
---|
175 | public String getContext() { |
---|
176 | return text.toString(); |
---|
177 | } |
---|
178 | |
---|
179 | /** Returns true if the current word is at the start of a sentence*/ |
---|
180 | public boolean isNewSentence() { |
---|
181 | return startsSentence; |
---|
182 | } |
---|
183 | } |
---|