package com.swabunga.spell.event;


import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.Segment;
import java.text.BreakIterator;


| 10 | /** This class tokenizes a swing document model. It also allows for the |
---|
| 11 | * document model to be changed when corrections occur. |
---|
| 12 | * |
---|
| 13 | * @author Jason Height (jheight@chariot.net.au) |
---|
| 14 | */ |
---|
| 15 | public class DocumentWordTokenizer implements WordTokenizer { |
---|
| 16 | /** Holds the start character position of the current word*/ |
---|
| 17 | private int currentWordPos = 0; |
---|
| 18 | /** Holds the end character position of the current word*/ |
---|
| 19 | private int currentWordEnd = 0; |
---|
| 20 | /** Holds the start character position of the next word*/ |
---|
| 21 | private int nextWordPos = -1; |
---|
| 22 | /** The actual text that is being tokenized*/ |
---|
| 23 | private Document document; |
---|
| 24 | /** The character iterator over the document*/ |
---|
| 25 | private Segment text; |
---|
| 26 | /** The cumulative word count that have been processed*/ |
---|
| 27 | private int wordCount = 0; |
---|
| 28 | /** Flag indicating if there are any more tokens (words) left*/ |
---|
| 29 | private boolean moreTokens = true; |
---|
| 30 | /** Is this a special case where the currentWordStart, currntWordEnd and |
---|
| 31 | * nextWordPos have already been calculated. (see nextWord) |
---|
| 32 | */ |
---|
| 33 | private boolean first = true; |
---|
| 34 | private BreakIterator sentenceIterator; |
---|
| 35 | private boolean startsSentence = true; |
---|
| 36 | |
---|
| 37 | public DocumentWordTokenizer(Document document) { |
---|
| 38 | this.document = document; |
---|
| 39 | //Create a text segment over the etire document |
---|
| 40 | text = new Segment(); |
---|
| 41 | sentenceIterator = BreakIterator.getSentenceInstance(); |
---|
| 42 | try { |
---|
| 43 | document.getText(0, document.getLength(), text); |
---|
| 44 | sentenceIterator.setText(text); |
---|
| 45 | currentWordPos = getNextWordStart(text, 0); |
---|
| 46 | //If the current word pos is -1 then the string was all white space |
---|
| 47 | if (currentWordPos != -1) { |
---|
| 48 | currentWordEnd = getNextWordEnd(text, currentWordPos); |
---|
| 49 | nextWordPos = getNextWordStart(text, currentWordEnd); |
---|
| 50 | } else { |
---|
| 51 | moreTokens = false; |
---|
| 52 | } |
---|
| 53 | } catch (BadLocationException ex) { |
---|
| 54 | moreTokens = false; |
---|
| 55 | } |
---|
| 56 | } |
---|
| 57 | |
---|
| 58 | /** This helper method will return the start character of the next |
---|
| 59 | * word in the buffer from the start position |
---|
| 60 | */ |
---|
| 61 | private static int getNextWordStart(Segment text, int startPos) { |
---|
| 62 | if (startPos <= text.getEndIndex()) |
---|
| 63 | for (char ch = text.setIndex(startPos); ch != Segment.DONE; ch = text.next()) { |
---|
| 64 | if (Character.isLetterOrDigit(ch)) { |
---|
| 65 | return text.getIndex(); |
---|
| 66 | } |
---|
| 67 | } |
---|
| 68 | return -1; |
---|
| 69 | } |
---|
| 70 | |
---|
| 71 | /** This helper method will return the end of the next word in the buffer. |
---|
| 72 | * |
---|
| 73 | */ |
---|
| 74 | private static int getNextWordEnd(Segment text, int startPos) { |
---|
| 75 | for (char ch = text.setIndex(startPos); ch != Segment.DONE; ch = text.next()) { |
---|
| 76 | if (!Character.isLetterOrDigit(ch)) { |
---|
| 77 | if (ch == '-' || ch == '\'') { // handle ' and - inside words |
---|
| 78 | char ch2 = text.next(); |
---|
| 79 | text.previous(); |
---|
| 80 | if (ch2 != Segment.DONE && Character.isLetterOrDigit(ch2)) |
---|
| 81 | continue; |
---|
| 82 | } |
---|
| 83 | return text.getIndex(); |
---|
| 84 | } |
---|
| 85 | } |
---|
| 86 | return text.getEndIndex(); |
---|
| 87 | } |
---|
| 88 | |
---|
| 89 | /** Returns true if there are more words that can be processed in the string |
---|
| 90 | * |
---|
| 91 | */ |
---|
| 92 | public boolean hasMoreWords() { |
---|
| 93 | return moreTokens; |
---|
| 94 | } |
---|
| 95 | |
---|
| 96 | /** Returns the current character position in the text |
---|
| 97 | * |
---|
| 98 | */ |
---|
| 99 | public int getCurrentWordPosition() { |
---|
| 100 | return currentWordPos; |
---|
| 101 | } |
---|
| 102 | |
---|
| 103 | /** Returns the current end word position in the text |
---|
| 104 | * |
---|
| 105 | */ |
---|
| 106 | public int getCurrentWordEnd() { |
---|
| 107 | return currentWordEnd; |
---|
| 108 | } |
---|
| 109 | |
---|
| 110 | /** Returns the next word in the text |
---|
| 111 | * |
---|
| 112 | */ |
---|
| 113 | public String nextWord() { |
---|
| 114 | if (!first) { |
---|
| 115 | currentWordPos = nextWordPos; |
---|
| 116 | currentWordEnd = getNextWordEnd(text, currentWordPos); |
---|
| 117 | nextWordPos = getNextWordStart(text, currentWordEnd + 1); |
---|
| 118 | } |
---|
| 119 | int current = sentenceIterator.current(); |
---|
| 120 | if (current == currentWordPos) |
---|
| 121 | startsSentence = true; |
---|
| 122 | else { |
---|
| 123 | startsSentence = false; |
---|
| 124 | if (currentWordEnd > current) |
---|
| 125 | sentenceIterator.next(); |
---|
| 126 | } |
---|
| 127 | //The nextWordPos has already been populated |
---|
| 128 | String word = null; |
---|
| 129 | try { |
---|
| 130 | word = document.getText(currentWordPos, currentWordEnd - currentWordPos); |
---|
| 131 | } catch (BadLocationException ex) { |
---|
| 132 | moreTokens = false; |
---|
| 133 | } |
---|
| 134 | wordCount++; |
---|
| 135 | first = false; |
---|
| 136 | if (nextWordPos == -1) |
---|
| 137 | moreTokens = false; |
---|
| 138 | return word; |
---|
| 139 | } |
---|
| 140 | |
---|
| 141 | /** Returns the current number of words that have been processed |
---|
| 142 | * |
---|
| 143 | */ |
---|
| 144 | public int getCurrentWordCount() { |
---|
| 145 | return wordCount; |
---|
| 146 | } |
---|
| 147 | |
---|
| 148 | /** Replaces the current word token*/ |
---|
| 149 | public void replaceWord(String newWord) { |
---|
| 150 | if (currentWordPos != -1) { |
---|
| 151 | try { |
---|
| 152 | document.remove(currentWordPos, currentWordEnd - currentWordPos); |
---|
| 153 | document.insertString(currentWordPos, newWord, null); |
---|
| 154 | //Need to reset the segment |
---|
| 155 | document.getText(0, document.getLength(), text); |
---|
| 156 | } catch (BadLocationException ex) { |
---|
| 157 | throw new RuntimeException(ex.getMessage()); |
---|
| 158 | } |
---|
| 159 | //Position after the newly replaced word(s) |
---|
| 160 | first = true; |
---|
| 161 | currentWordPos = getNextWordStart(text, currentWordPos + newWord.length()); |
---|
| 162 | if (currentWordPos != -1) { |
---|
| 163 | currentWordEnd = getNextWordEnd(text, currentWordPos); |
---|
| 164 | nextWordPos = getNextWordStart(text, currentWordEnd); |
---|
| 165 | sentenceIterator.setText(text); |
---|
| 166 | sentenceIterator.following(currentWordPos); |
---|
| 167 | } else |
---|
| 168 | moreTokens = false; |
---|
| 169 | } |
---|
| 170 | } |
---|
| 171 | |
---|
| 172 | /** Returns the current text that is being tokenized (includes any changes |
---|
| 173 | * that have been made) |
---|
| 174 | */ |
---|
| 175 | public String getContext() { |
---|
| 176 | return text.toString(); |
---|
| 177 | } |
---|
| 178 | |
---|
| 179 | /** Returns true if the current word is at the start of a sentence*/ |
---|
| 180 | public boolean isNewSentence() { |
---|
| 181 | return startsSentence; |
---|
| 182 | } |
---|
| 183 | } |
---|