/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/
19 | |
---|
20 | package org.apache.james.mime4j.codec; |
---|
21 | |
---|
22 | import java.nio.ByteBuffer; |
---|
23 | import java.nio.charset.Charset; |
---|
24 | import java.util.BitSet; |
---|
25 | import java.util.Locale; |
---|
26 | |
---|
27 | import org.apache.james.mime4j.util.CharsetUtil; |
---|
28 | |
---|
29 | /** |
---|
30 | * Static methods for encoding header field values. This includes encoded-words |
---|
31 | * as defined in <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a> |
---|
32 | * or display-names of an e-mail address, for example. |
---|
33 | */ |
---|
34 | public class EncoderUtil { |
---|
35 | private static final byte[] BASE64_TABLE = Base64OutputStream.BASE64_TABLE; |
---|
36 | private static final char BASE64_PAD = '='; |
---|
37 | |
---|
38 | private static final BitSet Q_REGULAR_CHARS = initChars("=_?"); |
---|
39 | |
---|
40 | private static final BitSet Q_RESTRICTED_CHARS = initChars("=_?\"#$%&'(),.:;<>@[\\]^`{|}~"); |
---|
41 | |
---|
42 | private static final int MAX_USED_CHARACTERS = 50; |
---|
43 | |
---|
44 | private static final String ENC_WORD_PREFIX = "=?"; |
---|
45 | private static final String ENC_WORD_SUFFIX = "?="; |
---|
46 | |
---|
47 | private static final int ENCODED_WORD_MAX_LENGTH = 75; // RFC 2047 |
---|
48 | |
---|
49 | private static final BitSet TOKEN_CHARS = initChars("()<>@,;:\\\"/[]?="); |
---|
50 | |
---|
51 | private static final BitSet ATEXT_CHARS = initChars("()<>@.,;:\\\"[]"); |
---|
52 | |
---|
53 | private static BitSet initChars(String specials) { |
---|
54 | BitSet bs = new BitSet(128); |
---|
55 | for (char ch = 33; ch < 127; ch++) { |
---|
56 | if (specials.indexOf(ch) == -1) { |
---|
57 | bs.set(ch); |
---|
58 | } |
---|
59 | } |
---|
60 | return bs; |
---|
61 | } |
---|
62 | |
---|
63 | /** |
---|
64 | * Selects one of the two encodings specified in RFC 2047. |
---|
65 | */ |
---|
66 | public enum Encoding { |
---|
67 | /** The B encoding (identical to base64 defined in RFC 2045). */ |
---|
68 | B, |
---|
69 | /** The Q encoding (similar to quoted-printable defined in RFC 2045). */ |
---|
70 | Q |
---|
71 | } |
---|
72 | |
---|
73 | /** |
---|
74 | * Indicates the intended usage of an encoded word. |
---|
75 | */ |
---|
76 | public enum Usage { |
---|
77 | /** |
---|
78 | * Encoded word is used to replace a 'text' token in any Subject or |
---|
79 | * Comments header field. |
---|
80 | */ |
---|
81 | TEXT_TOKEN, |
---|
82 | /** |
---|
83 | * Encoded word is used to replace a 'word' entity within a 'phrase', |
---|
84 | * for example, one that precedes an address in a From, To, or Cc |
---|
85 | * header. |
---|
86 | */ |
---|
87 | WORD_ENTITY |
---|
88 | } |
---|
89 | |
---|
90 | private EncoderUtil() { |
---|
91 | } |
---|
92 | |
---|
93 | /** |
---|
94 | * Encodes the display-name portion of an address. See <a |
---|
95 | * href='http://www.faqs.org/rfcs/rfc5322.html'>RFC 5322</a> section 3.4 |
---|
96 | * and <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a> section |
---|
97 | * 5.3. The specified string should not be folded. |
---|
98 | * |
---|
99 | * @param displayName |
---|
100 | * display-name to encode. |
---|
101 | * @return encoded display-name. |
---|
102 | */ |
---|
103 | public static String encodeAddressDisplayName(String displayName) { |
---|
104 | // display-name = phrase |
---|
105 | // phrase = 1*( encoded-word / word ) |
---|
106 | // word = atom / quoted-string |
---|
107 | // atom = [CFWS] 1*atext [CFWS] |
---|
108 | // CFWS = comment or folding white space |
---|
109 | |
---|
110 | if (isAtomPhrase(displayName)) { |
---|
111 | return displayName; |
---|
112 | } else if (hasToBeEncoded(displayName, 0)) { |
---|
113 | return encodeEncodedWord(displayName, Usage.WORD_ENTITY); |
---|
114 | } else { |
---|
115 | return quote(displayName); |
---|
116 | } |
---|
117 | } |
---|
118 | |
---|
119 | /** |
---|
120 | * Encodes the local part of an address specification as described in RFC |
---|
121 | * 5322 section 3.4.1. Leading and trailing CFWS should have been removed |
---|
122 | * before calling this method. The specified string should not contain any |
---|
123 | * illegal (control or non-ASCII) characters. |
---|
124 | * |
---|
125 | * @param localPart |
---|
126 | * the local part to encode |
---|
127 | * @return the encoded local part. |
---|
128 | */ |
---|
129 | public static String encodeAddressLocalPart(String localPart) { |
---|
130 | // local-part = dot-atom / quoted-string |
---|
131 | // dot-atom = [CFWS] dot-atom-text [CFWS] |
---|
132 | // CFWS = comment or folding white space |
---|
133 | |
---|
134 | if (isDotAtomText(localPart)) { |
---|
135 | return localPart; |
---|
136 | } else { |
---|
137 | return quote(localPart); |
---|
138 | } |
---|
139 | } |
---|
140 | |
---|
141 | /** |
---|
142 | * Encodes the specified strings into a header parameter as described in RFC |
---|
143 | * 2045 section 5.1 and RFC 2183 section 2. The specified strings should not |
---|
144 | * contain any illegal (control or non-ASCII) characters. |
---|
145 | * |
---|
146 | * @param name |
---|
147 | * parameter name. |
---|
148 | * @param value |
---|
149 | * parameter value. |
---|
150 | * @return encoded result. |
---|
151 | */ |
---|
152 | public static String encodeHeaderParameter(String name, String value) { |
---|
153 | name = name.toLowerCase(Locale.US); |
---|
154 | |
---|
155 | // value := token / quoted-string |
---|
156 | if (isToken(value)) { |
---|
157 | return name + "=" + value; |
---|
158 | } else { |
---|
159 | return name + "=" + quote(value); |
---|
160 | } |
---|
161 | } |
---|
162 | |
---|
163 | /** |
---|
164 | * Shortcut method that encodes the specified text into an encoded-word if |
---|
165 | * the text has to be encoded. |
---|
166 | * |
---|
167 | * @param text |
---|
168 | * text to encode. |
---|
169 | * @param usage |
---|
170 | * whether the encoded-word is to be used to replace a text token |
---|
171 | * or a word entity (see RFC 822). |
---|
172 | * @param usedCharacters |
---|
173 | * number of characters already used up (<code>0 <= usedCharacters <= 50</code>). |
---|
174 | * @return the specified text if encoding is not necessary or an encoded |
---|
175 | * word or a sequence of encoded words otherwise. |
---|
176 | */ |
---|
177 | public static String encodeIfNecessary(String text, Usage usage, |
---|
178 | int usedCharacters) { |
---|
179 | if (hasToBeEncoded(text, usedCharacters)) |
---|
180 | return encodeEncodedWord(text, usage, usedCharacters); |
---|
181 | else |
---|
182 | return text; |
---|
183 | } |
---|
184 | |
---|
185 | /** |
---|
186 | * Determines if the specified string has to encoded into an encoded-word. |
---|
187 | * Returns <code>true</code> if the text contains characters that don't |
---|
188 | * fall into the printable ASCII character set or if the text contains a |
---|
189 | * 'word' (sequence of non-whitespace characters) longer than 77 characters |
---|
190 | * (including characters already used up in the line). |
---|
191 | * |
---|
192 | * @param text |
---|
193 | * text to analyze. |
---|
194 | * @param usedCharacters |
---|
195 | * number of characters already used up (<code>0 <= usedCharacters <= 50</code>). |
---|
196 | * @return <code>true</code> if the specified text has to be encoded into |
---|
197 | * an encoded-word, <code>false</code> otherwise. |
---|
198 | */ |
---|
199 | public static boolean hasToBeEncoded(String text, int usedCharacters) { |
---|
200 | if (text == null) |
---|
201 | throw new IllegalArgumentException(); |
---|
202 | if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS) |
---|
203 | throw new IllegalArgumentException(); |
---|
204 | |
---|
205 | int nonWhiteSpaceCount = usedCharacters; |
---|
206 | |
---|
207 | for (int idx = 0; idx < text.length(); idx++) { |
---|
208 | char ch = text.charAt(idx); |
---|
209 | if (ch == '\t' || ch == ' ') { |
---|
210 | nonWhiteSpaceCount = 0; |
---|
211 | } else { |
---|
212 | nonWhiteSpaceCount++; |
---|
213 | if (nonWhiteSpaceCount > 77) { |
---|
214 | // Line cannot be folded into multiple lines with no more |
---|
215 | // than 78 characters each. Encoding as encoded-words makes |
---|
216 | // that possible. One character has to be reserved for |
---|
217 | // folding white space; that leaves 77 characters. |
---|
218 | return true; |
---|
219 | } |
---|
220 | |
---|
221 | if (ch < 32 || ch >= 127) { |
---|
222 | // non-printable ascii character has to be encoded |
---|
223 | return true; |
---|
224 | } |
---|
225 | } |
---|
226 | } |
---|
227 | |
---|
228 | return false; |
---|
229 | } |
---|
230 | |
---|
231 | /** |
---|
232 | * Encodes the specified text into an encoded word or a sequence of encoded |
---|
233 | * words separated by space. The text is separated into a sequence of |
---|
234 | * encoded words if it does not fit in a single one. |
---|
235 | * <p> |
---|
236 | * The charset to encode the specified text into a byte array and the |
---|
237 | * encoding to use for the encoded-word are detected automatically. |
---|
238 | * <p> |
---|
239 | * This method assumes that zero characters have already been used up in the |
---|
240 | * current line. |
---|
241 | * |
---|
242 | * @param text |
---|
243 | * text to encode. |
---|
244 | * @param usage |
---|
245 | * whether the encoded-word is to be used to replace a text token |
---|
246 | * or a word entity (see RFC 822). |
---|
247 | * @return the encoded word (or sequence of encoded words if the given text |
---|
248 | * does not fit in a single encoded word). |
---|
249 | * @see #hasToBeEncoded(String, int) |
---|
250 | */ |
---|
251 | public static String encodeEncodedWord(String text, Usage usage) { |
---|
252 | return encodeEncodedWord(text, usage, 0, null, null); |
---|
253 | } |
---|
254 | |
---|
255 | /** |
---|
256 | * Encodes the specified text into an encoded word or a sequence of encoded |
---|
257 | * words separated by space. The text is separated into a sequence of |
---|
258 | * encoded words if it does not fit in a single one. |
---|
259 | * <p> |
---|
260 | * The charset to encode the specified text into a byte array and the |
---|
261 | * encoding to use for the encoded-word are detected automatically. |
---|
262 | * |
---|
263 | * @param text |
---|
264 | * text to encode. |
---|
265 | * @param usage |
---|
266 | * whether the encoded-word is to be used to replace a text token |
---|
267 | * or a word entity (see RFC 822). |
---|
268 | * @param usedCharacters |
---|
269 | * number of characters already used up (<code>0 <= usedCharacters <= 50</code>). |
---|
270 | * @return the encoded word (or sequence of encoded words if the given text |
---|
271 | * does not fit in a single encoded word). |
---|
272 | * @see #hasToBeEncoded(String, int) |
---|
273 | */ |
---|
274 | public static String encodeEncodedWord(String text, Usage usage, |
---|
275 | int usedCharacters) { |
---|
276 | return encodeEncodedWord(text, usage, usedCharacters, null, null); |
---|
277 | } |
---|
278 | |
---|
279 | /** |
---|
280 | * Encodes the specified text into an encoded word or a sequence of encoded |
---|
281 | * words separated by space. The text is separated into a sequence of |
---|
282 | * encoded words if it does not fit in a single one. |
---|
283 | * |
---|
284 | * @param text |
---|
285 | * text to encode. |
---|
286 | * @param usage |
---|
287 | * whether the encoded-word is to be used to replace a text token |
---|
288 | * or a word entity (see RFC 822). |
---|
289 | * @param usedCharacters |
---|
290 | * number of characters already used up (<code>0 <= usedCharacters <= 50</code>). |
---|
291 | * @param charset |
---|
292 | * the Java charset that should be used to encode the specified |
---|
293 | * string into a byte array. A suitable charset is detected |
---|
294 | * automatically if this parameter is <code>null</code>. |
---|
295 | * @param encoding |
---|
296 | * the encoding to use for the encoded-word (either B or Q). A |
---|
297 | * suitable encoding is automatically chosen if this parameter is |
---|
298 | * <code>null</code>. |
---|
299 | * @return the encoded word (or sequence of encoded words if the given text |
---|
300 | * does not fit in a single encoded word). |
---|
301 | * @see #hasToBeEncoded(String, int) |
---|
302 | */ |
---|
303 | public static String encodeEncodedWord(String text, Usage usage, |
---|
304 | int usedCharacters, Charset charset, Encoding encoding) { |
---|
305 | if (text == null) |
---|
306 | throw new IllegalArgumentException(); |
---|
307 | if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS) |
---|
308 | throw new IllegalArgumentException(); |
---|
309 | |
---|
310 | if (charset == null) |
---|
311 | charset = determineCharset(text); |
---|
312 | |
---|
313 | byte[] bytes = encode(text, charset); |
---|
314 | |
---|
315 | if (encoding == null) |
---|
316 | encoding = determineEncoding(bytes, usage); |
---|
317 | |
---|
318 | if (encoding == Encoding.B) { |
---|
319 | String prefix = ENC_WORD_PREFIX + charset.name() + "?B?"; |
---|
320 | return encodeB(prefix, text, usedCharacters, charset, bytes); |
---|
321 | } else { |
---|
322 | String prefix = ENC_WORD_PREFIX + charset.name() + "?Q?"; |
---|
323 | return encodeQ(prefix, text, usage, usedCharacters, charset, bytes); |
---|
324 | } |
---|
325 | } |
---|
326 | |
---|
327 | /** |
---|
328 | * Encodes the specified byte array using the B encoding defined in RFC |
---|
329 | * 2047. |
---|
330 | * |
---|
331 | * @param bytes |
---|
332 | * byte array to encode. |
---|
333 | * @return encoded string. |
---|
334 | */ |
---|
335 | public static String encodeB(byte[] bytes) { |
---|
336 | StringBuilder sb = new StringBuilder(); |
---|
337 | |
---|
338 | int idx = 0; |
---|
339 | final int end = bytes.length; |
---|
340 | for (; idx < end - 2; idx += 3) { |
---|
341 | int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8 |
---|
342 | | bytes[idx + 2] & 0xff; |
---|
343 | sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]); |
---|
344 | sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]); |
---|
345 | sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]); |
---|
346 | sb.append((char) BASE64_TABLE[data & 0x3f]); |
---|
347 | } |
---|
348 | |
---|
349 | if (idx == end - 2) { |
---|
350 | int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8; |
---|
351 | sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]); |
---|
352 | sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]); |
---|
353 | sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]); |
---|
354 | sb.append(BASE64_PAD); |
---|
355 | |
---|
356 | } else if (idx == end - 1) { |
---|
357 | int data = (bytes[idx] & 0xff) << 16; |
---|
358 | sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]); |
---|
359 | sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]); |
---|
360 | sb.append(BASE64_PAD); |
---|
361 | sb.append(BASE64_PAD); |
---|
362 | } |
---|
363 | |
---|
364 | return sb.toString(); |
---|
365 | } |
---|
366 | |
---|
367 | /** |
---|
368 | * Encodes the specified byte array using the Q encoding defined in RFC |
---|
369 | * 2047. |
---|
370 | * |
---|
371 | * @param bytes |
---|
372 | * byte array to encode. |
---|
373 | * @param usage |
---|
374 | * whether the encoded-word is to be used to replace a text token |
---|
375 | * or a word entity (see RFC 822). |
---|
376 | * @return encoded string. |
---|
377 | */ |
---|
378 | public static String encodeQ(byte[] bytes, Usage usage) { |
---|
379 | BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS |
---|
380 | : Q_RESTRICTED_CHARS; |
---|
381 | |
---|
382 | StringBuilder sb = new StringBuilder(); |
---|
383 | |
---|
384 | final int end = bytes.length; |
---|
385 | for (int idx = 0; idx < end; idx++) { |
---|
386 | int v = bytes[idx] & 0xff; |
---|
387 | if (v == 32) { |
---|
388 | sb.append('_'); |
---|
389 | } else if (!qChars.get(v)) { |
---|
390 | sb.append('='); |
---|
391 | sb.append(hexDigit(v >>> 4)); |
---|
392 | sb.append(hexDigit(v & 0xf)); |
---|
393 | } else { |
---|
394 | sb.append((char) v); |
---|
395 | } |
---|
396 | } |
---|
397 | |
---|
398 | return sb.toString(); |
---|
399 | } |
---|
400 | |
---|
401 | /** |
---|
402 | * Tests whether the specified string is a token as defined in RFC 2045 |
---|
403 | * section 5.1. |
---|
404 | * |
---|
405 | * @param str |
---|
406 | * string to test. |
---|
407 | * @return <code>true</code> if the specified string is a RFC 2045 token, |
---|
408 | * <code>false</code> otherwise. |
---|
409 | */ |
---|
410 | public static boolean isToken(String str) { |
---|
411 | // token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials> |
---|
412 | // tspecials := "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "\" / |
---|
413 | // <"> / "/" / "[" / "]" / "?" / "=" |
---|
414 | // CTL := 0.- 31., 127. |
---|
415 | |
---|
416 | final int length = str.length(); |
---|
417 | if (length == 0) |
---|
418 | return false; |
---|
419 | |
---|
420 | for (int idx = 0; idx < length; idx++) { |
---|
421 | char ch = str.charAt(idx); |
---|
422 | if (!TOKEN_CHARS.get(ch)) |
---|
423 | return false; |
---|
424 | } |
---|
425 | |
---|
426 | return true; |
---|
427 | } |
---|
428 | |
---|
429 | private static boolean isAtomPhrase(String str) { |
---|
430 | // atom = [CFWS] 1*atext [CFWS] |
---|
431 | |
---|
432 | boolean containsAText = false; |
---|
433 | |
---|
434 | final int length = str.length(); |
---|
435 | for (int idx = 0; idx < length; idx++) { |
---|
436 | char ch = str.charAt(idx); |
---|
437 | if (ATEXT_CHARS.get(ch)) { |
---|
438 | containsAText = true; |
---|
439 | } else if (!CharsetUtil.isWhitespace(ch)) { |
---|
440 | return false; |
---|
441 | } |
---|
442 | } |
---|
443 | |
---|
444 | return containsAText; |
---|
445 | } |
---|
446 | |
---|
447 | // RFC 5322 section 3.2.3 |
---|
448 | private static boolean isDotAtomText(String str) { |
---|
449 | // dot-atom-text = 1*atext *("." 1*atext) |
---|
450 | // atext = ALPHA / DIGIT / "!" / "#" / "$" / "%" / "&" / "'" / "*" / |
---|
451 | // "+" / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}" / "~" |
---|
452 | |
---|
453 | char prev = '.'; |
---|
454 | |
---|
455 | final int length = str.length(); |
---|
456 | if (length == 0) |
---|
457 | return false; |
---|
458 | |
---|
459 | for (int idx = 0; idx < length; idx++) { |
---|
460 | char ch = str.charAt(idx); |
---|
461 | |
---|
462 | if (ch == '.') { |
---|
463 | if (prev == '.' || idx == length - 1) |
---|
464 | return false; |
---|
465 | } else { |
---|
466 | if (!ATEXT_CHARS.get(ch)) |
---|
467 | return false; |
---|
468 | } |
---|
469 | |
---|
470 | prev = ch; |
---|
471 | } |
---|
472 | |
---|
473 | return true; |
---|
474 | } |
---|
475 | |
---|
476 | // RFC 5322 section 3.2.4 |
---|
477 | private static String quote(String str) { |
---|
478 | // quoted-string = [CFWS] DQUOTE *([FWS] qcontent) [FWS] DQUOTE [CFWS] |
---|
479 | // qcontent = qtext / quoted-pair |
---|
480 | // qtext = %d33 / %d35-91 / %d93-126 |
---|
481 | // quoted-pair = ("\" (VCHAR / WSP)) |
---|
482 | // VCHAR = %x21-7E |
---|
483 | // DQUOTE = %x22 |
---|
484 | |
---|
485 | String escaped = str.replaceAll("[\\\\\"]", "\\\\$0"); |
---|
486 | return "\"" + escaped + "\""; |
---|
487 | } |
---|
488 | |
---|
489 | private static String encodeB(String prefix, String text, |
---|
490 | int usedCharacters, Charset charset, byte[] bytes) { |
---|
491 | int encodedLength = bEncodedLength(bytes); |
---|
492 | |
---|
493 | int totalLength = prefix.length() + encodedLength |
---|
494 | + ENC_WORD_SUFFIX.length(); |
---|
495 | if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) { |
---|
496 | return prefix + encodeB(bytes) + ENC_WORD_SUFFIX; |
---|
497 | } else { |
---|
498 | String part1 = text.substring(0, text.length() / 2); |
---|
499 | byte[] bytes1 = encode(part1, charset); |
---|
500 | String word1 = encodeB(prefix, part1, usedCharacters, charset, |
---|
501 | bytes1); |
---|
502 | |
---|
503 | String part2 = text.substring(text.length() / 2); |
---|
504 | byte[] bytes2 = encode(part2, charset); |
---|
505 | String word2 = encodeB(prefix, part2, 0, charset, bytes2); |
---|
506 | |
---|
507 | return word1 + " " + word2; |
---|
508 | } |
---|
509 | } |
---|
510 | |
---|
511 | private static int bEncodedLength(byte[] bytes) { |
---|
512 | return (bytes.length + 2) / 3 * 4; |
---|
513 | } |
---|
514 | |
---|
515 | private static String encodeQ(String prefix, String text, Usage usage, |
---|
516 | int usedCharacters, Charset charset, byte[] bytes) { |
---|
517 | int encodedLength = qEncodedLength(bytes, usage); |
---|
518 | |
---|
519 | int totalLength = prefix.length() + encodedLength |
---|
520 | + ENC_WORD_SUFFIX.length(); |
---|
521 | if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) { |
---|
522 | return prefix + encodeQ(bytes, usage) + ENC_WORD_SUFFIX; |
---|
523 | } else { |
---|
524 | String part1 = text.substring(0, text.length() / 2); |
---|
525 | byte[] bytes1 = encode(part1, charset); |
---|
526 | String word1 = encodeQ(prefix, part1, usage, usedCharacters, |
---|
527 | charset, bytes1); |
---|
528 | |
---|
529 | String part2 = text.substring(text.length() / 2); |
---|
530 | byte[] bytes2 = encode(part2, charset); |
---|
531 | String word2 = encodeQ(prefix, part2, usage, 0, charset, bytes2); |
---|
532 | |
---|
533 | return word1 + " " + word2; |
---|
534 | } |
---|
535 | } |
---|
536 | |
---|
537 | private static int qEncodedLength(byte[] bytes, Usage usage) { |
---|
538 | BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS |
---|
539 | : Q_RESTRICTED_CHARS; |
---|
540 | |
---|
541 | int count = 0; |
---|
542 | |
---|
543 | for (int idx = 0; idx < bytes.length; idx++) { |
---|
544 | int v = bytes[idx] & 0xff; |
---|
545 | if (v == 32) { |
---|
546 | count++; |
---|
547 | } else if (!qChars.get(v)) { |
---|
548 | count += 3; |
---|
549 | } else { |
---|
550 | count++; |
---|
551 | } |
---|
552 | } |
---|
553 | |
---|
554 | return count; |
---|
555 | } |
---|
556 | |
---|
557 | private static byte[] encode(String text, Charset charset) { |
---|
558 | ByteBuffer buffer = charset.encode(text); |
---|
559 | byte[] bytes = new byte[buffer.limit()]; |
---|
560 | buffer.get(bytes); |
---|
561 | return bytes; |
---|
562 | } |
---|
563 | |
---|
564 | private static Charset determineCharset(String text) { |
---|
565 | // it is an important property of iso-8859-1 that it directly maps |
---|
566 | // unicode code points 0000 to 00ff to byte values 00 to ff. |
---|
567 | boolean ascii = true; |
---|
568 | final int len = text.length(); |
---|
569 | for (int index = 0; index < len; index++) { |
---|
570 | char ch = text.charAt(index); |
---|
571 | if (ch > 0xff) { |
---|
572 | return CharsetUtil.UTF_8; |
---|
573 | } |
---|
574 | if (ch > 0x7f) { |
---|
575 | ascii = false; |
---|
576 | } |
---|
577 | } |
---|
578 | return ascii ? CharsetUtil.US_ASCII : CharsetUtil.ISO_8859_1; |
---|
579 | } |
---|
580 | |
---|
581 | private static Encoding determineEncoding(byte[] bytes, Usage usage) { |
---|
582 | if (bytes.length == 0) |
---|
583 | return Encoding.Q; |
---|
584 | |
---|
585 | BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS |
---|
586 | : Q_RESTRICTED_CHARS; |
---|
587 | |
---|
588 | int qEncoded = 0; |
---|
589 | for (int i = 0; i < bytes.length; i++) { |
---|
590 | int v = bytes[i] & 0xff; |
---|
591 | if (v != 32 && !qChars.get(v)) { |
---|
592 | qEncoded++; |
---|
593 | } |
---|
594 | } |
---|
595 | |
---|
596 | int percentage = qEncoded * 100 / bytes.length; |
---|
597 | return percentage > 30 ? Encoding.B : Encoding.Q; |
---|
598 | } |
---|
599 | |
---|
600 | private static char hexDigit(int i) { |
---|
601 | return i < 10 ? (char) (i + '0') : (char) (i - 10 + 'A'); |
---|
602 | } |
---|
603 | } |
---|