Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Annotate
Revision Log

EncoderUtil.java @ 6785

Revision 6785, 21.4 KB checked in by rafaelraymundo, 12 years ago (diff)
Ticket #2946 - Liberado codigo do MailArchiver?. Documentação na subpasta DOCS.

Line
1	/****************************************************************
2	* Licensed to the Apache Software Foundation (ASF) under one *
3	* or more contributor license agreements. See the NOTICE file *
4	* distributed with this work for additional information *
5	* regarding copyright ownership. The ASF licenses this file *
6	* to you under the Apache License, Version 2.0 (the *
7	* "License"); you may not use this file except in compliance *
8	* with the License. You may obtain a copy of the License at *
9	* *
10	* http://www.apache.org/licenses/LICENSE-2.0 *
11	* *
12	* Unless required by applicable law or agreed to in writing, *
13	* software distributed under the License is distributed on an *
14	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
15	* KIND, either express or implied. See the License for the *
16	* specific language governing permissions and limitations *
17	* under the License. *
18	****************************************************************/
19
20	package org.apache.james.mime4j.codec;
21
22	import java.nio.ByteBuffer;
23	import java.nio.charset.Charset;
24	import java.util.BitSet;
25	import java.util.Locale;
26
27	import org.apache.james.mime4j.util.CharsetUtil;
28
29	/**
30	* Static methods for encoding header field values. This includes encoded-words
31	* as defined in <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a>
32	* or display-names of an e-mail address, for example.
33	*/
34	public class EncoderUtil {
35	private static final byte[] BASE64_TABLE = Base64OutputStream.BASE64_TABLE;
36	private static final char BASE64_PAD = '=';
37
38	private static final BitSet Q_REGULAR_CHARS = initChars("=_?");
39
40	private static final BitSet Q_RESTRICTED_CHARS = initChars("=_?\"#$%&'(),.:;<>@[\\]^`{\|}~");
41
42	private static final int MAX_USED_CHARACTERS = 50;
43
44	private static final String ENC_WORD_PREFIX = "=?";
45	private static final String ENC_WORD_SUFFIX = "?=";
46
47	private static final int ENCODED_WORD_MAX_LENGTH = 75; // RFC 2047
48
49	private static final BitSet TOKEN_CHARS = initChars("()<>@,;:\\\"/[]?=");
50
51	private static final BitSet ATEXT_CHARS = initChars("()<>@.,;:\\\"[]");
52
53	private static BitSet initChars(String specials) {
54	BitSet bs = new BitSet(128);
55	for (char ch = 33; ch < 127; ch++) {
56	if (specials.indexOf(ch) == -1) {
57	bs.set(ch);
58	}
59	}
60	return bs;
61	}
62
/**
 * The two content encodings an RFC 2047 encoded-word may use.
 */
public enum Encoding {
    /** B encoding: the same as the base64 encoding of RFC 2045. */
    B,

    /** Q encoding: closely related to quoted-printable of RFC 2045. */
    Q
}
72
/**
 * Describes where an encoded word is going to be placed, which decides how
 * many characters may be left unencoded by the Q encoding.
 */
public enum Usage {
    /**
     * The encoded word stands in for a 'text' token, as found in any
     * Subject or Comments header field.
     */
    TEXT_TOKEN,

    /**
     * The encoded word stands in for a 'word' entity inside a 'phrase',
     * such as the one preceding an address in a From, To, or Cc header.
     */
    WORD_ENTITY
}
89
/** Hidden constructor: this class only offers static methods. */
private EncoderUtil() {
    // intentionally empty
}
92
93	/**
94	* Encodes the display-name portion of an address. See <a
95	* href='http://www.faqs.org/rfcs/rfc5322.html'>RFC 5322</a> section 3.4
96	* and <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a> section
97	* 5.3. The specified string should not be folded.
98	*
99	* @param displayName
100	* display-name to encode.
101	* @return encoded display-name.
102	*/
103	public static String encodeAddressDisplayName(String displayName) {
104	// display-name = phrase
105	// phrase = 1*( encoded-word / word )
106	// word = atom / quoted-string
107	// atom = [CFWS] 1*atext [CFWS]
108	// CFWS = comment or folding white space
109
110	if (isAtomPhrase(displayName)) {
111	return displayName;
112	} else if (hasToBeEncoded(displayName, 0)) {
113	return encodeEncodedWord(displayName, Usage.WORD_ENTITY);
114	} else {
115	return quote(displayName);
116	}
117	}
118
119	/**
120	* Encodes the local part of an address specification as described in RFC
121	* 5322 section 3.4.1. Leading and trailing CFWS should have been removed
122	* before calling this method. The specified string should not contain any
123	* illegal (control or non-ASCII) characters.
124	*
125	* @param localPart
126	* the local part to encode
127	* @return the encoded local part.
128	*/
129	public static String encodeAddressLocalPart(String localPart) {
130	// local-part = dot-atom / quoted-string
131	// dot-atom = [CFWS] dot-atom-text [CFWS]
132	// CFWS = comment or folding white space
133
134	if (isDotAtomText(localPart)) {
135	return localPart;
136	} else {
137	return quote(localPart);
138	}
139	}
140
141	/**
142	* Encodes the specified strings into a header parameter as described in RFC
143	* 2045 section 5.1 and RFC 2183 section 2. The specified strings should not
144	* contain any illegal (control or non-ASCII) characters.
145	*
146	* @param name
147	* parameter name.
148	* @param value
149	* parameter value.
150	* @return encoded result.
151	*/
152	public static String encodeHeaderParameter(String name, String value) {
153	name = name.toLowerCase(Locale.US);
154
155	// value := token / quoted-string
156	if (isToken(value)) {
157	return name + "=" + value;
158	} else {
159	return name + "=" + quote(value);
160	}
161	}
162
163	/**
164	* Shortcut method that encodes the specified text into an encoded-word if
165	* the text has to be encoded.
166	*
167	* @param text
168	* text to encode.
169	* @param usage
170	* whether the encoded-word is to be used to replace a text token
171	* or a word entity (see RFC 822).
172	* @param usedCharacters
173	* number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
174	* @return the specified text if encoding is not necessary or an encoded
175	* word or a sequence of encoded words otherwise.
176	*/
177	public static String encodeIfNecessary(String text, Usage usage,
178	int usedCharacters) {
179	if (hasToBeEncoded(text, usedCharacters))
180	return encodeEncodedWord(text, usage, usedCharacters);
181	else
182	return text;
183	}
184
185	/**
186	* Determines if the specified string has to encoded into an encoded-word.
187	* Returns <code>true</code> if the text contains characters that don't
188	* fall into the printable ASCII character set or if the text contains a
189	* 'word' (sequence of non-whitespace characters) longer than 77 characters
190	* (including characters already used up in the line).
191	*
192	* @param text
193	* text to analyze.
194	* @param usedCharacters
195	* number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
196	* @return <code>true</code> if the specified text has to be encoded into
197	* an encoded-word, <code>false</code> otherwise.
198	*/
199	public static boolean hasToBeEncoded(String text, int usedCharacters) {
200	if (text == null)
201	throw new IllegalArgumentException();
202	if (usedCharacters < 0 \|\| usedCharacters > MAX_USED_CHARACTERS)
203	throw new IllegalArgumentException();
204
205	int nonWhiteSpaceCount = usedCharacters;
206
207	for (int idx = 0; idx < text.length(); idx++) {
208	char ch = text.charAt(idx);
209	if (ch == '\t' \|\| ch == ' ') {
210	nonWhiteSpaceCount = 0;
211	} else {
212	nonWhiteSpaceCount++;
213	if (nonWhiteSpaceCount > 77) {
214	// Line cannot be folded into multiple lines with no more
215	// than 78 characters each. Encoding as encoded-words makes
216	// that possible. One character has to be reserved for
217	// folding white space; that leaves 77 characters.
218	return true;
219	}
220
221	if (ch < 32 \|\| ch >= 127) {
222	// non-printable ascii character has to be encoded
223	return true;
224	}
225	}
226	}
227
228	return false;
229	}
230
231	/**
232	* Encodes the specified text into an encoded word or a sequence of encoded
233	* words separated by space. The text is separated into a sequence of
234	* encoded words if it does not fit in a single one.
235	* <p>
236	* The charset to encode the specified text into a byte array and the
237	* encoding to use for the encoded-word are detected automatically.
238	* <p>
239	* This method assumes that zero characters have already been used up in the
240	* current line.
241	*
242	* @param text
243	* text to encode.
244	* @param usage
245	* whether the encoded-word is to be used to replace a text token
246	* or a word entity (see RFC 822).
247	* @return the encoded word (or sequence of encoded words if the given text
248	* does not fit in a single encoded word).
249	* @see #hasToBeEncoded(String, int)
250	*/
251	public static String encodeEncodedWord(String text, Usage usage) {
252	return encodeEncodedWord(text, usage, 0, null, null);
253	}
254
255	/**
256	* Encodes the specified text into an encoded word or a sequence of encoded
257	* words separated by space. The text is separated into a sequence of
258	* encoded words if it does not fit in a single one.
259	* <p>
260	* The charset to encode the specified text into a byte array and the
261	* encoding to use for the encoded-word are detected automatically.
262	*
263	* @param text
264	* text to encode.
265	* @param usage
266	* whether the encoded-word is to be used to replace a text token
267	* or a word entity (see RFC 822).
268	* @param usedCharacters
269	* number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
270	* @return the encoded word (or sequence of encoded words if the given text
271	* does not fit in a single encoded word).
272	* @see #hasToBeEncoded(String, int)
273	*/
274	public static String encodeEncodedWord(String text, Usage usage,
275	int usedCharacters) {
276	return encodeEncodedWord(text, usage, usedCharacters, null, null);
277	}
278
279	/**
280	* Encodes the specified text into an encoded word or a sequence of encoded
281	* words separated by space. The text is separated into a sequence of
282	* encoded words if it does not fit in a single one.
283	*
284	* @param text
285	* text to encode.
286	* @param usage
287	* whether the encoded-word is to be used to replace a text token
288	* or a word entity (see RFC 822).
289	* @param usedCharacters
290	* number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
291	* @param charset
292	* the Java charset that should be used to encode the specified
293	* string into a byte array. A suitable charset is detected
294	* automatically if this parameter is <code>null</code>.
295	* @param encoding
296	* the encoding to use for the encoded-word (either B or Q). A
297	* suitable encoding is automatically chosen if this parameter is
298	* <code>null</code>.
299	* @return the encoded word (or sequence of encoded words if the given text
300	* does not fit in a single encoded word).
301	* @see #hasToBeEncoded(String, int)
302	*/
303	public static String encodeEncodedWord(String text, Usage usage,
304	int usedCharacters, Charset charset, Encoding encoding) {
305	if (text == null)
306	throw new IllegalArgumentException();
307	if (usedCharacters < 0 \|\| usedCharacters > MAX_USED_CHARACTERS)
308	throw new IllegalArgumentException();
309
310	if (charset == null)
311	charset = determineCharset(text);
312
313	byte[] bytes = encode(text, charset);
314
315	if (encoding == null)
316	encoding = determineEncoding(bytes, usage);
317
318	if (encoding == Encoding.B) {
319	String prefix = ENC_WORD_PREFIX + charset.name() + "?B?";
320	return encodeB(prefix, text, usedCharacters, charset, bytes);
321	} else {
322	String prefix = ENC_WORD_PREFIX + charset.name() + "?Q?";
323	return encodeQ(prefix, text, usage, usedCharacters, charset, bytes);
324	}
325	}
326
327	/**
328	* Encodes the specified byte array using the B encoding defined in RFC
329	* 2047.
330	*
331	* @param bytes
332	* byte array to encode.
333	* @return encoded string.
334	*/
335	public static String encodeB(byte[] bytes) {
336	StringBuilder sb = new StringBuilder();
337
338	int idx = 0;
339	final int end = bytes.length;
340	for (; idx < end - 2; idx += 3) {
341	int data = (bytes[idx] & 0xff) << 16 \| (bytes[idx + 1] & 0xff) << 8
342	\| bytes[idx + 2] & 0xff;
343	sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
344	sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
345	sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
346	sb.append((char) BASE64_TABLE[data & 0x3f]);
347	}
348
349	if (idx == end - 2) {
350	int data = (bytes[idx] & 0xff) << 16 \| (bytes[idx + 1] & 0xff) << 8;
351	sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
352	sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
353	sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
354	sb.append(BASE64_PAD);
355
356	} else if (idx == end - 1) {
357	int data = (bytes[idx] & 0xff) << 16;
358	sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
359	sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
360	sb.append(BASE64_PAD);
361	sb.append(BASE64_PAD);
362	}
363
364	return sb.toString();
365	}
366
367	/**
368	* Encodes the specified byte array using the Q encoding defined in RFC
369	* 2047.
370	*
371	* @param bytes
372	* byte array to encode.
373	* @param usage
374	* whether the encoded-word is to be used to replace a text token
375	* or a word entity (see RFC 822).
376	* @return encoded string.
377	*/
378	public static String encodeQ(byte[] bytes, Usage usage) {
379	BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
380	: Q_RESTRICTED_CHARS;
381
382	StringBuilder sb = new StringBuilder();
383
384	final int end = bytes.length;
385	for (int idx = 0; idx < end; idx++) {
386	int v = bytes[idx] & 0xff;
387	if (v == 32) {
388	sb.append('_');
389	} else if (!qChars.get(v)) {
390	sb.append('=');
391	sb.append(hexDigit(v >>> 4));
392	sb.append(hexDigit(v & 0xf));
393	} else {
394	sb.append((char) v);
395	}
396	}
397
398	return sb.toString();
399	}
400
401	/**
402	* Tests whether the specified string is a token as defined in RFC 2045
403	* section 5.1.
404	*
405	* @param str
406	* string to test.
407	* @return <code>true</code> if the specified string is a RFC 2045 token,
408	* <code>false</code> otherwise.
409	*/
410	public static boolean isToken(String str) {
411	// token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
412	// tspecials := "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "\" /
413	// <"> / "/" / "[" / "]" / "?" / "="
414	// CTL := 0.- 31., 127.
415
416	final int length = str.length();
417	if (length == 0)
418	return false;
419
420	for (int idx = 0; idx < length; idx++) {
421	char ch = str.charAt(idx);
422	if (!TOKEN_CHARS.get(ch))
423	return false;
424	}
425
426	return true;
427	}
428
429	private static boolean isAtomPhrase(String str) {
430	// atom = [CFWS] 1*atext [CFWS]
431
432	boolean containsAText = false;
433
434	final int length = str.length();
435	for (int idx = 0; idx < length; idx++) {
436	char ch = str.charAt(idx);
437	if (ATEXT_CHARS.get(ch)) {
438	containsAText = true;
439	} else if (!CharsetUtil.isWhitespace(ch)) {
440	return false;
441	}
442	}
443
444	return containsAText;
445	}
446
447	// RFC 5322 section 3.2.3
448	private static boolean isDotAtomText(String str) {
449	// dot-atom-text = 1atext ("." 1*atext)
450	// atext = ALPHA / DIGIT / "!" / "#" / "$" / "%" / "&" / "'" / "*" /
451	// "+" / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "\|" / "}" / "~"
452
453	char prev = '.';
454
455	final int length = str.length();
456	if (length == 0)
457	return false;
458
459	for (int idx = 0; idx < length; idx++) {
460	char ch = str.charAt(idx);
461
462	if (ch == '.') {
463	if (prev == '.' \|\| idx == length - 1)
464	return false;
465	} else {
466	if (!ATEXT_CHARS.get(ch))
467	return false;
468	}
469
470	prev = ch;
471	}
472
473	return true;
474	}
475
476	// RFC 5322 section 3.2.4
477	private static String quote(String str) {
478	// quoted-string = [CFWS] DQUOTE *([FWS] qcontent) [FWS] DQUOTE [CFWS]
479	// qcontent = qtext / quoted-pair
480	// qtext = %d33 / %d35-91 / %d93-126
481	// quoted-pair = ("\" (VCHAR / WSP))
482	// VCHAR = %x21-7E
483	// DQUOTE = %x22
484
485	String escaped = str.replaceAll("[\\\\\"]", "\\\\$0");
486	return "\"" + escaped + "\"";
487	}
488
489	private static String encodeB(String prefix, String text,
490	int usedCharacters, Charset charset, byte[] bytes) {
491	int encodedLength = bEncodedLength(bytes);
492
493	int totalLength = prefix.length() + encodedLength
494	+ ENC_WORD_SUFFIX.length();
495	if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
496	return prefix + encodeB(bytes) + ENC_WORD_SUFFIX;
497	} else {
498	String part1 = text.substring(0, text.length() / 2);
499	byte[] bytes1 = encode(part1, charset);
500	String word1 = encodeB(prefix, part1, usedCharacters, charset,
501	bytes1);
502
503	String part2 = text.substring(text.length() / 2);
504	byte[] bytes2 = encode(part2, charset);
505	String word2 = encodeB(prefix, part2, 0, charset, bytes2);
506
507	return word1 + " " + word2;
508	}
509	}
510
511	private static int bEncodedLength(byte[] bytes) {
512	return (bytes.length + 2) / 3 * 4;
513	}
514
515	private static String encodeQ(String prefix, String text, Usage usage,
516	int usedCharacters, Charset charset, byte[] bytes) {
517	int encodedLength = qEncodedLength(bytes, usage);
518
519	int totalLength = prefix.length() + encodedLength
520	+ ENC_WORD_SUFFIX.length();
521	if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
522	return prefix + encodeQ(bytes, usage) + ENC_WORD_SUFFIX;
523	} else {
524	String part1 = text.substring(0, text.length() / 2);
525	byte[] bytes1 = encode(part1, charset);
526	String word1 = encodeQ(prefix, part1, usage, usedCharacters,
527	charset, bytes1);
528
529	String part2 = text.substring(text.length() / 2);
530	byte[] bytes2 = encode(part2, charset);
531	String word2 = encodeQ(prefix, part2, usage, 0, charset, bytes2);
532
533	return word1 + " " + word2;
534	}
535	}
536
537	private static int qEncodedLength(byte[] bytes, Usage usage) {
538	BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
539	: Q_RESTRICTED_CHARS;
540
541	int count = 0;
542
543	for (int idx = 0; idx < bytes.length; idx++) {
544	int v = bytes[idx] & 0xff;
545	if (v == 32) {
546	count++;
547	} else if (!qChars.get(v)) {
548	count += 3;
549	} else {
550	count++;
551	}
552	}
553
554	return count;
555	}
556
557	private static byte[] encode(String text, Charset charset) {
558	ByteBuffer buffer = charset.encode(text);
559	byte[] bytes = new byte[buffer.limit()];
560	buffer.get(bytes);
561	return bytes;
562	}
563
564	private static Charset determineCharset(String text) {
565	// it is an important property of iso-8859-1 that it directly maps
566	// unicode code points 0000 to 00ff to byte values 00 to ff.
567	boolean ascii = true;
568	final int len = text.length();
569	for (int index = 0; index < len; index++) {
570	char ch = text.charAt(index);
571	if (ch > 0xff) {
572	return CharsetUtil.UTF_8;
573	}
574	if (ch > 0x7f) {
575	ascii = false;
576	}
577	}
578	return ascii ? CharsetUtil.US_ASCII : CharsetUtil.ISO_8859_1;
579	}
580
581	private static Encoding determineEncoding(byte[] bytes, Usage usage) {
582	if (bytes.length == 0)
583	return Encoding.Q;
584
585	BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
586	: Q_RESTRICTED_CHARS;
587
588	int qEncoded = 0;
589	for (int i = 0; i < bytes.length; i++) {
590	int v = bytes[i] & 0xff;
591	if (v != 32 && !qChars.get(v)) {
592	qEncoded++;
593	}
594	}
595
596	int percentage = qEncoded * 100 / bytes.length;
597	return percentage > 30 ? Encoding.B : Encoding.Q;
598	}
599
600	private static char hexDigit(int i) {
601	return i < 10 ? (char) (i + '0') : (char) (i - 10 + 'A');
602	}
603	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: contrib/MailArchiver/sources/vendor/mime4j/custom/core/src/main/java/org/apache/james/mime4j/codec/EncoderUtil.java @ 6785

Download in other formats: