Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

EncoderUtil.java @ 6785

Revision 6785, 21.6 KB checked in by rafaelraymundo, 12 years ago (diff)
Ticket #2946 - Liberado código do MailArchiver. Documentação na subpasta DOCS.

Rev	Line
[6785]	1	/****************************************************************
	2	* Licensed to the Apache Software Foundation (ASF) under one *
	3	* or more contributor license agreements. See the NOTICE file *
	4	* distributed with this work for additional information *
	5	* regarding copyright ownership. The ASF licenses this file *
	6	* to you under the Apache License, Version 2.0 (the *
	7	* "License"); you may not use this file except in compliance *
	8	* with the License. You may obtain a copy of the License at *
	9	* *
	10	* http://www.apache.org/licenses/LICENSE-2.0 *
	11	* *
	12	* Unless required by applicable law or agreed to in writing, *
	13	* software distributed under the License is distributed on an *
	14	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
	15	* KIND, either express or implied. See the License for the *
	16	* specific language governing permissions and limitations *
	17	* under the License. *
	18	****************************************************************/
	19
	20	package org.apache.james.mime4j.codec;
	21
	22	import java.nio.ByteBuffer;
	23	import java.nio.charset.Charset;
	24	import java.util.BitSet;
	25	import java.util.Locale;
	26
	27	import org.apache.james.mime4j.util.CharsetUtil;
	28
	29	/**
	30	* Static methods for encoding header field values. This includes encoded-words
	31	* as defined in <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a>
	32	* or display-names of an e-mail address, for example.
	33	*/
	34	public class EncoderUtil {
	35	private static final byte[] BASE64_TABLE = Base64OutputStream.BASE64_TABLE;
	36	private static final char BASE64_PAD = '=';
	37
	38	private static final BitSet Q_REGULAR_CHARS = initChars("=_?");
	39
	40	private static final BitSet Q_RESTRICTED_CHARS = initChars("=_?\"#$%&'(),.:;<>@[\\]^`{\|}~");
	41
	42	private static final int MAX_USED_CHARACTERS = 50;
	43
	44	private static final String ENC_WORD_PREFIX = "=?";
	45	private static final String ENC_WORD_SUFFIX = "?=";
	46
	47	private static final int ENCODED_WORD_MAX_LENGTH = 75; // RFC 2047
	48
	49	private static final BitSet TOKEN_CHARS = initChars("()<>@,;:\\\"/[]?=");
	50
	51	private static final BitSet ATEXT_CHARS = initChars("()<>@.,;:\\\"[]");
	52
	53	private static BitSet initChars(String specials) {
	54	BitSet bs = new BitSet(128);
	55	for (char ch = 33; ch < 127; ch++) {
	56	if (specials.indexOf(ch) == -1) {
	57	bs.set(ch);
	58	}
	59	}
	60	return bs;
	61	}
	62
	63	/**
	64	* Selects one of the two encodings specified in RFC 2047.
	65	*/
	66	public enum Encoding {
	67	/** The B encoding (identical to base64 defined in RFC 2045). */
	68	B,
	69	/** The Q encoding (similar to quoted-printable defined in RFC 2045). */
	70	Q
	71	}
	72
	73	/**
	74	* Indicates the intended usage of an encoded word.
	75	*/
	76	public enum Usage {
	77	/**
	78	* Encoded word is used to replace a 'text' token in any Subject or
	79	* Comments header field.
	80	*/
	81	TEXT_TOKEN,
	82	/**
	83	* Encoded word is used to replace a 'word' entity within a 'phrase',
	84	* for example, one that precedes an address in a From, To, or Cc
	85	* header.
	86	*/
	87	WORD_ENTITY
	88	}
	89
	90	private EncoderUtil() {
	91	}
	92
	93	/**
	94	* Encodes the display-name portion of an address. See <a
	95	* href='http://www.faqs.org/rfcs/rfc5322.html'>RFC 5322</a> section 3.4
	96	* and <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a> section
	97	* 5.3. The specified string should not be folded.
	98	*
	99	* @param displayName
	100	* display-name to encode.
	101	* @return encoded display-name.
	102	*/
	103	public static String encodeAddressDisplayName(String displayName) {
	104	// display-name = phrase
	105	// phrase = 1*( encoded-word / word )
	106	// word = atom / quoted-string
	107	// atom = [CFWS] 1*atext [CFWS]
	108	// CFWS = comment or folding white space
	109
	110	if (isAtomPhrase(displayName)) {
	111	return displayName;
	112	} else if (hasToBeEncoded(displayName, 0)) {
	113	return encodeEncodedWord(displayName, Usage.WORD_ENTITY);
	114	} else {
	115	return quote(displayName);
	116	}
	117	}
	118
	119	/**
	120	* Encodes the local part of an address specification as described in RFC
	121	* 5322 section 3.4.1. Leading and trailing CFWS should have been removed
	122	* before calling this method. The specified string should not contain any
	123	* illegal (control or non-ASCII) characters.
	124	*
	125	* @param localPart
	126	* the local part to encode
	127	* @return the encoded local part.
	128	*/
	129	public static String encodeAddressLocalPart(String localPart) {
	130	// local-part = dot-atom / quoted-string
	131	// dot-atom = [CFWS] dot-atom-text [CFWS]
	132	// CFWS = comment or folding white space
	133
	134	if (isDotAtomText(localPart)) {
	135	return localPart;
	136	} else {
	137	return quote(localPart);
	138	}
	139	}
	140
	141	/**
	142	* Encodes the specified strings into a header parameter as described in RFC
	143	* 2045 section 5.1 and RFC 2183 section 2. The specified strings should not
	144	* contain any illegal (control or non-ASCII) characters.
	145	*
	146	* @param name
	147	* parameter name.
	148	* @param value
	149	* parameter value.
	150	* @return encoded result.
	151	*/
	152	public static String encodeHeaderParameter(String name, String value) {
	153	name = name.toLowerCase(Locale.US);
	154
	155	// value := token / quoted-string
	156	if (isToken(value)) {
	157	return name + "=" + value;
	158	} else {
	159	return name + "=" + quote(value);
	160	}
	161	}
	162
	163	/**
	164	* Shortcut method that encodes the specified text into an encoded-word if
	165	* the text has to be encoded.
	166	*
	167	* @param text
	168	* text to encode.
	169	* @param usage
	170	* whether the encoded-word is to be used to replace a text token
	171	* or a word entity (see RFC 822).
	172	* @param usedCharacters
	173	* number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
	174	* @return the specified text if encoding is not necessary or an encoded
	175	* word or a sequence of encoded words otherwise.
	176	*/
	177	public static String encodeIfNecessary(String text, Usage usage,
	178	int usedCharacters) {
	179	if (hasToBeEncoded(text, usedCharacters))
	180	return encodeEncodedWord(text, usage, usedCharacters);
	181	else
	182	return text;
	183	}
	184
	185	/**
	186	* Determines if the specified string has to encoded into an encoded-word.
	187	* Returns <code>true</code> if the text contains characters that don't
	188	* fall into the printable ASCII character set or if the text contains a
	189	* 'word' (sequence of non-whitespace characters) longer than 77 characters
	190	* (including characters already used up in the line).
	191	*
	192	* @param text
	193	* text to analyze.
	194	* @param usedCharacters
	195	* number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
	196	* @return <code>true</code> if the specified text has to be encoded into
	197	* an encoded-word, <code>false</code> otherwise.
	198	*/
	199	public static boolean hasToBeEncoded(String text, int usedCharacters) {
	200	if (text == null)
	201	throw new IllegalArgumentException();
	202	if (usedCharacters < 0 \|\| usedCharacters > MAX_USED_CHARACTERS)
	203	throw new IllegalArgumentException();
	204
	205	int nonWhiteSpaceCount = usedCharacters;
	206
	207	for (int idx = 0; idx < text.length(); idx++) {
	208	char ch = text.charAt(idx);
	209	if (ch == '\t' \|\| ch == ' ') {
	210	nonWhiteSpaceCount = 0;
	211	} else {
	212	nonWhiteSpaceCount++;
	213	if (nonWhiteSpaceCount > 77) {
	214	// Line cannot be folded into multiple lines with no more
	215	// than 78 characters each. Encoding as encoded-words makes
	216	// that possible. One character has to be reserved for
	217	// folding white space; that leaves 77 characters.
	218	return true;
	219	}
	220
	221	if (ch < 32 \|\| ch >= 127) {
	222	// non-printable ascii character has to be encoded
	223	return true;
	224	}
	225	}
	226	}
	227
	228	return false;
	229	}
	230
	231	/**
	232	* Encodes the specified text into an encoded word or a sequence of encoded
	233	* words separated by space. The text is separated into a sequence of
	234	* encoded words if it does not fit in a single one.
	235	* <p>
	236	* The charset to encode the specified text into a byte array and the
	237	* encoding to use for the encoded-word are detected automatically.
	238	* <p>
	239	* This method assumes that zero characters have already been used up in the
	240	* current line.
	241	*
	242	* @param text
	243	* text to encode.
	244	* @param usage
	245	* whether the encoded-word is to be used to replace a text token
	246	* or a word entity (see RFC 822).
	247	* @return the encoded word (or sequence of encoded words if the given text
	248	* does not fit in a single encoded word).
	249	* @see #hasToBeEncoded(String, int)
	250	*/
	251	public static String encodeEncodedWord(String text, Usage usage) {
	252	return encodeEncodedWord(text, usage, 0, null, null);
	253	}
	254
	255	/**
	256	* Encodes the specified text into an encoded word or a sequence of encoded
	257	* words separated by space. The text is separated into a sequence of
	258	* encoded words if it does not fit in a single one.
	259	* <p>
	260	* The charset to encode the specified text into a byte array and the
	261	* encoding to use for the encoded-word are detected automatically.
	262	*
	263	* @param text
	264	* text to encode.
	265	* @param usage
	266	* whether the encoded-word is to be used to replace a text token
	267	* or a word entity (see RFC 822).
	268	* @param usedCharacters
	269	* number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
	270	* @return the encoded word (or sequence of encoded words if the given text
	271	* does not fit in a single encoded word).
	272	* @see #hasToBeEncoded(String, int)
	273	*/
	274	public static String encodeEncodedWord(String text, Usage usage,
	275	int usedCharacters) {
	276	return encodeEncodedWord(text, usage, usedCharacters, null, null);
	277	}
	278
	279	/**
	280	* Encodes the specified text into an encoded word or a sequence of encoded
	281	* words separated by space. The text is separated into a sequence of
	282	* encoded words if it does not fit in a single one.
	283	*
	284	* @param text
	285	* text to encode.
	286	* @param usage
	287	* whether the encoded-word is to be used to replace a text token
	288	* or a word entity (see RFC 822).
	289	* @param usedCharacters
	290	* number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
	291	* @param charset
	292	* the Java charset that should be used to encode the specified
	293	* string into a byte array. A suitable charset is detected
	294	* automatically if this parameter is <code>null</code>.
	295	* @param encoding
	296	* the encoding to use for the encoded-word (either B or Q). A
	297	* suitable encoding is automatically chosen if this parameter is
	298	* <code>null</code>.
	299	* @return the encoded word (or sequence of encoded words if the given text
	300	* does not fit in a single encoded word).
	301	* @see #hasToBeEncoded(String, int)
	302	*/
	303	public static String encodeEncodedWord(String text, Usage usage,
	304	int usedCharacters, Charset charset, Encoding encoding) {
	305	if (text == null)
	306	throw new IllegalArgumentException();
	307	if (usedCharacters < 0 \|\| usedCharacters > MAX_USED_CHARACTERS)
	308	throw new IllegalArgumentException();
	309
	310	if (charset == null)
	311	charset = determineCharset(text);
	312
	313	String mimeCharset = CharsetUtil.toMimeCharset(charset.name());
	314	if (mimeCharset == null) {
	315	// cannot happen if charset was originally null
	316	throw new IllegalArgumentException("Unsupported charset");
	317	}
	318
	319	byte[] bytes = encode(text, charset);
	320
	321	if (encoding == null)
	322	encoding = determineEncoding(bytes, usage);
	323
	324	if (encoding == Encoding.B) {
	325	String prefix = ENC_WORD_PREFIX + mimeCharset + "?B?";
	326	return encodeB(prefix, text, usedCharacters, charset, bytes);
	327	} else {
	328	String prefix = ENC_WORD_PREFIX + mimeCharset + "?Q?";
	329	return encodeQ(prefix, text, usage, usedCharacters, charset, bytes);
	330	}
	331	}
	332
	333	/**
	334	* Encodes the specified byte array using the B encoding defined in RFC
	335	* 2047.
	336	*
	337	* @param bytes
	338	* byte array to encode.
	339	* @return encoded string.
	340	*/
	341	public static String encodeB(byte[] bytes) {
	342	StringBuilder sb = new StringBuilder();
	343
	344	int idx = 0;
	345	final int end = bytes.length;
	346	for (; idx < end - 2; idx += 3) {
	347	int data = (bytes[idx] & 0xff) << 16 \| (bytes[idx + 1] & 0xff) << 8
	348	\| bytes[idx + 2] & 0xff;
	349	sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
	350	sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
	351	sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
	352	sb.append((char) BASE64_TABLE[data & 0x3f]);
	353	}
	354
	355	if (idx == end - 2) {
	356	int data = (bytes[idx] & 0xff) << 16 \| (bytes[idx + 1] & 0xff) << 8;
	357	sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
	358	sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
	359	sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
	360	sb.append(BASE64_PAD);
	361
	362	} else if (idx == end - 1) {
	363	int data = (bytes[idx] & 0xff) << 16;
	364	sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
	365	sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
	366	sb.append(BASE64_PAD);
	367	sb.append(BASE64_PAD);
	368	}
	369
	370	return sb.toString();
	371	}
	372
	373	/**
	374	* Encodes the specified byte array using the Q encoding defined in RFC
	375	* 2047.
	376	*
	377	* @param bytes
	378	* byte array to encode.
	379	* @param usage
	380	* whether the encoded-word is to be used to replace a text token
	381	* or a word entity (see RFC 822).
	382	* @return encoded string.
	383	*/
	384	public static String encodeQ(byte[] bytes, Usage usage) {
	385	BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
	386	: Q_RESTRICTED_CHARS;
	387
	388	StringBuilder sb = new StringBuilder();
	389
	390	final int end = bytes.length;
	391	for (int idx = 0; idx < end; idx++) {
	392	int v = bytes[idx] & 0xff;
	393	if (v == 32) {
	394	sb.append('_');
	395	} else if (!qChars.get(v)) {
	396	sb.append('=');
	397	sb.append(hexDigit(v >>> 4));
	398	sb.append(hexDigit(v & 0xf));
	399	} else {
	400	sb.append((char) v);
	401	}
	402	}
	403
	404	return sb.toString();
	405	}
	406
	407	/**
	408	* Tests whether the specified string is a token as defined in RFC 2045
	409	* section 5.1.
	410	*
	411	* @param str
	412	* string to test.
	413	* @return <code>true</code> if the specified string is a RFC 2045 token,
	414	* <code>false</code> otherwise.
	415	*/
	416	public static boolean isToken(String str) {
	417	// token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
	418	// tspecials := "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "\" /
	419	// <"> / "/" / "[" / "]" / "?" / "="
	420	// CTL := 0.- 31., 127.
	421
	422	final int length = str.length();
	423	if (length == 0)
	424	return false;
	425
	426	for (int idx = 0; idx < length; idx++) {
	427	char ch = str.charAt(idx);
	428	if (!TOKEN_CHARS.get(ch))
	429	return false;
	430	}
	431
	432	return true;
	433	}
	434
	435	private static boolean isAtomPhrase(String str) {
	436	// atom = [CFWS] 1*atext [CFWS]
	437
	438	boolean containsAText = false;
	439
	440	final int length = str.length();
	441	for (int idx = 0; idx < length; idx++) {
	442	char ch = str.charAt(idx);
	443	if (ATEXT_CHARS.get(ch)) {
	444	containsAText = true;
	445	} else if (!CharsetUtil.isWhitespace(ch)) {
	446	return false;
	447	}
	448	}
	449
	450	return containsAText;
	451	}
	452
	453	// RFC 5322 section 3.2.3
	454	private static boolean isDotAtomText(String str) {
	455	// dot-atom-text = 1atext ("." 1*atext)
	456	// atext = ALPHA / DIGIT / "!" / "#" / "$" / "%" / "&" / "'" / "*" /
	457	// "+" / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "\|" / "}" / "~"
	458
	459	char prev = '.';
	460
	461	final int length = str.length();
	462	if (length == 0)
	463	return false;
	464
	465	for (int idx = 0; idx < length; idx++) {
	466	char ch = str.charAt(idx);
	467
	468	if (ch == '.') {
	469	if (prev == '.' \|\| idx == length - 1)
	470	return false;
	471	} else {
	472	if (!ATEXT_CHARS.get(ch))
	473	return false;
	474	}
	475
	476	prev = ch;
	477	}
	478
	479	return true;
	480	}
	481
	482	// RFC 5322 section 3.2.4
	483	private static String quote(String str) {
	484	// quoted-string = [CFWS] DQUOTE *([FWS] qcontent) [FWS] DQUOTE [CFWS]
	485	// qcontent = qtext / quoted-pair
	486	// qtext = %d33 / %d35-91 / %d93-126
	487	// quoted-pair = ("\" (VCHAR / WSP))
	488	// VCHAR = %x21-7E
	489	// DQUOTE = %x22
	490
	491	String escaped = str.replaceAll("[\\\\\"]", "\\\\$0");
	492	return "\"" + escaped + "\"";
	493	}
	494
	495	private static String encodeB(String prefix, String text,
	496	int usedCharacters, Charset charset, byte[] bytes) {
	497	int encodedLength = bEncodedLength(bytes);
	498
	499	int totalLength = prefix.length() + encodedLength
	500	+ ENC_WORD_SUFFIX.length();
	501	if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
	502	return prefix + encodeB(bytes) + ENC_WORD_SUFFIX;
	503	} else {
	504	String part1 = text.substring(0, text.length() / 2);
	505	byte[] bytes1 = encode(part1, charset);
	506	String word1 = encodeB(prefix, part1, usedCharacters, charset,
	507	bytes1);
	508
	509	String part2 = text.substring(text.length() / 2);
	510	byte[] bytes2 = encode(part2, charset);
	511	String word2 = encodeB(prefix, part2, 0, charset, bytes2);
	512
	513	return word1 + " " + word2;
	514	}
	515	}
	516
	517	private static int bEncodedLength(byte[] bytes) {
	518	return (bytes.length + 2) / 3 * 4;
	519	}
	520
	521	private static String encodeQ(String prefix, String text, Usage usage,
	522	int usedCharacters, Charset charset, byte[] bytes) {
	523	int encodedLength = qEncodedLength(bytes, usage);
	524
	525	int totalLength = prefix.length() + encodedLength
	526	+ ENC_WORD_SUFFIX.length();
	527	if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
	528	return prefix + encodeQ(bytes, usage) + ENC_WORD_SUFFIX;
	529	} else {
	530	String part1 = text.substring(0, text.length() / 2);
	531	byte[] bytes1 = encode(part1, charset);
	532	String word1 = encodeQ(prefix, part1, usage, usedCharacters,
	533	charset, bytes1);
	534
	535	String part2 = text.substring(text.length() / 2);
	536	byte[] bytes2 = encode(part2, charset);
	537	String word2 = encodeQ(prefix, part2, usage, 0, charset, bytes2);
	538
	539	return word1 + " " + word2;
	540	}
	541	}
	542
	543	private static int qEncodedLength(byte[] bytes, Usage usage) {
	544	BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
	545	: Q_RESTRICTED_CHARS;
	546
	547	int count = 0;
	548
	549	for (int idx = 0; idx < bytes.length; idx++) {
	550	int v = bytes[idx] & 0xff;
	551	if (v == 32) {
	552	count++;
	553	} else if (!qChars.get(v)) {
	554	count += 3;
	555	} else {
	556	count++;
	557	}
	558	}
	559
	560	return count;
	561	}
	562
	563	private static byte[] encode(String text, Charset charset) {
	564	ByteBuffer buffer = charset.encode(text);
	565	byte[] bytes = new byte[buffer.limit()];
	566	buffer.get(bytes);
	567	return bytes;
	568	}
	569
	570	private static Charset determineCharset(String text) {
	571	// it is an important property of iso-8859-1 that it directly maps
	572	// unicode code points 0000 to 00ff to byte values 00 to ff.
	573	boolean ascii = true;
	574	final int len = text.length();
	575	for (int index = 0; index < len; index++) {
	576	char ch = text.charAt(index);
	577	if (ch > 0xff) {
	578	return CharsetUtil.UTF_8;
	579	}
	580	if (ch > 0x7f) {
	581	ascii = false;
	582	}
	583	}
	584	return ascii ? CharsetUtil.US_ASCII : CharsetUtil.ISO_8859_1;
	585	}
	586
	587	private static Encoding determineEncoding(byte[] bytes, Usage usage) {
	588	if (bytes.length == 0)
	589	return Encoding.Q;
	590
	591	BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
	592	: Q_RESTRICTED_CHARS;
	593
	594	int qEncoded = 0;
	595	for (int i = 0; i < bytes.length; i++) {
	596	int v = bytes[i] & 0xff;
	597	if (v != 32 && !qChars.get(v)) {
	598	qEncoded++;
	599	}
	600	}
	601
	602	int percentage = qEncoded * 100 / bytes.length;
	603	return percentage > 30 ? Encoding.B : Encoding.Q;
	604	}
	605
	606	private static char hexDigit(int i) {
	607	return i < 10 ? (char) (i + '0') : (char) (i - 10 + 'A');
	608	}
	609	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: contrib/MailArchiver/sources/vendor/mime4j/apache-mime4j-0.7-SNAPSHOT-20110327.010440-17/core/src/main/java/org/apache/james/mime4j/codec/EncoderUtil.java @ 6785

Download in other formats: