1 | /**************************************************************** |
---|
2 | * Licensed to the Apache Software Foundation (ASF) under one * |
---|
3 | * or more contributor license agreements. See the NOTICE file * |
---|
4 | * distributed with this work for additional information * |
---|
5 | * regarding copyright ownership. The ASF licenses this file * |
---|
6 | * to you under the Apache License, Version 2.0 (the * |
---|
7 | * "License"); you may not use this file except in compliance * |
---|
8 | * with the License. You may obtain a copy of the License at * |
---|
9 | * * |
---|
10 | * http://www.apache.org/licenses/LICENSE-2.0 * |
---|
11 | * * |
---|
12 | * Unless required by applicable law or agreed to in writing, * |
---|
13 | * software distributed under the License is distributed on an * |
---|
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * |
---|
15 | * KIND, either express or implied. See the License for the * |
---|
16 | * specific language governing permissions and limitations * |
---|
17 | * under the License. * |
---|
18 | ****************************************************************/ |
---|
19 | |
---|
20 | package org.apache.james.mime4j.stream; |
---|
21 | |
---|
22 | import java.io.IOException; |
---|
23 | import java.io.InputStream; |
---|
24 | import java.io.InputStreamReader; |
---|
25 | import java.io.Reader; |
---|
26 | import java.nio.charset.Charset; |
---|
27 | import java.nio.charset.IllegalCharsetNameException; |
---|
28 | import java.nio.charset.UnsupportedCharsetException; |
---|
29 | import java.util.LinkedList; |
---|
30 | |
---|
31 | import org.apache.james.mime4j.MimeException; |
---|
32 | import org.apache.james.mime4j.codec.DecodeMonitor; |
---|
33 | import org.apache.james.mime4j.io.LineNumberInputStream; |
---|
34 | import org.apache.james.mime4j.io.LineNumberSource; |
---|
35 | import org.apache.james.mime4j.util.CharsetUtil; |
---|
36 | |
---|
37 | /** |
---|
38 | * <p> |
---|
39 | * Parses MIME (or RFC822) message streams of bytes or characters. |
---|
40 | * The stream is converted into an event stream. |
---|
41 | * <p> |
---|
42 | * <p> |
---|
43 | * Typical usage: |
---|
44 | * </p> |
---|
45 | * <pre> |
---|
46 | * MimeTokenStream stream = new MimeTokenStream(); |
---|
47 | * stream.parse(new FileInputStream("mime.msg")); |
---|
48 | * for (int state = stream.getState(); |
---|
49 | * state != MimeTokenStream.T_END_OF_STREAM; |
---|
50 | * state = stream.next()) { |
---|
51 | * switch (state) { |
---|
52 | * case MimeTokenStream.T_BODY: |
---|
53 | * System.out.println("Body detected, contents = " |
---|
54 | * + stream.getInputStream() + ", header data = " |
---|
55 | * + stream.getBodyDescriptor()); |
---|
56 | * break; |
---|
57 | * case MimeTokenStream.T_FIELD: |
---|
58 | * System.out.println("Header field detected: " |
---|
59 | * + stream.getField()); |
---|
60 | * break; |
---|
61 | * case MimeTokenStream.T_START_MULTIPART: |
---|
62 | * System.out.println("Multipart message detexted," |
---|
63 | * + " header data = " |
---|
64 | * + stream.getBodyDescriptor()); |
---|
65 | * ... |
---|
66 | * } |
---|
67 | * } |
---|
68 | * </pre> |
---|
69 | * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the |
---|
70 | * method {@link #parse(InputStream)} resets the token streams internal |
---|
71 | * state. However, they are definitely <em>not</em> thread safe. If you |
---|
72 | * have a multi threaded application, then the suggested use is to have |
---|
73 | * one instance per thread.</p> |
---|
74 | */ |
---|
75 | public class MimeTokenStream { |
---|
76 | |
---|
77 | private final MimeEntityConfig config; |
---|
78 | private final DecodeMonitor monitor; |
---|
79 | private final MutableBodyDescriptorFactory bodyDescFactory; |
---|
80 | private final LinkedList<EntityStateMachine> entities = new LinkedList<EntityStateMachine>(); |
---|
81 | |
---|
82 | private EntityState state = EntityState.T_END_OF_STREAM; |
---|
83 | private EntityStateMachine currentStateMachine; |
---|
84 | private RecursionMode recursionMode = RecursionMode.M_RECURSE; |
---|
85 | private MimeEntity rootentity; |
---|
86 | |
---|
87 | /** |
---|
88 | * Constructs a standard (lax) stream. |
---|
89 | * Optional validation events will be logged only. |
---|
90 | * Use {@link MimeEntityConfig#setStrictParsing(boolean)} to turn on strict |
---|
91 | * parsing mode and pass the config object to |
---|
92 | * {@link MimeTokenStream#MimeTokenStream(MimeEntityConfig)} to create |
---|
93 | * a stream that strictly validates the input. |
---|
94 | */ |
---|
95 | public MimeTokenStream() { |
---|
96 | this(new MimeEntityConfig()); |
---|
97 | } |
---|
98 | |
---|
99 | public MimeTokenStream(final MimeEntityConfig config) { |
---|
100 | this(config, null, null); |
---|
101 | } |
---|
102 | |
---|
103 | public MimeTokenStream( |
---|
104 | final MimeEntityConfig config, |
---|
105 | final MutableBodyDescriptorFactory bodyDescFactory) { |
---|
106 | this(config, bodyDescFactory, null); |
---|
107 | } |
---|
108 | |
---|
109 | public MimeTokenStream( |
---|
110 | final MimeEntityConfig config, |
---|
111 | final MutableBodyDescriptorFactory bodyDescFactory, |
---|
112 | final DecodeMonitor monitor) { |
---|
113 | super(); |
---|
114 | this.config = config; |
---|
115 | this.monitor = monitor != null ? monitor : |
---|
116 | (config.isStrictParsing() ? DecodeMonitor.STRICT : DecodeMonitor.SILENT); |
---|
117 | this.bodyDescFactory = bodyDescFactory; |
---|
118 | } |
---|
119 | |
---|
120 | /** Instructs the {@code MimeTokenStream} to parse the given streams contents. |
---|
121 | * If the {@code MimeTokenStream} has already been in use, resets the streams |
---|
122 | * internal state. |
---|
123 | */ |
---|
124 | public void parse(InputStream stream) { |
---|
125 | doParse(stream, newBodyDescriptor(), EntityState.T_START_MESSAGE); |
---|
126 | } |
---|
127 | |
---|
128 | /** Instructs the {@code MimeTokenStream} to parse the given content with |
---|
129 | * the content type. The message stream is assumed to have no message header |
---|
130 | * and is expected to begin with a message body. This can be the case when |
---|
131 | * the message content is transmitted using a different transport protocol |
---|
132 | * such as HTTP. |
---|
133 | * <p/> |
---|
134 | * If the {@code MimeTokenStream} has already been in use, resets the streams |
---|
135 | * internal state. |
---|
136 | */ |
---|
137 | public void parseHeadless(InputStream stream, String contentType) { |
---|
138 | if (contentType == null) { |
---|
139 | throw new IllegalArgumentException("Content type may not be null"); |
---|
140 | } |
---|
141 | MutableBodyDescriptor newBodyDescriptor = newBodyDescriptor(); |
---|
142 | try { |
---|
143 | newBodyDescriptor.addField(new RawField("Content-Type", contentType)); |
---|
144 | } catch (MimeException ex) { |
---|
145 | // should never happen |
---|
146 | throw new IllegalArgumentException(ex.getMessage()); |
---|
147 | } |
---|
148 | doParse(stream, newBodyDescriptor, EntityState.T_END_HEADER); |
---|
149 | try { |
---|
150 | next(); |
---|
151 | } catch (IOException e) { |
---|
152 | // Should never happend: the first next after END_HEADER does not produce IO |
---|
153 | throw new IllegalStateException(e); |
---|
154 | } catch (MimeException e) { |
---|
155 | // This should never happen |
---|
156 | throw new IllegalStateException(e); |
---|
157 | } |
---|
158 | } |
---|
159 | |
---|
160 | /** |
---|
161 | * Creates a new instance of {@link BodyDescriptor}. Subclasses may override |
---|
162 | * this in order to create body descriptors, that provide more specific |
---|
163 | * information. |
---|
164 | */ |
---|
165 | protected MutableBodyDescriptor newBodyDescriptor() { |
---|
166 | final MutableBodyDescriptor result; |
---|
167 | if (bodyDescFactory != null) { |
---|
168 | result = bodyDescFactory.newInstance(monitor); |
---|
169 | } else { |
---|
170 | result = new DefaultBodyDescriptor(null, monitor); |
---|
171 | } |
---|
172 | return result; |
---|
173 | } |
---|
174 | |
---|
175 | public void doParse(InputStream stream, |
---|
176 | MutableBodyDescriptor newBodyDescriptor, EntityState start) { |
---|
177 | LineNumberSource lineSource = null; |
---|
178 | if (config.isCountLineNumbers()) { |
---|
179 | LineNumberInputStream lineInput = new LineNumberInputStream(stream); |
---|
180 | lineSource = lineInput; |
---|
181 | stream = lineInput; |
---|
182 | } |
---|
183 | |
---|
184 | rootentity = new MimeEntity( |
---|
185 | lineSource, |
---|
186 | stream, |
---|
187 | newBodyDescriptor, |
---|
188 | start, |
---|
189 | EntityState.T_END_MESSAGE, |
---|
190 | config, |
---|
191 | monitor); |
---|
192 | |
---|
193 | rootentity.setRecursionMode(recursionMode); |
---|
194 | currentStateMachine = rootentity; |
---|
195 | entities.clear(); |
---|
196 | entities.add(currentStateMachine); |
---|
197 | state = currentStateMachine.getState(); |
---|
198 | } |
---|
199 | |
---|
200 | /** |
---|
201 | * Determines if this parser is currently in raw mode. |
---|
202 | * |
---|
203 | * @return <code>true</code> if in raw mode, <code>false</code> |
---|
204 | * otherwise. |
---|
205 | * @see #setRecursionMode(int) |
---|
206 | */ |
---|
207 | public boolean isRaw() { |
---|
208 | return recursionMode == RecursionMode.M_RAW; |
---|
209 | } |
---|
210 | |
---|
211 | /** |
---|
212 | * Gets the current recursion mode. |
---|
213 | * The recursion mode specifies the approach taken to parsing parts. |
---|
214 | * {@link #M_RAW} mode does not parse the part at all. |
---|
215 | * {@link #M_RECURSE} mode recursively parses each mail |
---|
216 | * when an <code>message/rfc822</code> part is encounted; |
---|
217 | * {@link #M_NO_RECURSE} does not. |
---|
218 | * @return {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE} |
---|
219 | */ |
---|
220 | public RecursionMode getRecursionMode() { |
---|
221 | return recursionMode; |
---|
222 | } |
---|
223 | |
---|
224 | /** |
---|
225 | * Sets the current recursion. |
---|
226 | * The recursion mode specifies the approach taken to parsing parts. |
---|
227 | * {@link #M_RAW} mode does not parse the part at all. |
---|
228 | * {@link #M_RECURSE} mode recursively parses each mail |
---|
229 | * when an <code>message/rfc822</code> part is encounted; |
---|
230 | * {@link #M_NO_RECURSE} does not. |
---|
231 | * @param mode {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE} |
---|
232 | */ |
---|
233 | public void setRecursionMode(RecursionMode mode) { |
---|
234 | recursionMode = mode; |
---|
235 | if (currentStateMachine != null) { |
---|
236 | currentStateMachine.setRecursionMode(mode); |
---|
237 | } |
---|
238 | } |
---|
239 | |
---|
240 | /** |
---|
241 | * Finishes the parsing and stops reading lines. |
---|
242 | * NOTE: No more lines will be parsed but the parser |
---|
243 | * will still trigger 'end' events to match previously |
---|
244 | * triggered 'start' events. |
---|
245 | */ |
---|
246 | public void stop() { |
---|
247 | rootentity.stop(); |
---|
248 | } |
---|
249 | |
---|
250 | /** |
---|
251 | * Returns the current state. |
---|
252 | */ |
---|
253 | public EntityState getState() { |
---|
254 | return state; |
---|
255 | } |
---|
256 | |
---|
257 | /** |
---|
258 | * This method returns the raw entity, preamble, or epilogue contents. |
---|
259 | * <p/> |
---|
260 | * This method is valid, if {@link #getState()} returns either of |
---|
261 | * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}. |
---|
262 | * |
---|
263 | * @return Data stream, depending on the current state. |
---|
264 | * @throws IllegalStateException {@link #getState()} returns an |
---|
265 | * invalid value. |
---|
266 | */ |
---|
267 | public InputStream getInputStream() { |
---|
268 | return currentStateMachine.getContentStream(); |
---|
269 | } |
---|
270 | |
---|
271 | /** |
---|
272 | * This method returns a transfer decoded stream based on the MIME |
---|
273 | * fields with the standard defaults. |
---|
274 | * <p/> |
---|
275 | * This method is valid, if {@link #getState()} returns either of |
---|
276 | * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}. |
---|
277 | * |
---|
278 | * @return Data stream, depending on the current state. |
---|
279 | * @throws IllegalStateException {@link #getState()} returns an |
---|
280 | * invalid value. |
---|
281 | */ |
---|
282 | public InputStream getDecodedInputStream() { |
---|
283 | return currentStateMachine.getDecodedContentStream(); |
---|
284 | } |
---|
285 | |
---|
286 | /** |
---|
287 | * Gets a reader configured for the current body or body part. |
---|
288 | * The reader will return a transfer and charset decoded |
---|
289 | * stream of characters based on the MIME fields with the standard |
---|
290 | * defaults. |
---|
291 | * This is a conveniance method and relies on {@link #getInputStream()}. |
---|
292 | * Consult the javadoc for that method for known limitations. |
---|
293 | * |
---|
294 | * @return <code>Reader</code>, not null |
---|
295 | * @see #getInputStream |
---|
296 | * @throws IllegalStateException {@link #getState()} returns an |
---|
297 | * invalid value |
---|
298 | * @throws UnsupportedCharsetException if there is no JVM support |
---|
299 | * for decoding the charset |
---|
300 | * @throws IllegalCharsetNameException if the charset name specified |
---|
301 | * in the mime type is illegal |
---|
302 | */ |
---|
303 | public Reader getReader() { |
---|
304 | final BodyDescriptor bodyDescriptor = getBodyDescriptor(); |
---|
305 | final String mimeCharset = bodyDescriptor.getCharset(); |
---|
306 | final Charset charset; |
---|
307 | if (mimeCharset == null || "".equals(mimeCharset)) { |
---|
308 | charset = CharsetUtil.US_ASCII; |
---|
309 | } else { |
---|
310 | charset = Charset.forName(mimeCharset); |
---|
311 | } |
---|
312 | final InputStream instream = getDecodedInputStream(); |
---|
313 | return new InputStreamReader(instream, charset); |
---|
314 | } |
---|
315 | |
---|
316 | /** |
---|
317 | * <p>Gets a descriptor for the current entity. |
---|
318 | * This method is valid if {@link #getState()} returns:</p> |
---|
319 | * <ul> |
---|
320 | * <li>{@link #T_BODY}</li> |
---|
321 | * <li>{@link #T_START_MULTIPART}</li> |
---|
322 | * <li>{@link #T_EPILOGUE}</li> |
---|
323 | * <li>{@link #T_PREAMBLE}</li> |
---|
324 | * </ul> |
---|
325 | * @return <code>BodyDescriptor</code>, not nulls |
---|
326 | */ |
---|
327 | public BodyDescriptor getBodyDescriptor() { |
---|
328 | return currentStateMachine.getBodyDescriptor(); |
---|
329 | } |
---|
330 | |
---|
331 | /** |
---|
332 | * This method is valid, if {@link #getState()} returns {@link #T_FIELD}. |
---|
333 | * @return String with the fields raw contents. |
---|
334 | * @throws IllegalStateException {@link #getState()} returns another |
---|
335 | * value than {@link #T_FIELD}. |
---|
336 | */ |
---|
337 | public RawField getField() { |
---|
338 | return currentStateMachine.getField(); |
---|
339 | } |
---|
340 | |
---|
341 | /** |
---|
342 | * This method advances the token stream to the next token. |
---|
343 | * @throws IllegalStateException The method has been called, although |
---|
344 | * {@link #getState()} was already {@link #T_END_OF_STREAM}. |
---|
345 | */ |
---|
346 | public EntityState next() throws IOException, MimeException { |
---|
347 | if (state == EntityState.T_END_OF_STREAM || currentStateMachine == null) { |
---|
348 | throw new IllegalStateException("No more tokens are available."); |
---|
349 | } |
---|
350 | while (currentStateMachine != null) { |
---|
351 | EntityStateMachine next = currentStateMachine.advance(); |
---|
352 | if (next != null) { |
---|
353 | entities.add(next); |
---|
354 | currentStateMachine = next; |
---|
355 | } |
---|
356 | state = currentStateMachine.getState(); |
---|
357 | if (state != EntityState.T_END_OF_STREAM) { |
---|
358 | return state; |
---|
359 | } |
---|
360 | entities.removeLast(); |
---|
361 | if (entities.isEmpty()) { |
---|
362 | currentStateMachine = null; |
---|
363 | } else { |
---|
364 | currentStateMachine = entities.getLast(); |
---|
365 | currentStateMachine.setRecursionMode(recursionMode); |
---|
366 | } |
---|
367 | } |
---|
368 | state = EntityState.T_END_OF_STREAM; |
---|
369 | return state; |
---|
370 | } |
---|
371 | |
---|
372 | /** |
---|
373 | * Renders a state as a string suitable for logging. |
---|
374 | * @param state |
---|
375 | * @return rendered as string, not null |
---|
376 | */ |
---|
377 | public static final String stateToString(EntityState state) { |
---|
378 | return AbstractEntity.stateToString(state); |
---|
379 | } |
---|
380 | |
---|
381 | |
---|
382 | public MimeEntityConfig getConfig() { |
---|
383 | return config; |
---|
384 | } |
---|
385 | } |
---|