1 | /** |
---|
2 | * MailArchiver is an application that provides services for storing and managing e-mail messages through a Web Services SOAP interface. |
---|
3 | * Copyright (C) 2012 Marcio Andre Scholl Levien and Fernando Alberto Reuter Wendt and Jose Ronaldo Nogueira Fonseca Junior |
---|
4 | * |
---|
5 | * This program is free software: you can redistribute it and/or modify |
---|
6 | * it under the terms of the GNU Affero General Public License as |
---|
7 | * published by the Free Software Foundation, either version 3 of the |
---|
8 | * License, or (at your option) any later version. |
---|
9 | * |
---|
10 | * This program is distributed in the hope that it will be useful, |
---|
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
13 | * GNU Affero General Public License for more details. |
---|
14 | * |
---|
15 | * You should have received a copy of the GNU Affero General Public License |
---|
16 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
17 | */ |
---|
18 | |
---|
19 | /******************************************************************************\ |
---|
20 | * |
---|
21 | * This product was developed by |
---|
22 | * |
---|
23 | * SERVIÇO FEDERAL DE PROCESSAMENTO DE DADOS (SERPRO), |
---|
24 | * |
---|
25 | * a government company established under Brazilian law (5.615/70), |
---|
26 | * at Department of Development of Porto Alegre. |
---|
27 | * |
---|
28 | \******************************************************************************/ |
---|
29 | |
---|
30 | package serpro.mailarchiver.domain.metaarchive; |
---|
31 | |
---|
32 | import java.io.BufferedReader; |
---|
33 | import java.io.IOException; |
---|
34 | import java.io.InputStream; |
---|
35 | import java.io.InputStreamReader; |
---|
36 | import java.nio.charset.Charset; |
---|
37 | import java.util.regex.Matcher; |
---|
38 | import java.util.regex.Pattern; |
---|
39 | |
---|
40 | import javax.jdo.JDOHelper; |
---|
41 | import javax.jdo.annotations.NotPersistent; |
---|
42 | import javax.jdo.annotations.PersistenceCapable; |
---|
43 | |
---|
44 | import net.htmlparser.jericho.Element; |
---|
45 | import net.htmlparser.jericho.HTMLElementName; |
---|
46 | import net.htmlparser.jericho.Source; |
---|
47 | import net.htmlparser.jericho.Tag; |
---|
48 | |
---|
49 | import org.apache.commons.lang3.StringEscapeUtils; |
---|
50 | import org.apache.commons.lang3.mutable.MutableBoolean; |
---|
51 | |
---|
52 | import org.apache.james.mime4j.util.CharsetUtil; |
---|
53 | |
---|
54 | import serpro.mailarchiver.util.BodyVisitor; |
---|
55 | import serpro.mailarchiver.util.Charsets; |
---|
56 | import serpro.mailarchiver.util.Logger; |
---|
57 | |
---|
58 | @PersistenceCapable |
---|
59 | public class TextBody |
---|
60 | extends SingleBody |
---|
61 | { |
---|
62 | @NotPersistent |
---|
63 | private static final Logger log = Logger.getLocalLogger(); |
---|
64 | |
---|
65 | @NotPersistent private static final String CR = "\015"; |
---|
66 | @NotPersistent private static final String LF = "\012"; |
---|
67 | @NotPersistent private static final String CRLF = "\015\012"; |
---|
68 | |
---|
69 | //**** P E R S I S T E N T **** |
---|
70 | private String preview; |
---|
71 | //***************************** |
---|
72 | |
---|
73 | public final String getPreview() { |
---|
74 | return preview; |
---|
75 | } |
---|
76 | |
---|
77 | public final void setPreview(String preview) { |
---|
78 | this.preview = preview; |
---|
79 | } |
---|
80 | |
---|
81 | //-------------------------------------------------------------------------- |
---|
82 | @Override |
---|
83 | final String toString(String pad) { |
---|
84 | return String.format( |
---|
85 | "TextBody%n" |
---|
86 | + "%1$sjdoState: %2$s%n" |
---|
87 | + "%1$soid: %3$s%n" |
---|
88 | + "%1$shash: %4$x%n" |
---|
89 | + "%1$soffset: %5$d%n" |
---|
90 | + "%1$slength: %6$d%n" |
---|
91 | + "%1$ssize: %7$d%n" |
---|
92 | + "%1$spreview: %8$s" |
---|
93 | , pad |
---|
94 | , JDOHelper.getObjectState(this) |
---|
95 | , getOid() |
---|
96 | , hashCode() |
---|
97 | , getOffset() |
---|
98 | , getLength() |
---|
99 | , getSize() |
---|
100 | , getPreview()); |
---|
101 | } |
---|
102 | |
---|
103 | //-------------------------------------------------------------------------- |
---|
104 | public String getText() throws IOException { |
---|
105 | Entity entity = getEntity(); |
---|
106 | if(entity != null) { |
---|
107 | ContentTypeField contentTypeField = entity.getContentTypeField(); |
---|
108 | |
---|
109 | Charset cs = null; |
---|
110 | if(contentTypeField != null) { |
---|
111 | cs = CharsetUtil.lookup(contentTypeField.getCharset()); |
---|
112 | } |
---|
113 | if(cs == null) { |
---|
114 | cs = Charsets.Windows_1252; |
---|
115 | } |
---|
116 | |
---|
117 | InputStream is = getDecoderInputStream(); |
---|
118 | InputStreamReader isr = new InputStreamReader(is, cs); |
---|
119 | BufferedReader reader = new BufferedReader(isr); |
---|
120 | |
---|
121 | StringBuilder sb = new StringBuilder(); |
---|
122 | |
---|
123 | char[] cbuf = new char[0x1000]; |
---|
124 | int len; |
---|
125 | while((len = reader.read(cbuf)) > 0) { |
---|
126 | sb.append(cbuf, 0, len); |
---|
127 | } |
---|
128 | |
---|
129 | reader.close(); |
---|
130 | isr.close(); |
---|
131 | is.close(); |
---|
132 | |
---|
133 | return sb.toString(); |
---|
134 | } |
---|
135 | throw new IllegalStateException(); |
---|
136 | } |
---|
137 | |
---|
138 | public String getAdaptedText() throws IOException { |
---|
139 | String text = getText(); |
---|
140 | Entity entity = getEntity(); |
---|
141 | if(entity != null) { |
---|
142 | ContentTypeField contentTypeField = entity.getContentTypeField(); |
---|
143 | if((contentTypeField == null) || (contentTypeField.isTextPlainMimeType())) { |
---|
144 | text = text.replaceAll(CRLF + "|" + CR + "|" + LF, "<br>" + LF); |
---|
145 | } |
---|
146 | } |
---|
147 | |
---|
148 | text = StringEscapeUtils.unescapeHtml4(text); |
---|
149 | text = replaceMailToRef(text); |
---|
150 | text = replaceContentIdRef(text); |
---|
151 | text = replaceUrlRef(text); |
---|
152 | return text; |
---|
153 | } |
---|
154 | |
---|
155 | //-------------------------------------------------------------------------- |
---|
156 | @NotPersistent |
---|
157 | private static final String contentIdRegex = "src\\s*=(\\.*(?!cid))(" + |
---|
158 | "'\\s*cid:\\s*([^'\\s<>]*)\\s*'" + |
---|
159 | "|" + |
---|
160 | "\"\\s*cid:\\s*([^\"\\s<>]*)\\s*\"" + |
---|
161 | "|" + |
---|
162 | "cid:\\s*([^\\s<>]+)" + |
---|
163 | ")"; |
---|
164 | @NotPersistent |
---|
165 | private static final Pattern contentIdPattern = Pattern.compile(contentIdRegex, Pattern.CASE_INSENSITIVE); |
---|
166 | |
---|
167 | private String replaceContentIdRef(String text) { |
---|
168 | Matcher m = contentIdPattern.matcher(text); |
---|
169 | final StringBuilder sb = new StringBuilder(); |
---|
170 | int lastEnd = 0; |
---|
171 | while(m.find()) { |
---|
172 | sb.append(text.substring(lastEnd, m.start())) |
---|
173 | .append("src=\""); |
---|
174 | |
---|
175 | final String cid = |
---|
176 | m.group(4) != null ? m.group(4) : |
---|
177 | m.group(5) != null ? m.group(5) : |
---|
178 | m.group(6) != null ? m.group(6) : |
---|
179 | null; |
---|
180 | |
---|
181 | final MutableBoolean binaryBodyFound = new MutableBoolean(false); |
---|
182 | |
---|
183 | getRootMessage().visitBodies(new BodyVisitor() { |
---|
184 | @Override |
---|
185 | public void visitBinaryBody(BinaryBody binaryBody) { |
---|
186 | Entity entity = binaryBody.getEntity(); |
---|
187 | UnstructuredField contentIdField = entity.getContentIdField(); |
---|
188 | if(contentIdField != null) { |
---|
189 | if(contentIdField.getText().equalsIgnoreCase(cid)) { |
---|
190 | sb.append("\" name=\"embedded_img_").append(binaryBody.getOid()).append("/").append(binaryBody.getFileName()).append("\" "); |
---|
191 | binaryBodyFound.setValue(true); |
---|
192 | quit(); |
---|
193 | } |
---|
194 | } |
---|
195 | } |
---|
196 | }); |
---|
197 | |
---|
198 | if(binaryBodyFound.isFalse()) { |
---|
199 | sb.append("cid:").append(cid).append("\" "); |
---|
200 | } |
---|
201 | |
---|
202 | lastEnd = m.end(); |
---|
203 | } |
---|
204 | sb.append(text.substring(lastEnd)); |
---|
205 | return sb.toString(); |
---|
206 | } |
---|
207 | |
---|
208 | //-------------------------------------------------------------------------- |
---|
209 | @NotPersistent |
---|
210 | private static final String mailToRegex = "href\\s*=\\s*(" + |
---|
211 | "'\\s*mailto:\\s*([^'\\s<>]*)\\s*'" + |
---|
212 | "|" + |
---|
213 | "\"\\s*mailto:\\s*([^\"\\s<>]*)\\s*\"" + |
---|
214 | "|" + |
---|
215 | "mailto:\\s*([^\\s<>]+)" + |
---|
216 | ")"; |
---|
217 | @NotPersistent |
---|
218 | private static final Pattern mailToPattern = Pattern.compile(mailToRegex, Pattern.CASE_INSENSITIVE); |
---|
219 | |
---|
220 | private String replaceMailToRef(String text) { |
---|
221 | Matcher m = mailToPattern.matcher(text); |
---|
222 | StringBuilder sb = new StringBuilder(); |
---|
223 | int lastEnd = 0; |
---|
224 | while(m.find()) { |
---|
225 | sb.append(text.substring(lastEnd, m.start())) |
---|
226 | .append("name=\"_mailto\" href=\"javascript:new_message_to('"); |
---|
227 | |
---|
228 | if(m.group(2) != null) { |
---|
229 | sb.append(m.group(2)); |
---|
230 | } |
---|
231 | else if(m.group(3) != null) { |
---|
232 | sb.append(m.group(3)); |
---|
233 | } |
---|
234 | else if(m.group(4) != null) { |
---|
235 | sb.append(m.group(4)); |
---|
236 | } |
---|
237 | sb.append("');\""); |
---|
238 | lastEnd = m.end(); |
---|
239 | } |
---|
240 | sb.append(text.substring(lastEnd)); |
---|
241 | return sb.toString(); |
---|
242 | } |
---|
243 | |
---|
244 | //-------------------------------------------------------------------------- |
---|
245 | @NotPersistent |
---|
246 | private static final String urlRegex = "\\b(" + |
---|
247 | "(?:(https?://)|www\\d{0,3}[.]|[a-z0-9.\\-]+[.](?=(?:com|org|net|gov|mil|info)(?:[.]br)?)|[a-z0-9.\\-]+[.][a-z]{2,4}/)" + |
---|
248 | "(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+" + |
---|
249 | "(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:'\".,<>\\?«»ââââ]))"; |
---|
250 | @NotPersistent |
---|
251 | private static final Pattern urlPattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE); |
---|
252 | |
---|
253 | private String replaceUrlRef(String text) { |
---|
254 | |
---|
255 | Source source = new Source(text); |
---|
256 | source.fullSequentialParse(); |
---|
257 | |
---|
258 | Matcher m = urlPattern.matcher(text); |
---|
259 | StringBuilder sb = new StringBuilder(); |
---|
260 | int lastEnd = 0; |
---|
261 | find: |
---|
262 | while(m.find()) { |
---|
263 | |
---|
264 | Element element = source.getEnclosingElement(m.start()); |
---|
265 | while(element != null) { |
---|
266 | String elementName = element.getName(); |
---|
267 | |
---|
268 | if(elementName == HTMLElementName.A |
---|
269 | || elementName == HTMLElementName.LINK |
---|
270 | || elementName == HTMLElementName.AREA |
---|
271 | || elementName == HTMLElementName.BASE |
---|
272 | || elementName == HTMLElementName.META |
---|
273 | || elementName == HTMLElementName.SCRIPT |
---|
274 | || elementName == HTMLElementName.IMG |
---|
275 | || elementName == HTMLElementName.VIDEO |
---|
276 | || elementName == HTMLElementName.AUDIO |
---|
277 | || elementName == HTMLElementName.SOURCE |
---|
278 | || elementName == HTMLElementName.BLOCKQUOTE |
---|
279 | || elementName == HTMLElementName.DEL |
---|
280 | || elementName == HTMLElementName.INS |
---|
281 | || elementName == HTMLElementName.Q |
---|
282 | || elementName == HTMLElementName.BUTTON |
---|
283 | || elementName == HTMLElementName.INPUT |
---|
284 | || elementName == HTMLElementName.OBJECT |
---|
285 | || elementName == HTMLElementName.EMBED |
---|
286 | || elementName == HTMLElementName.COMMAND) |
---|
287 | { |
---|
288 | continue find; |
---|
289 | } |
---|
290 | |
---|
291 | Tag startTag = element.getStartTag(); |
---|
292 | |
---|
293 | if((startTag.getBegin() < m.start()) && (m.start() < startTag.getEnd())) { |
---|
294 | continue find; |
---|
295 | } |
---|
296 | |
---|
297 | element = element.getParentElement(); |
---|
298 | } |
---|
299 | |
---|
300 | sb.append(text.substring(lastEnd, m.start())) |
---|
301 | .append("<a href=\""); |
---|
302 | |
---|
303 | if(m.group(2) == null) { |
---|
304 | sb.append("http://"); |
---|
305 | } |
---|
306 | |
---|
307 | sb.append(m.group()).append("\">").append(m.group(1)).append("</a>"); |
---|
308 | lastEnd = m.end(); |
---|
309 | } |
---|
310 | sb.append(text.substring(lastEnd)); |
---|
311 | return sb.toString(); |
---|
312 | } |
---|
313 | } |
---|