source: contrib/MailArchiver/sources/src/serpro/mailarchiver/domain/metaarchive/TextBody.java @ 6785

Revision 6785, 11.1 KB checked in by rafaelraymundo, 12 years ago (diff)

Ticket #2946 - Liberado codigo do MailArchiver?. Documentação na subpasta DOCS.

Line 
1/**
2 * MailArchiver is an application that provides services for storing and managing e-mail messages through a Web Services SOAP interface.
3 * Copyright (C) 2012  Marcio Andre Scholl Levien and Fernando Alberto Reuter Wendt and Jose Ronaldo Nogueira Fonseca Junior
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Affero General Public License as
7 * published by the Free Software Foundation, either version 3 of the
8 * License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 * GNU Affero General Public License for more details.
14 *
15 * You should have received a copy of the GNU Affero General Public License
16 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18
19/******************************************************************************\
20*
21*  This product was developed by
22*
23*        SERVIÇO FEDERAL DE PROCESSAMENTO DE DADOS (SERPRO),
24*
25*  a government company established under Brazilian law (5.615/70),
26*  at Department of Development of Porto Alegre.
27*
28\******************************************************************************/
29
30package serpro.mailarchiver.domain.metaarchive;
31
32import java.io.BufferedReader;
33import java.io.IOException;
34import java.io.InputStream;
35import java.io.InputStreamReader;
36import java.nio.charset.Charset;
37import java.util.regex.Matcher;
38import java.util.regex.Pattern;
39
40import javax.jdo.JDOHelper;
41import javax.jdo.annotations.NotPersistent;
42import javax.jdo.annotations.PersistenceCapable;
43
44import net.htmlparser.jericho.Element;
45import net.htmlparser.jericho.HTMLElementName;
46import net.htmlparser.jericho.Source;
47import net.htmlparser.jericho.Tag;
48
49import org.apache.commons.lang3.StringEscapeUtils;
50import org.apache.commons.lang3.mutable.MutableBoolean;
51
52import org.apache.james.mime4j.util.CharsetUtil;
53
54import serpro.mailarchiver.util.BodyVisitor;
55import serpro.mailarchiver.util.Charsets;
56import serpro.mailarchiver.util.Logger;
57
58@PersistenceCapable
59public class TextBody
60    extends SingleBody
61{
62    @NotPersistent
63    private static final Logger log = Logger.getLocalLogger();
64
65    @NotPersistent private static final String CR = "\015";
66    @NotPersistent private static final String LF = "\012";
67    @NotPersistent private static final String CRLF = "\015\012";
68
69    //**** P E R S I S T E N T ****
70    private String preview;
71    //*****************************
72
73    public final String getPreview() {
74        return preview;
75    }
76
77    public final void setPreview(String preview) {
78        this.preview = preview;
79    }
80
81    //--------------------------------------------------------------------------
82    @Override
83    final String toString(String pad) {
84        return String.format(
85                "TextBody%n"
86              + "%1$sjdoState: %2$s%n"
87              + "%1$soid: %3$s%n"
88              + "%1$shash: %4$x%n"
89              + "%1$soffset: %5$d%n"
90              + "%1$slength: %6$d%n"
91              + "%1$ssize: %7$d%n"
92              + "%1$spreview: %8$s"
93              , pad
94              , JDOHelper.getObjectState(this)
95              , getOid()
96              , hashCode()
97              , getOffset()
98              , getLength()
99              , getSize()
100              , getPreview());
101    }
102
103    //--------------------------------------------------------------------------
104    public String getText() throws IOException {
105        Entity entity = getEntity();
106        if(entity != null) {
107            ContentTypeField contentTypeField = entity.getContentTypeField();
108
109            Charset cs = null;
110            if(contentTypeField != null) {
111                cs = CharsetUtil.lookup(contentTypeField.getCharset());
112            }
113            if(cs == null) {
114                cs = Charsets.Windows_1252;
115            }
116
117            InputStream is = getDecoderInputStream();
118            InputStreamReader isr = new InputStreamReader(is, cs);
119            BufferedReader reader = new BufferedReader(isr);
120
121            StringBuilder sb = new StringBuilder();
122
123            char[] cbuf = new char[0x1000];
124            int len;
125            while((len = reader.read(cbuf)) > 0) {
126                sb.append(cbuf, 0, len);
127            }
128
129            reader.close();
130            isr.close();
131            is.close();
132
133            return sb.toString();
134        }
135        throw new IllegalStateException();
136    }
137
138    public String getAdaptedText() throws IOException {
139        String text = getText();
140        Entity entity = getEntity();
141        if(entity != null) {
142            ContentTypeField contentTypeField = entity.getContentTypeField();
143            if((contentTypeField == null) || (contentTypeField.isTextPlainMimeType())) {
144                text = text.replaceAll(CRLF + "|" + CR + "|" + LF, "<br>" + LF);
145            }
146        }
147
148        text = StringEscapeUtils.unescapeHtml4(text);
149        text = replaceMailToRef(text);
150        text = replaceContentIdRef(text);
151        text = replaceUrlRef(text);
152        return text;
153    }
154
155    //--------------------------------------------------------------------------
156    @NotPersistent
157    private static final String contentIdRegex = "src\\s*=(\\.*(?!cid))(" +
158            "'\\s*cid:\\s*([^'\\s<>]*)\\s*'" +
159            "|" +
160            "\"\\s*cid:\\s*([^\"\\s<>]*)\\s*\"" +
161            "|" +
162            "cid:\\s*([^\\s<>]+)" +
163            ")";
164    @NotPersistent
165    private static final Pattern contentIdPattern = Pattern.compile(contentIdRegex, Pattern.CASE_INSENSITIVE);
166
167    private String replaceContentIdRef(String text) {
168        Matcher m = contentIdPattern.matcher(text);
169        final StringBuilder sb = new StringBuilder();
170        int lastEnd = 0;
171        while(m.find()) {
172            sb.append(text.substring(lastEnd, m.start()))
173              .append("src=\"");
174
175            final String cid =
176                m.group(4) != null ? m.group(4) :
177                m.group(5) != null ? m.group(5) :
178                m.group(6) != null ? m.group(6) :
179                null;
180
181            final MutableBoolean binaryBodyFound = new MutableBoolean(false);
182
183            getRootMessage().visitBodies(new BodyVisitor() {
184                @Override
185                public void visitBinaryBody(BinaryBody binaryBody) {
186                    Entity entity = binaryBody.getEntity();
187                    UnstructuredField contentIdField = entity.getContentIdField();
188                    if(contentIdField != null) {
189                        if(contentIdField.getText().equalsIgnoreCase(cid)) {
190                            sb.append("\" name=\"embedded_img_").append(binaryBody.getOid()).append("/").append(binaryBody.getFileName()).append("\" ");
191                            binaryBodyFound.setValue(true);
192                            quit();
193                        }
194                    }
195                }
196            });
197
198            if(binaryBodyFound.isFalse()) {
199                sb.append("cid:").append(cid).append("\" ");
200            }
201
202            lastEnd = m.end();
203        }
204        sb.append(text.substring(lastEnd));
205        return sb.toString();
206    }
207
208    //--------------------------------------------------------------------------
209    @NotPersistent
210    private static final String mailToRegex = "href\\s*=\\s*(" +
211            "'\\s*mailto:\\s*([^'\\s<>]*)\\s*'" +
212            "|" +
213            "\"\\s*mailto:\\s*([^\"\\s<>]*)\\s*\"" +
214            "|" +
215            "mailto:\\s*([^\\s<>]+)" +
216            ")";
217    @NotPersistent
218    private static final Pattern mailToPattern = Pattern.compile(mailToRegex, Pattern.CASE_INSENSITIVE);
219
220    private String replaceMailToRef(String text) {
221        Matcher m = mailToPattern.matcher(text);
222        StringBuilder sb = new StringBuilder();
223        int lastEnd = 0;
224        while(m.find()) {
225            sb.append(text.substring(lastEnd, m.start()))
226              .append("name=\"_mailto\" href=\"javascript:new_message_to('");
227
228            if(m.group(2) != null) {
229                sb.append(m.group(2));
230            }
231            else if(m.group(3) != null) {
232                sb.append(m.group(3));
233            }
234            else if(m.group(4) != null) {
235                sb.append(m.group(4));
236            }
237            sb.append("');\"");
238            lastEnd = m.end();
239        }
240        sb.append(text.substring(lastEnd));
241        return sb.toString();
242    }
243
244    //--------------------------------------------------------------------------
245    @NotPersistent
246    private static final String urlRegex = "\\b(" +
247            "(?:(https?://)|www\\d{0,3}[.]|[a-z0-9.\\-]+[.](?=(?:com|org|net|gov|mil|info)(?:[.]br)?)|[a-z0-9.\\-]+[.][a-z]{2,4}/)" +
248            "(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+" +
249            "(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:'\".,<>\\?«»“”‘’]))";
250    @NotPersistent
251    private static final Pattern urlPattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE);
252
253    private String replaceUrlRef(String text) {
254
255        Source source = new Source(text);
256        source.fullSequentialParse();
257
258        Matcher m = urlPattern.matcher(text);
259        StringBuilder sb = new StringBuilder();
260        int lastEnd = 0;
261        find:
262        while(m.find()) {
263
264            Element element = source.getEnclosingElement(m.start());
265            while(element != null) {
266                String elementName = element.getName();
267
268                if(elementName == HTMLElementName.A
269                || elementName == HTMLElementName.LINK
270                || elementName == HTMLElementName.AREA
271                || elementName == HTMLElementName.BASE
272                || elementName == HTMLElementName.META
273                || elementName == HTMLElementName.SCRIPT
274                || elementName == HTMLElementName.IMG
275                || elementName == HTMLElementName.VIDEO
276                || elementName == HTMLElementName.AUDIO
277                || elementName == HTMLElementName.SOURCE
278                || elementName == HTMLElementName.BLOCKQUOTE
279                || elementName == HTMLElementName.DEL
280                || elementName == HTMLElementName.INS
281                || elementName == HTMLElementName.Q
282                || elementName == HTMLElementName.BUTTON
283                || elementName == HTMLElementName.INPUT
284                || elementName == HTMLElementName.OBJECT
285                || elementName == HTMLElementName.EMBED
286                || elementName == HTMLElementName.COMMAND)
287                {
288                    continue find;
289                }
290
291                Tag startTag = element.getStartTag();
292
293                if((startTag.getBegin() < m.start()) && (m.start() < startTag.getEnd())) {
294                    continue find;
295                }
296
297                element = element.getParentElement();
298            }
299
300            sb.append(text.substring(lastEnd, m.start()))
301              .append("<a href=\"");
302
303            if(m.group(2) == null) {
304                sb.append("http://");
305            }
306
307            sb.append(m.group()).append("\">").append(m.group(1)).append("</a>");
308            lastEnd = m.end();
309        }
310        sb.append(text.substring(lastEnd));
311        return sb.toString();
312    }
313}
Note: See TracBrowser for help on using the repository browser.