source: sandbox/2.3-MailArchiver/expressoMail1_2/inc/htmlfilter.inc @ 6779

Revision 6779, 35.9 KB checked in by rafaelraymundo, 12 years ago (diff)

Ticket #2946 - Liberado Expresso(branch 2.3) integrado ao MailArchiver?.

Line 
1<?php
2/**
3 * htmlfilter.inc
4 * ---------------
5 * This set of functions allows you to filter html in order to remove
6 * any malicious tags from it. Useful in cases when you need to filter
7 * user input for any cross-site-scripting attempts.
8 *
9 * Copyright (C) 2002-2004 by Duke University
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 
24 * 02110-1301  USA
25 *
26 * @Author  Konstantin Riabitsev <icon@linux.duke.edu>
27 */
28
/**
 * Debugging helper used throughout this file.
 *
 * The message is echoed only when the global variable $debug compares
 * loosely equal to true; otherwise nothing is output. To enable tracing,
 * define a global $debug and set it to true before calling sanitize().
 *
 * Note: each call still costs a function call even when $debug is off.
 * If you wish to get rid of all debugging calls, run the following
 * command (every debugging call in this file sits on a line of its own,
 * which is what makes this safe):
 *
 * fgrep -v 'spew("' htmlfilter.inc > htmlfilter.inc.new
 *
 * htmlfilter.inc.new will contain no debugging calls.
 *
 * @param  $message  A string with the message to output.
 * @return           void.
 */
function spew($message){
    global $debug;
    if ($debug == true){
        echo "$message";
    }
}
51
/**
 * Builds the textual form of a tag from its name, its attributes and
 * its type. Called internally while sanitizing.
 *
 * Tag types:
 *   2          closing tag, rendered as </name> (attributes are ignored)
 *   3          XHTML-style empty tag, rendered as <name attrs />
 *   otherwise  opening tag, rendered as <name attrs>
 *
 * Attribute values are emitted verbatim as name=value, so they are
 * expected to already carry their own quotes.
 *
 * @param  $tagname  the name of the tag.
 * @param  $attary   the array of attributes (name => value); anything
 *                   that is not a non-empty array yields no attributes.
 * @param  $tagtype  The type of the tag (see above).
 * @return           a string with the final tag representation.
 */
function tagprint($tagname, $attary, $tagtype){
    $me = 'tagprint';
    if ($tagtype == 2){
        $fulltag = '</' . $tagname . '>';
    } else {
        $fulltag = '<' . $tagname;
        if (is_array($attary) && sizeof($attary)){
            $atts = array();
            /**
             * foreach instead of each(): each() no longer exists in
             * PHP 8, and foreach does not depend on the array's
             * internal pointer.
             */
            foreach ($attary as $attname => $attvalue){
                $atts[] = "$attname=$attvalue";
            }
            $fulltag .= ' ' . join(' ', $atts);
        }
        if ($tagtype == 3){
            $fulltag .= ' /';
        }
        $fulltag .= '>';
    }
    spew("$me: $fulltag\n");
    return $fulltag;
}
83
/**
 * A small helper function to use with array_walk. Replaces the by-ref
 * value with its strtolower() result.
 *
 * Note that strtolower() returns a string, so non-string members are
 * converted to strings as well.
 *
 * @param  $val a value passed by-ref.
 * @return      void since it modifies a by-ref value.
 */
function casenormalize(&$val){
    $val = strtolower($val);
}
94
/**
 * Skips any whitespace from the given position within a string and
 * returns the position of the next non-whitespace character.
 *
 * Whitespace is whatever PCRE's \s matches. The match is anchored at
 * $offset with \G, so the body is scanned in place rather than being
 * copied with substr() on every call.
 *
 * @param  $body   the string
 * @param  $offset the offset within the string where we should start
 *                 looking for the next non-whitespace character.
 * @return         the location within $body of the next non-whitespace
 *                 char; $offset itself when there is no whitespace at
 *                 $offset or $offset is at/after the end of $body.
 */
function skipspace($body, $offset){
    $me = 'skipspace';
    if ($offset >= strlen($body)){
        /**
         * Nothing left to skip; also keeps preg_match() from being
         * handed an offset beyond the end of the subject.
         */
        return $offset;
    }
    $matches = array();
    if (preg_match('/\G\s+/', $body, $matches, 0, $offset)){
        $count = strlen($matches[0]);
        spew("$me: skipped $count chars\n");
        $offset += $count;
    }
    return $offset;
}
115
/**
 * Looks for the next occurrence of a string within another string. It is
 * really just a glorified "strpos", except that a failed search yields
 * the length of the body instead of false.
 *
 * @param  $body   The string to look for needle in.
 * @param  $offset Start looking from this position.
 * @param  $needle The character/string to look for.
 * @return         location of the next occurrence of the needle, or
 *                 strlen($body) if needle wasn't found (including when
 *                 $offset lies beyond the end of $body).
 */
function findnxstr($body, $offset, $needle){
    $me = 'findnxstr';
    $bodylen = strlen($body);
    /**
     * strpos() rejects an offset past the end of the haystack (a
     * ValueError on PHP 8), so treat that case as "not found" up front.
     */
    $pos = ($offset > $bodylen) ? false : strpos($body, $needle, $offset);
    if ($pos === false){
        spew("$me: end of body reached\n");
        return $bodylen;
    }
    spew("$me: '$needle' found at pos $pos\n");
    return $pos;
}
137
/**
 * Takes a PCRE-style regexp fragment and looks for its first match at
 * or after $offset within the string.
 *
 * The fragment is embedded as '%^(.*?)(' . $reg . ')%s' and applied to
 * the part of $body starting at $offset. Because '%' is the delimiter,
 * $reg must not contain an unescaped '%'.
 *
 * @param  $body   The string to look for needle in.
 * @param  $offset Start looking from here.
 * @param  $reg    A PCRE-style regex to match (without delimiters).
 * @return         Returns a false if no matches found, or an array
 *                 with the following members:
 *                 - integer with the location of the match within $body
 *                 - string with whatever content between offset and the match
 *                 - string with whatever it is we matched
 */
function findnxreg($body, $offset, $reg){
    $me = 'findnxreg';
    $matches = array();
    $preg_rule = '%^(.*?)(' . $reg . ')%s';
    preg_match($preg_rule, substr($body, $offset), $matches);
    if (!isset($matches[0])){
        spew("$me: No matches found.\n");
        return false;
    }
    /**
     * $matches[1] is the text skipped before the match, so its length is
     * the distance from $offset to the match.
     */
    $retarr = array();
    $retarr[0] = $offset + strlen($matches[1]);
    $retarr[1] = $matches[1];
    $retarr[2] = $matches[2];
    spew("$me: '$reg' found at pos " . $retarr[0] . " matching '" . $matches[2] . "'\n");
    return $retarr;
}
168
/**
 * Looks for the next tag in the body and parses it.
 *
 * Attribute names and the tag name are lowercased. Attribute values are
 * stored together with their quotes: single- and double-quoted values
 * keep their original quotes, unquoted values get '"' translated to
 * &quot; and are wrapped in double quotes, and value-less attributes are
 * given the value "yes" (with the double quotes).
 *
 * @param  $body   String where to look for the next tag.
 * @param  $offset Start looking from here.
 * @return         false if no more tags exist in the body, or
 *                 an array with the following members:
 *                 - string with the name of the tag
 *                 - array with attributes and their values (false when
 *                   the tag name is directly followed by ">" or "/>")
 *                 - integer with tag type (1, 2, or 3)
 *                 - integer where the tag starts (starting "<")
 *                 - integer where the tag ends (ending ">")
 *                 first three members will be false, if the tag is invalid
 *                 (this is also how comments and SGML declarations are
 *                 reported).
 */
function getnxtag($body, $offset){
    $me = 'getnxtag';
    $bodylen = strlen($body);
    if ($offset > $bodylen){
        spew("$me: Past the end of body\n");
        return false;
    }
    $lt = findnxstr($body, $offset, '<');
    if ($lt == $bodylen){
        spew("$me: No more tags found!\n");
        return false;
    }
    /**
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    spew("$me: Found '<' at pos $lt\n");
    $pos = skipspace($body, $lt + 1);
    if ($pos >= $bodylen){
        spew("$me: End of body reached.\n");
        return array(false, false, false, $lt, $bodylen);
    }
    /**
     * There are 3 kinds of tags:
     * 1. Opening tag, e.g.:
     *    <a href="blah">
     * 2. Closing tag, e.g.:
     *    </a>
     * 3. XHTML-style content-less tag, e.g.:
     *    <img src="blah"/>
     */
    $tagtype = false;
    switch (substr($body, $pos, 1)){
    case '/':
        spew("$me: This is a closing tag (type 2)\n");
        $tagtype = 2;
        $pos++;
        break;
    case '!':
        /**
         * A comment or an SGML declaration. Either way it is reported
         * as an invalid tag spanning up to its terminator.
         */
        if (substr($body, $pos+1, 2) == '--'){
            spew("$me: A comment found. Stripping.\n");
            $gt = strpos($body, '-->', $pos);
            if ($gt === false){
                $gt = $bodylen;
            } else {
                /**
                 * Point at the ">" of the closing "-->".
                 */
                $gt += 2;
            }
            return array(false, false, false, $lt, $gt);
        }
        spew("$me: An SGML declaration found. Stripping.\n");
        $gt = findnxstr($body, $pos, '>');
        return array(false, false, false, $lt, $gt);
    default:
        /**
         * Assume tagtype 1 for now. If it's type 3, we'll switch values
         * later.
         */
        $tagtype = 1;
        break;
    }

    /**
     * Look for the next character that is not [\w\-_], which will
     * indicate the end of the tag name.
     */
    $regary = findnxreg($body, $pos, '[^\w\-_]');
    if ($regary == false){
        spew("$me: End of body reached while analyzing tag name\n");
        return array(false, false, false, $lt, $bodylen);
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = strtolower($tagname);

    /**
     * $match can be either of these:
     * '>'  indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/'  indicating that this is type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    switch ($match){
    case '/':
        /**
         * This is an xhtml-style tag with a closing / at the
         * end, like so: <img src="blah"/>. Check if it's followed
         * by the closing bracket. If not, then this tag is invalid
         */
        if (substr($body, $pos, 2) == '/>'){
            spew("$me: XHTML-style tag found.\n");
            $pos++;
            spew("$me: Setting tagtype to 3\n");
            $tagtype = 3;
        } else {
            spew("$me: Found invalid character '/'.\n");
            $gt = findnxstr($body, $pos, '>');
            spew("$me: Tag is invalid. Returning.\n");
            return array(false, false, false, $lt, $gt);
        }
        /**
         * Deliberate fall-through: $pos now sits on the ">".
         */
    case '>':
        spew("$me: End of tag found at $pos\n");
        spew("$me: Tagname is '$tagname'\n");
        spew("$me: This tag has no attributes\n");
        return array($tagname, false, $tagtype, $lt, $pos);
    default:
        /**
         * Check if it's whitespace
         */
        if (preg_match('/\s/', $match)){
            spew("$me: Tagname is '$tagname'\n");
        } else {
            /**
             * This is an invalid tag! Look for the next closing ">".
             */
            spew("$me: Invalid characters found in tag name: $match\n");
            $gt = findnxstr($body, $lt, '>');
            return array(false, false, false, $lt, $gt);
        }
    }

    /**
     * At this point we're here:
     * <tagname  attribute='blah'>
     * \-------^
     *
     * At this point we loop in order to find all attributes.
     */
    $attname = '';
    $attary = array();

    while ($pos <= $bodylen){
        $pos = skipspace($body, $pos);
        if ($pos == $bodylen){
            /**
             * Non-closed tag.
             */
            spew("$me: End of body reached before end of tag. Discarding.\n");
            return array(false, false, false, $lt, $pos);
        }
        /**
         * See if we arrived at a ">" or "/>", which means that we reached
         * the end of the tag.
         */
        $matches = array();
        preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches);
        if (isset($matches[0]) && $matches[0]){
            /**
             * Yep. So we did.
             */
            spew("$me: Arrived at the end of the tag.\n");
            $pos += strlen($matches[1]);
            if ($matches[2] == '/>'){
                $tagtype = 3;
                $pos++;
            }
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /**
         * There are several types of attributes, with optional
         * [:space:] between members.
         * Type 1:
         *   attrname[:space:]=[:space:]'CDATA'
         * Type 2:
         *   attrname[:space:]=[:space:]"CDATA"
         * Type 3:
         *   attr[:space:]=[:space:]CDATA
         * Type 4:
         *   attrname
         *
         * We leave types 1 and 2 the same, type 3 we check for
         * '"' and convert to "&quot" if needed, then wrap in
         * double quotes. Type 4 we convert into:
         * attrname="yes".
         */
        $regary = findnxreg($body, $pos, '[^\w\-_]');
        if ($regary == false){
            /**
             * Looks like body ended before the end of tag.
             */
            spew("$me: End of body found before end of tag.\n");
            spew("$me: Invalid, returning\n");
            return array(false, false, false, $lt, $bodylen);
        }
        list($pos, $attname, $match) = $regary;
        $attname = strtolower($attname);
        spew("$me: Attribute '$attname' found\n");
        /**
         * We arrived at the end of attribute name. Several things possible
         * here:
         * '>'  means the end of the tag and this is attribute type 4
         * '/'  if followed by '>' means the same thing as above
         * '\s' means a lot of things -- look what it's followed by.
         *      anything else means the attribute is invalid.
         */
        switch ($match){
        case '/':
            /**
             * This is an xhtml-style tag with a closing / at the
             * end, like so: <img src="blah"/>. Check if it's followed
             * by the closing bracket. If not, then this tag is invalid
             */
            if (substr($body, $pos, 2) == '/>'){
                spew("$me: This is an xhtml-style tag.\n");
                $pos++;
                spew("$me: Setting tagtype to 3\n");
                $tagtype = 3;
            } else {
                spew("$me: Found invalid character '/'.\n");
                $gt = findnxstr($body, $pos, '>');
                spew("$me: Tag is invalid. Returning.\n");
                return array(false, false, false, $lt, $gt);
            }
            /**
             * Deliberate fall-through: $pos now sits on the ">".
             */
        case '>':
            spew("$me: found type 4 attribute.\n");
            spew("$me: Additionally, end of tag found at $pos\n");
            spew("$me: Attname is '$attname'\n");
            spew("$me: Setting attvalue to 'yes'\n");
            $attary[$attname] = '"yes"';
            return array($tagname, $attary, $tagtype, $lt, $pos);
        default:
            /**
             * Skip whitespace and see what we arrive at.
             */
            $pos = skipspace($body, $pos);
            $char = substr($body, $pos, 1);
            /**
             * Two things are valid here:
             * '=' means this is attribute type 1 2 or 3.
             * \w means this was attribute type 4.
             * anything else we ignore and re-loop. End of tag and
             * invalid stuff will be caught by our checks at the beginning
             * of the loop.
             */
            if ($char == '='){
                spew("$me: Attribute type 1, 2, or 3 found.\n");
                $pos++;
                $pos = skipspace($body, $pos);
                /**
                 * Here are 3 possibilities:
                 * "'"  attribute type 1
                 * '"'  attribute type 2
                 * everything else is the content of tag type 3
                 */
                $quot = substr($body, $pos, 1);
                if ($quot == '\''){
                    spew("$me: In fact, this is attribute type 1\n");
                    spew("$me: looking for closing quote\n");
                    $regary = findnxreg($body, $pos+1, '\'');
                    if ($regary == false){
                        spew("$me: end of body reached before end of val\n");
                        spew("$me: Returning\n");
                        return array(false, false, false, $lt, $bodylen);
                    }
                    list($pos, $attval, $match) = $regary;
                    spew("$me: Attvalue is '$attval'\n");
                    /**
                     * Step over the closing quote.
                     */
                    $pos++;
                    $attary[$attname] = '\'' . $attval . '\'';
                } else if ($quot == '"'){
                    spew("$me: In fact, this is attribute type 2\n");
                    spew("$me: looking for closing quote\n");
                    $regary = findnxreg($body, $pos+1, '\"');
                    if ($regary == false){
                        spew("$me: end of body reached before end of val\n");
                        spew("$me: Returning\n");
                        return array(false, false, false, $lt, $bodylen);
                    }
                    list($pos, $attval, $match) = $regary;
                    spew("$me: Attvalue is \"$attval\"\n");
                    /**
                     * Step over the closing quote.
                     */
                    $pos++;
                    $attary[$attname] = '"' . $attval . '"';
                } else {
                    spew("$me: This looks like attribute type 3\n");
                    /**
                     * These are hateful. Look for \s, or >.
                     */
                    spew("$me: Looking for end of attval\n");
                    $regary = findnxreg($body, $pos, '[\s>]');
                    if ($regary == false){
                        spew("$me: end of body reached before end of val\n");
                        spew("$me: Returning\n");
                        return array(false, false, false, $lt, $bodylen);
                    }
                    list($pos, $attval, $match) = $regary;
                    /**
                     * If it's ">" it will be caught at the top.
                     */
                    spew("$me: translating '\"' into &quot;\n");
                    $attval = preg_replace('/\"/s', '&quot;', $attval);
                    spew("$me: wrapping in quotes\n");
                    $attary[$attname] = '"' . $attval . '"';
                }
            } else if (preg_match('|[\w/>]|', $char)) {
                /**
                 * That was attribute type 4.
                 */
                spew("$me: attribute type 4 found.\n");
                spew("$me: Setting value to 'yes'\n");
                $attary[$attname] = '"yes"';
            } else {
                /**
                 * An illegal character. Find next '>' and return.
                 */
                spew("$me: illegal character '$char' found.\n");
                spew("$me: returning\n");
                $gt = findnxstr($body, $pos, '>');
                return array(false, false, false, $lt, $gt);
            }
        }
    }
    /**
     * The fact that we got here indicates that the tag end was never
     * found. Return invalid tag indication so it gets stripped.
     */
    spew("$me: No tag end found\n");
    return array(false, false, false, $lt, $bodylen);
}
511
/**
 * Translates numeric entities/escapes into literal bytes so the value
 * can be checked.
 *
 * Every substring of $attvalue matched by $regex is replaced by the
 * single byte whose code is given by the regex's first capture group
 * (read as decimal, or as hexadecimal when $hex is true). As with
 * chr(), only the low 8 bits of the number are used.
 *
 * @param $attvalue the by-ref value to translate.
 * @param $regex    the regular expression to match against; its first
 *                  capture group must hold the digits of the number.
 * @param $hex      whether the captured number is hexadecimal.
 * @return          True or False depending on whether there were matches.
 */
function deent(&$attvalue, $regex, $hex=false){
    $me = 'deent';
    spew("$me: matching '$regex' against: $attvalue\n");
    preg_match_all($regex, $attvalue, $matches);
    if (!is_array($matches) || sizeof($matches[0]) == 0){
        spew("$me: no matches! Returning false.\n");
        return false;
    }
    $total = sizeof($matches[0]);
    spew("$me: found $total matches\n");
    $repl = array();
    for ($i = 0; $i < $total; $i++){
        $numval = $matches[1][$i];
        spew("$me: numval is $numval\n");
        if ($hex){
            $numval = hexdec($numval);
            spew("$me: hex! Numval is now $numval\n");
        }
        /**
         * chr() only uses the value modulo 256. Reduce it here, in
         * floating point, so that an absurdly long run of digits in
         * hostile input cannot fail the integer conversion.
         */
        $repl[$matches[0][$i]] = chr((int) fmod((float) $numval, 256));
    }
    $attvalue = strtr($attvalue, $repl);
    spew("$me: attvalue after translation: $attvalue\n");
    return true;
}
545
/**
 * Decodes entity-encoded and backslash-escaped characters in an
 * attribute value into plain 8-bit characters so checks can be run on
 * the result.
 *
 * Values containing neither '&' nor '\' are left untouched. Otherwise
 * decimal entities (&#NN;), hexadecimal entities (&#xNN;) and
 * backslash-digit escapes are translated repeatedly until none remain,
 * and stripslashes() is applied to the result.
 *
 * @param  $attvalue A string to run entity check against.
 * @return           Nothing, modifies a reference value.
 */
function defang(&$attvalue){
    $me = 'defang';
    /**
     * Skip this if there aren't ampersands or backslashes.
     */
    spew("$me: Checking '$attvalue' for suspicious content\n");
    if (strpos($attvalue, '&') === false
        && strpos($attvalue, '\\') === false){
        spew("$me: no suspicious content found, returning.\n");
        return;
    }
    do {
        /**
         * "||" short-circuits: as soon as one translation finds
         * something, the remaining ones wait for the next pass. The loop
         * ends on the first pass in which none of them matches.
         */
        $m = deent($attvalue, '/\&#0*(\d+);*/s')
            || deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
            || deent($attvalue, '/\\\\(\d+)/s', true);
        spew("$me: m=$m\n");
    } while ($m == true);
    $attvalue = stripslashes($attvalue);
    spew("$me: translated into: $attvalue\n");
}
576
/**
 * Kill any tabs, newlines, carriage returns, NUL bytes and spaces. Our
 * friends the makers of the browser with 95% market value decided that
 * it'd be funny to make "java[tab]script" be just as good as
 * "javascript".
 *
 * @param  $attvalue  The attribute value before extraneous spaces removed.
 * @return            Nothing, modifies a reference value.
 */
function unspace(&$attvalue){
    $me = 'unspace';
    /**
     * strcspn() stops at the first of these characters, so a result
     * shorter than the whole string means there is something to remove.
     */
    if (strcspn($attvalue, "\t\r\n\0 ") != strlen($attvalue)){
        spew("$me: Killing whitespace.\n");
        $attvalue = str_replace(array("\t", "\r", "\n", "\0", " "), '', $attvalue);
    }
    spew("$me: after unspace: $attvalue\n");
}
594
/**
 * Runs the attribute checks for a single tag.
 *
 * For every attribute:
 *  1. it is dropped if its name matches one of the $rm_attnames patterns
 *     registered for a tag pattern matching $tagname; a dropped attribute
 *     is not examined any further;
 *  2. otherwise a copy of its value is decoded (defang) and stripped of
 *     whitespace (unspace), and every applicable $bad_attvals rule is
 *     applied to that copy; when a rule changes the copy, the changed
 *     text becomes the attribute's value. A value no rule changes is
 *     kept exactly as it was given.
 *
 * Finally the attributes of every $add_attr_to_tag entry whose pattern
 * matches $tagname are merged in, overriding same-named attributes.
 *
 * @param  $tagname         String with the name of the tag.
 * @param  $attary          Array with all tag attributes (name => value);
 *                          a non-array is treated as "no attributes".
 * @param  $rm_attnames     See description for sanitize
 * @param  $bad_attvals     See description for sanitize
 * @param  $add_attr_to_tag See description for sanitize
 * @return                  Array with modified attributes.
 */
function fixatts($tagname,
                 $attary,
                 $rm_attnames,
                 $bad_attvals,
                 $add_attr_to_tag
                 ){
    $me = 'fixatts';
    spew("$me: Fixing attributes\n");
    if (!is_array($attary)){
        /**
         * getnxtag() reports a tag without attributes as false.
         */
        $attary = array();
    }
    /**
     * foreach walks a snapshot of $attary, so unsetting or overwriting
     * entries of $attary inside the loop is safe.
     */
    foreach ($attary as $attname => $attvalue){
        /**
         * See if this attribute should be removed.
         */
        $removed = false;
        foreach ($rm_attnames as $matchtag=>$matchattrs){
            if (!preg_match($matchtag, $tagname)){
                continue;
            }
            foreach ($matchattrs as $matchattr){
                if (preg_match($matchattr, $attname)){
                    spew("$me: Attribute '$attname' defined as bad.\n");
                    spew("$me: Removing.\n");
                    unset($attary[$attname]);
                    $removed = true;
                    break 2;
                }
            }
        }
        if ($removed){
            /**
             * Move on to the next attribute. Without this the value
             * checks below could put the removed attribute back.
             */
            continue;
        }
        /**
         * Remove any backslashes, entities, or extraneous whitespace.
         */
        defang($attvalue);
        unspace($attvalue);

        /**
         * Now run the value checks: for each tag pattern matching this
         * tag, and each attribute pattern matching this attribute, apply
         * the rule's replacements to the cleaned-up value.
         */
        foreach ($bad_attvals as $matchtag=>$matchattrs){
            if (!preg_match($matchtag, $tagname)){
                continue;
            }
            foreach ($matchattrs as $matchattr=>$valary){
                if (!preg_match($matchattr, $attname)){
                    continue;
                }
                /**
                 * There are two arrays in valary.
                 * First is matches.
                 * Second one is replacements
                 */
                list($valmatch, $valrepl) = $valary;
                $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                if ($newvalue !== $attvalue){
                    spew("$me: attvalue is now $newvalue\n");
                    $attary[$attname] = $newvalue;
                }
            }
        }
    }
    /**
     * See if we need to append any attributes to this tag.
     */
    foreach ($add_attr_to_tag as $matchtag=>$addattary){
        if (preg_match($matchtag, $tagname)){
            $attary = array_merge($attary, $addattary);
            spew("$me: Added attributes to this tag\n");
        }
    }
    return $attary;
}
672
673/**
674 * This is the main function and the one you should actually be calling.
675 * There are several variables you should be aware of and which need
676 * special description.
677 *
678 * $tag_list
679 * ----------
680 * This is a simple one-dimensional array of strings, except for the
681 * very first one. The first member should be either false or true.
682 * In case it's FALSE, the following list will be considered a list of
683 * tags that should be explicitly REMOVED from the body, and all
684 * others that did not match the list will be allowed.  If the first
685 * member is TRUE, then the list is the list of tags that should be
686 * explicitly ALLOWED -- any tag not matching this list will be
687 * discarded.
688 *
689 * Examples:
690 * $tag_list = Array(
691 *                   false,   
692 *                   "blink",
693 *                   "link",
694 *                   "object",
695 *                   "meta",
696 *                   "marquee",
697 *                   "html"
698 *                          );
699 *
700 * This will allow all tags except for blink, link, object, meta, marquee,
701 * and html.
702 *
703 * $tag_list = Array(
704 *                   true,
705 *                   "b",
706 *                   "a",
707 *                   "i",
708 *                   "img",
709 *                   "strong",
710 *                   "em",
711 *                   "p"
712 *                  );
713 *
714 * This will remove all tags from the body except b, a, i, img, strong, em and
715 * p.
716 *
717 * $rm_tags_with_content
718 * ---------------------
719 * This is a simple one-dimensional array of strings, which specifies the
720 * tags to be removed with any and all content between the beginning and
721 * the end of the tag.
722 * Example:
723 * $rm_tags_with_content = Array(
724 *                               "script",
725 *                               "style",
726 *                               "applet",
727 *                               "embed"
728 *                              );
729 *
730 * This will remove the following structure:
731 * <script>
732 *  window.alert("Isn't cross-site-scripting fun?!");
733 * </script>
734 *
735 * $self_closing_tags
736 * ------------------
737 * This is a simple one-dimensional array of strings, which specifies which
738 * tags contain no content and should not be forcefully closed if this option
739 * is turned on (see further).
740 * Example:
741 * $self_closing_tags =  Array(
742 *                             "img",
743 *                             "br",
744 *                             "hr",
745 *                             "input"
746 *                            );   
747 *
748 * $force_tag_closing
749 * ------------------
750 * Set it to true to forcefully close any tags opened within the document.
751 * This is good if you want to take care of people who like to screw up
752 * the pages by leaving unclosed tags like <a>, <b>, <i>, etc.
753 *
754 * $rm_attnames
755 * -------------
756 * Now we come to parameters that are more obscure. This parameter is
757 * a nested array which is used to specify which attributes should be
758 * removed. It goes like so:
759 *
760 * $rm_attnames = Array(
761 *   "PCRE regex to match tag name" =>
762 *     Array(
763 *           "PCRE regex to match attribute name"
764 *           )
765 *   );
766 *
767 * Example:
768 * $rm_attnames = Array(
769 *   "|.*|" =>
770 *     Array(
771 *           "|target|i",
772 *           "|^on.*|i" 
773 *          )
774 *   );
775 *
776 * This will match all tags (.*), and specify that all attributes
777 * named "target" and starting with "on" should be removed. This will take
778 * care of the following problem:
779 * <em onmouseover="window.alert('muahahahaha')">
780 * The "onmouseover" will be removed.
781 *
782 * $bad_attvals
783 * ------------
784 * This is where it gets ugly. This is a nested array with many levels.
785 * It goes like so:
786 *
787 * $bad_attvals = Array(
788 *   "pcre regex to match tag name" =>
789 *     Array(
790 *           "pcre regex to match attribute name" =>
791 *             Array(
792 *                   "pcre regex to match attribute value"
793 *                  )
794 *             Array(
795 *                   "pcre regex replace a match from above with"
796 *                  )
797 *          )
798 *   );
799 *
800 * An extensive example:
801 *
802 * $bad_attvals = Array(
803 *   "|.*|" =>
804 *      Array(
805 *            "/^src|background|href|action/i" =>
806 *                Array(
807 *                      Array(
808 *                            "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si"
809 *                            ),
810 *                      Array(
811 *                            "\\1http://veryfunny.com/\\2"
812 *                            )
813 *                      ),
814 *            "/^style/i" =>
815 *                Array(
816 *                      Array(
817 *                            "/expression/si",
818 *                            "/url\(([\'\"])\s*https*:.*([\'\"])\)/si",
819 *                            "/url\(([\'\"])\s*\S+script:.*([\'\"])\)/si"
820 *                           ),
821 *                      Array(
822 *                            "idiocy",
823 *                            "url(\\1http://veryfunny.com/\\2)",
824 *                            "url(\\1http://veryfunny.com/\\2)"
825 *                           )
826 *                      )
827 *            )
828 *  );
829 *
830 * This will take care of nearly all known cross-site scripting exploits,
831 * plus some (see my filter sample at
832 * http://www.mricon.com/html/phpfilter.html for a working version).
833 *
834 * $add_attr_to_tag
835 * ----------------
836 * This is a useful little feature which lets you add attributes to
837 * certain tags. It is a nested array as well, but not at all like
838 * the previous one. It goes like so:
839 *
840 * $add_attr_to_tag = Array(
841 *   "PCRE regex to match tag name" =>
842 *     Array(
843 *           "attribute name"=>'"attribute value"'
844 *          )
845 *   );
846 *
847 * Note: don't forget quotes around attribute value.
848 *
849 * Example:
850 *
851 * $add_attr_to_tag = Array(
852 *   "/^a$/si" =>
853 *     Array(
854 *           'target'=>'"_new"'
855 *          )
856 *   );
857 *
858 * This will change all <a> tags and add target="_new" to them so all links
859 * open in a new window.
860 *
861 *
862 *
863 * @param $body                 the string with HTML you wish to filter
864 * @param $tag_list             see description above
865 * @param $rm_tags_with_content see description above
866 * @param $self_closing_tags    see description above
867 * @param $force_tag_closing    see description above
868 * @param $rm_attnames          see description above
869 * @param $bad_attvals          see description above
870 * @param $add_attr_to_tag      see description above
871 * @return                      sanitized html safe to show on your pages.
872 */
function sanitize($body,
                  $tag_list,
                  $rm_tags_with_content,
                  $self_closing_tags,
                  $force_tag_closing,
                  $rm_attnames,
                  $bad_attvals,
                  $add_attr_to_tag
                  ){
    $me = 'sanitize';
    /**
     * Run tag_list, rm_tags_with_content and self_closing_tags through
     * casenormalize() (defined elsewhere in this file) so that the
     * in_array() lookups below compare like with like.
     */
    @array_walk($tag_list, 'casenormalize');
    @array_walk($rm_tags_with_content, 'casenormalize');
    @array_walk($self_closing_tags, 'casenormalize');
    /**
     * The first element of tag_list is a flag, not a tag name:
     * false  means remove the listed tags
     * true   means allow only the listed tags
     * It is shifted off so that only tag names remain in $tag_list.
     */
    $rm_tags = array_shift($tag_list);
    $curpos = 0;
    /**
     * Maps tag name => number of times it is currently left open.
     */
    $open_tags = Array();
    //$trusted = "<!-- begin sanitized html -->\n";
    $trusted = "";
    /**
     * Either false, or the name of the tag whose content is currently
     * being thrown away (see $rm_tags_with_content).
     */
    $skip_content = false;
    /**
     * Take care of netscape's stupid javascript entities like
     * &{alert('boo')};
     */
    $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
    spew("$me: invoking the loop\n");
    while (($curtag = getnxtag($body, $curpos)) != FALSE){
        /**
         * $lt and $gt are the offsets in $body where this tag starts
         * and ends; $tagtype is 1 for opening, 2 for closing and 3 for
         * self-closing tags.
         */
        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
        spew("$me: grabbing free-standing content\n");
        $free_content = substr($body, $curpos, $lt - $curpos);
        spew("$me: " . strlen($free_content) . " chars grabbed\n");
        if ($skip_content == false){
            spew("$me: appending free content to trusted.\n");
            $trusted .= $free_content;
        } else {
            spew("$me: Skipping free content.\n");
        }
        if ($tagname != FALSE){
            spew("$me: tagname is '$tagname'\n");
            if ($tagtype == 2){
                spew("$me: This is a closing tag\n");
                if ($skip_content == $tagname){
                    /**
                     * Got to the end of tag we needed to remove.
                     */
                    spew("$me: Finished removing tag with content\n");
                    $tagname = false;
                    $skip_content = false;
                } else {
                    if ($skip_content == false){
                        /**
                         * Only let a closing tag through when a matching
                         * opening tag is still pending.
                         */
                        if (isset($open_tags[$tagname]) &&
                            $open_tags[$tagname] > 0){
                            spew("$me: popping '$tagname' from open_tags\n");
                            $open_tags[$tagname]--;
                        } else {
                            spew("$me: '$tagname' was never opened\n");
                            spew("$me: removing\n");
                            $tagname = false;
                        }
                    } else {
                        spew("$me: Skipping this tag\n");
                    }
                }
            } else {
                /**
                 * $rm_tags_with_content
                 */
                if ($skip_content == false){
                    /**
                     * See if this is a self-closing type and change
                     * tagtype appropriately.
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $self_closing_tags)){
                        spew("$me: Self-closing tag. Changing tagtype.\n");
                        $tagtype = 3;
                    }
                    /**
                     * See if we should skip this tag and any content
                     * inside it.
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $rm_tags_with_content)){
                        spew("$me: removing this tag with content\n");
                        $skip_content = $tagname;
                    } else {
                        if (($rm_tags == false
                             && in_array($tagname, $tag_list)) ||
                            ($rm_tags == true
                             && !in_array($tagname, $tag_list))){
                            spew("$me: Removing this tag.\n");
                            $tagname = false;
                        } else {
                            if ($tagtype == 1){
                                spew("$me: adding '$tagname' to open_tags\n");
                                if (isset($open_tags[$tagname])){
                                    $open_tags[$tagname]++;
                                } else {
                                    $open_tags[$tagname] = 1;
                                }
                            }
                            /**
                             * This is where we run other checks.
                             */
                            if (is_array($attary) && sizeof($attary) > 0){
                                $attary = fixatts($tagname,
                                                  $attary,
                                                  $rm_attnames,
                                                  $bad_attvals,
                                                  $add_attr_to_tag);
                            }
                        }
                    }
                } else {
                    spew("$me: Skipping this tag\n");
                }
            }
            if ($tagname != false && $skip_content == false){
                spew("$me: Appending tag to trusted.\n");
                $trusted .= tagprint($tagname, $attary, $tagtype);
            }
        } else {
            spew("$me: Removing invalid tag\n");
        }
        /**
         * Continue scanning right after the tag we just handled.
         */
        $curpos = $gt + 1;
    }
    /**
     * Whatever follows the last tag is plain content. It is kept only
     * when we are not in the middle of a tag that has to be removed
     * together with its content: if such a tag was never closed, the
     * tail still belongs to it and must be dropped like the rest.
     */
    if ($skip_content == false){
        spew("$me: Appending any leftover content\n");
        $trusted .= substr($body, $curpos, strlen($body) - $curpos);
    } else {
        spew("$me: Skipping leftover content\n");
    }
    if ($force_tag_closing == true){
        /**
         * Emit one closing tag for every opening tag still pending.
         */
        foreach ($open_tags as $tagname=>$opentimes){
            while ($opentimes > 0){
                spew("$me: '$tagname' left open. Closing by force.\n");
                $trusted .= '</' . $tagname . '>';
                $opentimes--;
            }
        }
        $trusted .= "\n";
    }
    //$trusted .= "<!-- end sanitized html -->\n";
    return $trusted;
}
1021?>
Note: See TracBrowser for help on using the repository browser.