source: trunk/expressoMail1_2/inc/htmlfilter.inc @ 2

Revision 2, 35.9 KB checked in by niltonneto, 17 years ago (diff)

Removida todas as tags usadas pelo CVS ($Id, $Source).
Primeira versão no CVS externo.

  • Property svn:eol-style set to native
  • Property svn:executable set to *
Line 
1<?php
2/**
3 * htmlfilter.inc
4 * ---------------
5 * This set of functions allows you to filter html in order to remove
6 * any malicious tags from it. Useful in cases when you need to filter
7 * user input for any cross-site-scripting attempts.
8 *
9 * Copyright (C) 2002-2004 by Duke University
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 
24 * 02110-1301  USA
25 *
26 * @Author  Konstantin Riabitsev <icon@linux.duke.edu>
27 * @Version 1.1 ($Date$)
28 */
29
/**
 * Debug output helper used by every function in this file.
 *
 * The message is echoed only when the global variable $debug holds a
 * truthy value; otherwise the call is a no-op. Set $debug to true
 * before calling sanitize() to trace the filter.
 *
 * Note: even with $debug off, each call still costs a function call
 * and a string interpolation. To strip every debugging call, run:
 *
 * fgrep -v 'spew("' htmlfilter.inc > htmlfilter.inc.new
 *
 * (this works because every spew call sits on a single line).
 *
 * @param  $message  A string with the message to output.
 * @return           void.
 */
function spew($message){
    global $debug;
    if (!$debug){
        /* Debugging is off (or $debug was never defined): stay silent. */
        return;
    }
    echo "{$message}";
}
52
/**
 * Builds the textual representation of a tag from its name, its
 * attributes and its type. Called internally by sanitize.
 *
 * Attribute values are emitted verbatim as name=value, so they must
 * already carry their own quotes (getnxtag stores them that way).
 *
 * @param  $tagname  the name of the tag.
 * @param  $attary   array of attribute name => (already quoted) value;
 *                   anything that is not a non-empty array is ignored.
 * @param  $tagtype  1 = opening tag, 2 = closing tag (attributes are
 *                   not printed), 3 = XHTML-style self-closing tag.
 * @return           a string with the final tag representation.
 */
function tagprint($tagname, $attary, $tagtype){
    $me = 'tagprint';
    if ($tagtype == 2){
        $fulltag = '</' . $tagname . '>';
    } else {
        $fulltag = '<' . $tagname;
        if (is_array($attary) && count($attary) > 0){
            $atts = array();
            /**
             * foreach instead of each(): each() was removed in PHP 8 and
             * depended on the array's internal pointer position.
             */
            foreach ($attary as $attname => $attvalue){
                $atts[] = "$attname=$attvalue";
            }
            $fulltag .= ' ' . implode(' ', $atts);
        }
        if ($tagtype == 3){
            $fulltag .= ' /';
        }
        $fulltag .= '>';
    }
    spew("$me: $fulltag\n");
    return $fulltag;
}
84
/**
 * Callback for array_walk: lowercases the element it is handed.
 * The element is received by reference and overwritten in place.
 *
 * @param  $val the array element, passed by-ref.
 * @return      void; the result is written back through $val.
 */
function casenormalize(&$val){
    $lowered = strtolower($val);
    $val = $lowered;
}
95
/**
 * Skips any run of whitespace that starts at $offset and returns the
 * position of the next non-whitespace character.
 *
 * "Whitespace" is whatever PCRE's \s matches. If the character at
 * $offset is not whitespace (or $offset is at the end of $body),
 * $offset is returned unchanged.
 *
 * @param  $body   the string
 * @param  $offset the offset within the string where we should start
 *                 looking for the next non-whitespace character.
 * @return         the location within the $body where the next
 *                 non-whitespace char is located.
 */
function skipspace($body, $offset){
    $me = 'skipspace';
    $matches = array();
    /**
     * The previous test was sizeof($matches{1}): a curly-brace offset
     * (removed in PHP 8) fed to sizeof() on a string (a TypeError in
     * PHP 8). Match one-or-more whitespace and act only on a real hit.
     */
    if (preg_match('/^\s+/', substr($body, $offset), $matches)){
        $count = strlen($matches[0]);
        spew("$me: skipped $count chars\n");
        $offset += $count;
    }
    return $offset;
}
116
/**
 * Looks for the next occurrence of $needle in $body, starting at
 * $offset. It is a thin wrapper around strpos() that reports "not
 * found" as the length of $body instead of false, so callers can
 * treat the result as a position in every case.
 *
 * @param  $body   The string to look for needle in.
 * @param  $offset Start looking from this position.
 * @param  $needle The character/string to look for.
 * @return         location of the next occurrence of the needle, or
 *                 strlen($body) if needle wasn't found.
 */
function findnxstr($body, $offset, $needle){
    $me = 'findnxstr';
    $bodylen = strlen($body);
    $pos = false;
    /**
     * strpos() refuses an offset past the end of the string (a warning
     * in older PHP, a ValueError in PHP 8). Nothing can be found there
     * anyway, so skip the call and fall through to "not found".
     */
    if ($offset <= $bodylen){
        $pos = strpos($body, $needle, $offset);
    }
    if ($pos === false){
        spew("$me: end of body reached\n");
        return $bodylen;
    }
    spew("$me: '$needle' found at pos $pos\n");
    return $pos;
}
138
/**
 * Tries to match a PCRE-style regexp within the string, starting at
 * $offset, and reports the first position where it matches.
 *
 * The expression is embedded into '%^(.*?)(' . $reg . ')%s', so $reg
 * must not contain an unescaped '%' (the delimiter), and '.' inside
 * it also matches newlines.
 *
 * @param  $body   The string to look for needle in.
 * @param  $offset Start looking from here.
 * @param  $reg    A PCRE-style regex to match.
 * @return         Returns a false if no matches found, or an array
 *                 with the following members:
 *                 - integer with the location of the match within $body
 *                 - string with whatever content between offset and the match
 *                 - string with whatever it is we matched
 */
function findnxreg($body, $offset, $reg){
    $me = 'findnxreg';
    $matches = array();
    $preg_rule = '%^(.*?)(' . $reg . ')%s';
    if (!preg_match($preg_rule, substr($body, $offset), $matches)){
        spew("$me: No matches found.\n");
        return false;
    }
    /**
     * $matches[1] is the text skipped before the hit, so its length is
     * the distance from $offset to the match. (Square brackets: the
     * curly-brace offset syntax used here before is gone in PHP 8.)
     */
    $matchpos = $offset + strlen($matches[1]);
    spew("$me: '$reg' found at pos $matchpos matching '" . $matches[2] . "'\n");
    return array($matchpos, $matches[1], $matches[2]);
}
169
/**
 * Looks for the next tag in $body at or after $offset and parses its
 * name and attributes.
 *
 * Attribute values are stored together with their quotes: values that
 * were single- or double-quoted keep their quoting, unquoted values
 * are wrapped in double quotes (with '"' turned into &quot;), and
 * attributes without a value are stored as "yes".
 *
 * Comments, SGML declarations, unterminated tags and tags containing
 * unexpected characters are reported as invalid, so that the caller
 * can strip the span between the two returned positions.
 *
 * @param  $body   String where to look for the next tag.
 * @param  $offset Start looking from here.
 * @return         false if no more tags exist in the body, or
 *                 an array with the following members:
 *                 - string with the name of the tag (lowercased)
 *                 - array with attributes and their values, or false
 *                   when the tag has no attributes
 *                 - integer with tag type (1, 2, or 3)
 *                 - integer where the tag starts (starting "<")
 *                 - integer where the tag ends (ending ">")
 *                 first three members will be false, if the tag is invalid.
 */
function getnxtag($body, $offset){
    $me = 'getnxtag';
    if ($offset > strlen($body)){
        spew("$me: Past the end of body\n");
        return false;
    }
    $lt = findnxstr($body, $offset, '<');
    if ($lt == strlen($body)){
        spew("$me: No more tags found!\n");
        return false;
    }
    /**
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    spew("$me: Found '<' at pos $lt\n");
    $pos = skipspace($body, $lt + 1);
    if ($pos >= strlen($body)){
        spew("$me: End of body reached.\n");
        return array(false, false, false, $lt, strlen($body));
    }
    /**
     * There are 3 kinds of tags:
     * 1. Opening tag, e.g.:
     *    <a href="blah">
     * 2. Closing tag, e.g.:
     *    </a>
     * 3. XHTML-style content-less tag, e.g.:
     *    <img src="blah"/>
     */
    $tagtype = false;
    switch (substr($body, $pos, 1)){
    case '/':
        spew("$me: This is a closing tag (type 2)\n");
        $tagtype = 2;
        $pos++;
        break;
    case '!':
        /**
         * A comment or an SGML declaration. Both are reported as
         * invalid so that the caller strips them.
         */
        if (substr($body, $pos+1, 2) == '--'){
            spew("$me: A comment found. Stripping.\n");
            $gt = strpos($body, '-->', $pos);
            if ($gt === false){
                $gt = strlen($body);
            } else {
                /* Point at the '>' that ends the '-->' marker. */
                $gt += 2;
            }
            return array(false, false, false, $lt, $gt);
        } else {
            spew("$me: An SGML declaration found. Stripping.\n");
            $gt = findnxstr($body, $pos, '>');
            return array(false, false, false, $lt, $gt);
        }
        break;
    default:
        /**
         * Assume tagtype 1 for now. If it's type 3, we'll switch values
         * later.
         */
        $tagtype = 1;
        break;
    }

    $tag_start = $pos;
    $tagname = '';
    /**
     * Look for next [\W-_], which will indicate the end of the tag name.
     */
    $regary = findnxreg($body, $pos, '[^\w\-_]');
    if ($regary == false){
        spew("$me: End of body reached while analyzing tag name\n");
        return array(false, false, false, $lt, strlen($body));
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = strtolower($tagname);

    /**
     * $match can be either of these:
     * '>'  indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/'  indicating that this is type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    switch ($match){
    case '/':
        /**
         * This is an xhtml-style tag with a closing / at the
         * end, like so: <img src="blah"/>. Check if it's followed
         * by the closing bracket. If not, then this tag is invalid
         */
        if (substr($body, $pos, 2) == '/>'){
            spew("$me: XHTML-style tag found.\n");
            $pos++;
            spew("$me: Setting tagtype to 3\n");
            $tagtype = 3;
        } else {
            spew("$me: Found invalid character '/'.\n");
            $gt = findnxstr($body, $pos, '>');
            spew("$me: Tag is invalid. Returning.\n");
            $retary = array(false, false, false, $lt, $gt);
            return $retary;
        }
        /* Intentional fall-through: $pos now points at the '>'. */
    case '>':
        spew("$me: End of tag found at $pos\n");
        spew("$me: Tagname is '$tagname'\n");
        spew("$me: This tag has no attributes\n");
        return array($tagname, false, $tagtype, $lt, $pos);
        break;
    default:
        /**
         * Check if it's whitespace
         */
        if (preg_match('/\s/', $match)){
            spew("$me: Tagname is '$tagname'\n");
        } else {
            /**
             * This is an invalid tag! Look for the next closing ">".
             */
            spew("$me: Invalid characters found in tag name: $match\n");
            $gt = findnxstr($body, $lt, '>');
            return array(false, false, false, $lt, $gt);
        }
    }

    /**
     * At this point we're here:
     * <tagname  attribute='blah'>
     * \-------^
     *
     * At this point we loop in order to find all attributes.
     */
    $attname = '';
    $atttype = false;
    $attary = array();

    while ($pos <= strlen($body)){
        $pos = skipspace($body, $pos);
        if ($pos == strlen($body)){
            /**
             * Non-closed tag.
             */
            spew("$me: End of body reached before end of tag. Discarding.\n");
            return array(false, false, false, $lt, $pos);
        }
        /**
         * See if we arrived at a ">" or "/>", which means that we reached
         * the end of the tag.
         */
        $matches = array();
        preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches);
        if (isset($matches[0]) && $matches[0]){
            /**
             * Yep. So we did.
             */
            spew("$me: Arrived at the end of the tag.\n");
            $pos += strlen($matches[1]);
            if ($matches[2] == '/>'){
                $tagtype = 3;
                /* Step over the '/' so that $pos points at the '>'. */
                $pos++;
            }
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /**
         * There are several types of attributes, with optional
         * [:space:] between members.
         * Type 1:
         *   attrname[:space:]=[:space:]'CDATA'
         * Type 2:
         *   attrname[:space:]=[:space:]"CDATA"
         * Type 3:
         *   attr[:space:]=[:space:]CDATA
         * Type 4:
         *   attrname
         *
         * We leave types 1 and 2 the same, type 3 we check for
         * '"' and convert to "&quot" if needed, then wrap in
         * double quotes. Type 4 we convert into:
         * attrname="yes".
         */
        $regary = findnxreg($body, $pos, '[^\w\-_]');
        if ($regary == false){
            /**
             * Looks like body ended before the end of tag.
             */
            spew("$me: End of body found before end of tag.\n");
            spew("$me: Invalid, returning\n");
            return array(false, false, false, $lt, strlen($body));
        }
        list($pos, $attname, $match) = $regary;
        $attname = strtolower($attname);
        spew("$me: Attribute '$attname' found\n");
        /**
         * We arrived at the end of attribute name. Several things possible
         * here:
         * '>'  means the end of the tag and this is attribute type 4
         * '/'  if followed by '>' means the same thing as above
         * '\s' means a lot of things -- look what it's followed by.
         *      anything else means the attribute is invalid.
         */
        switch($match){
        case '/':
            /**
             * This is an xhtml-style tag with a closing / at the
             * end, like so: <img src="blah"/>. Check if it's followed
             * by the closing bracket. If not, then this tag is invalid
             */
            if (substr($body, $pos, 2) == '/>'){
                spew("$me: This is an xhtml-style tag.\n");
                $pos++;
                spew("$me: Setting tagtype to 3\n");
                $tagtype = 3;
            } else {
                spew("$me: Found invalid character '/'.\n");
                $gt = findnxstr($body, $pos, '>');
                spew("$me: Tag is invalid. Returning.\n");
                $retary = array(false, false, false, $lt, $gt);
                return $retary;
            }
            /* Intentional fall-through: $pos now points at the '>'. */
        case '>':
            spew("$me: found type 4 attribute.\n");
            spew("$me: Additionally, end of tag found at $pos\n");
            spew("$me: Attname is '$attname'\n");
            spew("$me: Setting attvalue to 'yes'\n");
            $attary[$attname] = '"yes"';
            return array($tagname, $attary, $tagtype, $lt, $pos);
            break;
        default:
            /**
             * Skip whitespace and see what we arrive at.
             */
            $pos = skipspace($body, $pos);
            $char = substr($body, $pos, 1);
            /**
             * Two things are valid here:
             * '=' means this is attribute type 1 2 or 3.
             * \w means this was attribute type 4.
             * anything else we ignore and re-loop. End of tag and
             * invalid stuff will be caught by our checks at the beginning
             * of the loop.
             */
            if ($char == '='){
                spew("$me: Attribute type 1, 2, or 3 found.\n");
                $pos++;
                $pos = skipspace($body, $pos);
                /**
                 * Here are 3 possibilities:
                 * "'"  attribute type 1
                 * '"'  attribute type 2
                 * everything else is the content of tag type 3
                 */
                $quot = substr($body, $pos, 1);
                if ($quot == '\''){
                    spew("$me: In fact, this is attribute type 1\n");
                    spew("$me: looking for closing quote\n");
                    $regary = findnxreg($body, $pos+1, '\'');
                    if ($regary == false){
                        spew("$me: end of body reached before end of val\n");
                        spew("$me: Returning\n");
                        return array(false, false, false, $lt, strlen($body));
                    }
                    list($pos, $attval, $match) = $regary;
                    spew("$me: Attvalue is '$attval'\n");
                    /* Step over the closing quote. */
                    $pos++;
                    $attary[$attname] = '\'' . $attval . '\'';
                } else if ($quot == '"'){
                    spew("$me: In fact, this is attribute type 2\n");
                    spew("$me: looking for closing quote\n");
                    $regary = findnxreg($body, $pos+1, '\"');
                    if ($regary == false){
                        spew("$me: end of body reached before end of val\n");
                        spew("$me: Returning\n");
                        return array(false, false, false, $lt, strlen($body));
                    }
                    list($pos, $attval, $match) = $regary;
                    spew("$me: Attvalue is \"$attval\"\n");
                    /* Step over the closing quote. */
                    $pos++;
                    $attary[$attname] = '"' . $attval . '"';
                } else {
                    spew("$me: This looks like attribute type 3\n");
                    /**
                     * These are hateful. Look for \s, or >.
                     */
                    spew("$me: Looking for end of attval\n");
                    $regary = findnxreg($body, $pos, '[\s>]');
                    if ($regary == false){
                        spew("$me: end of body reached before end of val\n");
                        spew("$me: Returning\n");
                        return array(false, false, false, $lt, strlen($body));
                    }
                    list($pos, $attval, $match) = $regary;
                    /**
                     * If it's ">" it will be caught at the top.
                     */
                    spew("$me: translating '\"' into &quot;\n");
                    $attval = preg_replace('/\"/s', '&quot;', $attval);
                    spew("$me: wrapping in quotes\n");
                    $attary[$attname] = '"' . $attval . '"';
                }
            } else if (preg_match('|[\w/>]|', $char)) {
                /**
                 * That was attribute type 4.
                 */
                spew("$me: attribute type 4 found.\n");
                spew("$me: Setting value to 'yes'\n");
                $attary[$attname] = '"yes"';
            } else {
                /**
                 * An illegal character. Find next '>' and return.
                 */
                spew("$me: illegal character '$char' found.\n");
                spew("$me: returning\n");
                $gt = findnxstr($body, $pos, '>');
                return array(false, false, false, $lt, $gt);
            }
        }
    }
    /**
     * The fact that we got here indicates that the tag end was never
     * found. Return invalid tag indication so it gets stripped.
     */
    spew("$me: No tag end found\n");
    return array(false, false, false, $lt, strlen($body));
}
512
/**
 * Translates numeric entities into literal values so they can be
 * checked.
 *
 * Every match of $regex in $attvalue is replaced by the single byte
 * whose code is the number captured in the first sub-pattern. As with
 * chr(), codes above 255 wrap around modulo 256.
 *
 * @param $attvalue the by-ref value to check; rewritten in place when
 *                  there are matches.
 * @param $regex    the regular expression to check against. Its first
 *                  capturing group must hold the digits of the number.
 * @param $hex      whether the entities are hexadecimal.
 * @return          True or False depending on whether there were matches.
 */
function deent(&$attvalue, $regex, $hex=false){
    $me = 'deent';
    spew("$me: matching '$regex' against: $attvalue\n");
    $matches = array();
    preg_match_all($regex, $attvalue, $matches);
    if (!is_array($matches) || !isset($matches[0]) || count($matches[0]) == 0){
        spew("$me: no matches! Returning false.\n");
        return false;
    }
    $nmatches = count($matches[0]);
    spew("$me: found " . $nmatches . " matches\n");
    $repl = array();
    for ($i = 0; $i < $nmatches; $i++){
        $numval = $matches[1][$i];
        spew("$me: numval is $numval\n");
        if ($hex){
            $numval = hexdec($numval);
            spew("$me: hex! Numval is now $numval\n");
        }
        /**
         * Reduce to 0..255 before calling chr(). An over-long run of
         * digits does not fit an integer, and handing chr() such a
         * value raises a TypeError on PHP 8 -- which would let hostile
         * input abort the whole filter.
         */
        $repl[$matches[0][$i]] = chr((int) fmod((float) $numval, 256));
    }
    $attvalue = strtr($attvalue, $repl);
    spew("$me: attvalue after translation: $attvalue\n");
    return true;
}
546
/**
 * Decodes entity- and backslash-encoded characters inside an
 * attribute value so that the later checks see the literal text.
 *
 * Decimal entities, hexadecimal entities and backslash-digit escapes
 * are translated by deent() over and over until a full pass finds
 * nothing more; remaining backslashes are then removed with
 * stripslashes().
 *
 * @param  $attvalue A string to run entity check against.
 * @return           Nothing, modifies a reference value.
 */
function defang(&$attvalue){
    $me = 'defang';
    spew("$me: Checking '$attvalue' for suspicious content\n");
    /**
     * Without an ampersand or a backslash there is nothing to decode.
     */
    $has_ampersand = (strpos($attvalue, '&') !== false);
    $has_backslash = (strpos($attvalue, '\\') !== false);
    if (!$has_ampersand && !$has_backslash){
        spew("$me: no suspicious content found, returning.\n");
        return;
    }
    do {
        /**
         * The || chain short-circuits: once one decoder reports a hit
         * the remaining ones wait for the next pass of the loop.
         */
        $m = deent($attvalue, '/\&#0*(\d+);*/s')
            || deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
            || deent($attvalue, '/\\\\(\d+)/s', true);
        spew("$me: m=$m\n");
    } while ($m == true);
    $attvalue = stripslashes($attvalue);
    spew("$me: translated into: $attvalue\n");
}
577
/**
 * Removes every tab, carriage return, newline, NUL byte and space
 * from an attribute value. Some browsers treat "java[tab]script" the
 * same as "javascript", so the checks have to look at the value with
 * all of that stripped out.
 *
 * @param  attvalue  The attribute value before extraneous spaces removed.
 * @return attvalue  Nothing, modifies a reference value.
 */
function unspace(&$attvalue){
    $me = 'unspace';
    $junk = "\t\r\n\0 ";
    /**
     * strcspn() stops at the first unwanted character, so a result
     * shorter than the whole value means there is something to strip.
     */
    $clean_prefix = strcspn($attvalue, $junk);
    if ($clean_prefix != strlen($attvalue)){
        spew("$me: Killing whitespace.\n");
        $attvalue = str_replace(Array("\t", "\r", "\n", "\0", " "), '', $attvalue);
    }
    spew("$me: after unspace: $attvalue\n");
}
595
/**
 * Runs the attribute checks for a single tag: drops attributes whose
 * names are listed in $rm_attnames, rewrites values that match
 * $bad_attvals, and finally appends the attributes from
 * $add_attr_to_tag.
 *
 * Values are decoded (defang) and stripped of whitespace (unspace)
 * only for the purpose of matching; an attribute keeps its original
 * value unless one of the $bad_attvals replacements actually changes
 * the decoded form, in which case the replaced text is stored.
 *
 * @param  $tagname         String with the name of the tag.
 * @param  $attary          Array with all tag attributes.
 * @param  $rm_attnames     See description for sanitize
 * @param  $bad_attvals     See description for sanitize
 * @param  $add_attr_to_tag See description for sanitize
 * @return                  Array with modified attributes.
 */
function fixatts($tagname,
                 $attary,
                 $rm_attnames,
                 $bad_attvals,
                 $add_attr_to_tag
                 ){
    $me = 'fixatts';
    spew("$me: Fixing attributes\n");
    if (!is_array($attary)){
        /* A tag without attributes may arrive as false. */
        $attary = array();
    }
    /**
     * foreach replaces the each() loop (each() is gone in PHP 8). A
     * by-value foreach walks its own copy, so entries of $attary can be
     * unset or overwritten below without disturbing the iteration.
     */
    foreach ($attary as $attname => $attvalue){
        /**
         * See if this attribute should be removed.
         */
        foreach ($rm_attnames as $matchtag => $matchattrs){
            if (preg_match($matchtag, $tagname)){
                foreach ($matchattrs as $matchattr){
                    if (preg_match($matchattr, $attname)){
                        spew("$me: Attribute '$attname' defined as bad.\n");
                        spew("$me: Removing.\n");
                        unset($attary[$attname]);
                        /**
                         * Move on to the next attribute. A plain
                         * "continue" only left the innermost loop, so a
                         * matching $bad_attvals rule below could put
                         * the removed attribute back.
                         */
                        continue 3;
                    }
                }
            }
        }
        /**
         * Remove any backslashes, entities, or extraneous whitespace.
         */
        defang($attvalue);
        unspace($attvalue);

        /**
         * Now run the value checks: for every tag pattern that matches
         * this tag, and every attribute pattern that matches this
         * attribute, apply the list of replacements to the value.
         */
        foreach ($bad_attvals as $matchtag => $matchattrs){
            if (preg_match($matchtag, $tagname)){
                foreach ($matchattrs as $matchattr => $valary){
                    if (preg_match($matchattr, $attname)){
                        /**
                         * There are two arrays in valary.
                         * First is matches.
                         * Second one is replacements
                         */
                        list($valmatch, $valrepl) = $valary;
                        $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                        if ($newvalue !== $attvalue){
                            spew("$me: attvalue is now $newvalue\n");
                            $attary[$attname] = $newvalue;
                        }
                    }
                }
            }
        }
    }
    /**
     * See if we need to append any attributes to this tag.
     */
    foreach ($add_attr_to_tag as $matchtag => $addattary){
        if (preg_match($matchtag, $tagname)){
            $attary = array_merge($attary, $addattary);
            spew("$me: Added attributes to this tag\n");
        }
    }
    return $attary;
}
673
674/**
675 * This is the main function and the one you should actually be calling.
676 * There are several variables you should be aware of and which need
677 * special description.
678 *
679 * $tag_list
680 * ----------
681 * This is a simple one-dimensional array of strings, except for the
682 * very first one. The first member should be either false or true.
683 * In case it's FALSE, the following list will be considered a list of
684 * tags that should be explicitly REMOVED from the body, and all
685 * others that did not match the list will be allowed.  If the first
686 * member is TRUE, then the list is the list of tags that should be
687 * explicitly ALLOWED -- any tag not matching this list will be
688 * discarded.
689 *
690 * Examples:
691 * $tag_list = Array(
692 *                   false,   
693 *                   "blink",
694 *                   "link",
695 *                   "object",
696 *                   "meta",
697 *                   "marquee",
698 *                   "html"
699 *                          );
700 *
701 * This will allow all tags except for blink, link, object, meta, marquee,
702 * and html.
703 *
704 * $tag_list = Array(
705 *                   true,
706 *                   "b",
707 *                   "a",
708 *                   "i",
709 *                   "img",
710 *                   "strong",
711 *                   "em",
712 *                   "p"
713 *                  );
714 *
715 * This will remove all tags from the body except b, a, i, img, strong, em and
716 * p.
717 *
718 * $rm_tags_with_content
719 * ---------------------
720 * This is a simple one-dimensional array of strings, which specifies the
721 * tags to be removed with any and all content between the beginning and
722 * the end of the tag.
723 * Example:
724 * $rm_tags_with_content = Array(
725 *                               "script",
726 *                               "style",
727 *                               "applet",
728 *                               "embed"
729 *                              );
730 *
731 * This will remove the following structure:
732 * <script>
733 *  window.alert("Isn't cross-site-scripting fun?!");
734 * </script>
735 *
736 * $self_closing_tags
737 * ------------------
738 * This is a simple one-dimensional array of strings, which specifies which
739 * tags contain no content and should not be forcefully closed if this option
740 * is turned on (see further).
741 * Example:
742 * $self_closing_tags =  Array(
743 *                             "img",
744 *                             "br",
745 *                             "hr",
746 *                             "input"
747 *                            );   
748 *
749 * $force_tag_closing
750 * ------------------
751 * Set it to true to forcefully close any tags opened within the document.
752 * This is good if you want to take care of people who like to screw up
753 * the pages by leaving unclosed tags like <a>, <b>, <i>, etc.
754 *
755 * $rm_attnames
756 * -------------
757 * Now we come to parameters that are more obscure. This parameter is
758 * a nested array which is used to specify which attributes should be
759 * removed. It goes like so:
760 *
761 * $rm_attnames = Array(
762 *   "PCRE regex to match tag name" =>
763 *     Array(
764 *           "PCRE regex to match attribute name"
765 *           )
766 *   );
767 *
768 * Example:
769 * $rm_attnames = Array(
770 *   "|.*|" =>
771 *     Array(
772 *           "|target|i",
773 *           "|^on.*|i" 
774 *          )
775 *   );
776 *
777 * This will match all tags (.*), and specify that all attributes
778 * named "target" and starting with "on" should be removed. This will take
779 * care of the following problem:
780 * <em onmouseover="window.alert('muahahahaha')">
781 * The "onmouseover" will be removed.
782 *
783 * $bad_attvals
784 * ------------
785 * This is where it gets ugly. This is a nested array with many levels.
786 * It goes like so:
787 *
788 * $bad_attvals = Array(
789 *   "pcre regex to match tag name" =>
790 *     Array(
791 *           "pcre regex to match attribute name" =>
792 *             Array(
793 *                   "pcre regex to match attribute value"
794 *                  )
795 *             Array(
796 *                   "pcre regex replace a match from above with"
797 *                  )
798 *          )
799 *   );
800 *
801 * An extensive example:
802 *
803 * $bad_attvals = Array(
804 *   "|.*|" =>
805 *      Array(
806 *            "/^src|background|href|action/i" =>
807 *                Array(
808 *                      Array(
809 *                            "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si"
810 *                            ),
811 *                      Array(
812 *                            "\\1http://veryfunny.com/\\2"
813 *                            )
814 *                      ),
815 *            "/^style/i" =>
816 *                Array(
817 *                      Array(
818 *                            "/expression/si",
819 *                            "/url\(([\'\"])\s*https*:.*([\'\"])\)/si",
820 *                            "/url\(([\'\"])\s*\S+script:.*([\'\"])\)/si"
821 *                           ),
822 *                      Array(
823 *                            "idiocy",
824 *                            "url(\\1http://veryfunny.com/\\2)",
825 *                            "url(\\1http://veryfynny.com/\\2)"
826 *                           )
827 *                      )
828 *            )
829 *  );
830 *
831 * This will take care of nearly all known cross-site scripting exploits,
832 * plus some (see my filter sample at
833 * http://www.mricon.com/html/phpfilter.html for a working version).
834 *
835 * $add_attr_to_tag
836 * ----------------
837 * This is a useful little feature which lets you add attributes to
838 * certain tags. It is a nested array as well, but not at all like
839 * the previous one. It goes like so:
840 *
841 * $add_attr_to_tag = Array(
842 *   "PCRE regex to match tag name" =>
843 *     Array(
844 *           "attribute name"=>'"attribute value"'
845 *          )
846 *   );
847 *
848 * Note: don't forget quotes around attribute value.
849 *
850 * Example:
851 *
852 * $add_attr_to_tag = Array(
853 *   "/^a$/si" =>
854 *     Array(
855 *           'target'=>'"_new"'
856 *          )
857 *   );
858 *
859 * This will change all <a> tags and add target="_new" to them so all links
860 * open in a new window.
861 *
862 *
863 *
864 * @param $body                 the string with HTML you wish to filter
865 * @param $tag_list             see description above
866 * @param $rm_tags_with_content see description above
867 * @param $self_closing_tags    see description above
868 * @param $force_tag_closing    see description above
869 * @param $rm_attnames          see description above
870 * @param $bad_attvals          see description above
871 * @param $add_attr_to_tag      see description above
872 * @return                      sanitized html safe to show on your pages.
873 */
function sanitize($body,
                  $tag_list,
                  $rm_tags_with_content,
                  $self_closing_tags,
                  $force_tag_closing,
                  $rm_attnames,
                  $bad_attvals,
                  $add_attr_to_tag
                  ){
    $me = 'sanitize';
    /**
     * Run casenormalize() over every entry of tag_list,
     * rm_tags_with_content and self_closing_tags. Warnings are
     * suppressed, as this code has always done.
     */
    @array_walk($tag_list, 'casenormalize');
    @array_walk($rm_tags_with_content, 'casenormalize');
    @array_walk($self_closing_tags, 'casenormalize');
    /**
     * The first element of tag_list tells whether the remaining
     * entries are tags to remove or tags to allow.
     * false  means remove these tags
     * true   means allow these tags
     */
    $rm_tags = array_shift($tag_list);
    $curpos = 0;
    /**
     * Per tag name: how many opening tags were emitted and not yet
     * matched by a closing tag.
     */
    $open_tags = Array();
    //$trusted = "<!-- begin sanitized html -->\n";
    $trusted = "";
    /**
     * false while output is being kept; otherwise the name of the
     * rm_tags_with_content tag whose content is being dropped.
     */
    $skip_content = false;
    /**
     * Take care of netscape's stupid javascript entities like
     * &{alert('boo')};
     */
    $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
    spew("$me: invoking the loop\n");
    while (($curtag = getnxtag($body, $curpos)) != FALSE){
        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
        /**
         * Everything between the previous tag and this one is plain
         * content: keep it unless we are inside a removed tag.
         */
        spew("$me: grabbing free-standing content\n");
        $free_content = substr($body, $curpos, $lt - $curpos);
        spew("$me: " . strlen($free_content) . " chars grabbed\n");
        if ($skip_content == false){
            spew("$me: appending free content to trusted.\n");
            $trusted .= $free_content;
        } else {
            spew("$me: Skipping free content.\n");
        }
        if ($tagname != FALSE){
            spew("$me: tagname is '$tagname'\n");
            if ($tagtype == 2){
                /**
                 * tagtype 2 is a closing tag.
                 */
                spew("$me: This is a closing tag\n");
                if ($skip_content == $tagname){
                    /**
                     * Got to the end of tag we needed to remove.
                     */
                    spew("$me: Finished removing tag with content\n");
                    $tagname = false;
                    $skip_content = false;
                } else {
                    if ($skip_content == false){
                        /**
                         * Only let a closing tag through when a matching
                         * opening tag was emitted earlier.
                         */
                        if (isset($open_tags[$tagname]) &&
                            $open_tags[$tagname] > 0){
                            spew("$me: popping '$tagname' from open_tags\n");
                            $open_tags[$tagname]--;
                        } else {
                            spew("$me: '$tagname' was never opened\n");
                            spew("$me: removing\n");
                            $tagname = false;
                        }
                    } else {
                        spew("$me: Skipping this tag\n");
                    }
                }
            } else {
                /**
                 * $rm_tags_with_content
                 */
                if ($skip_content == false){
                    /**
                     * See if this is a self-closing type and change
                     * tagtype appropriately (1 is treated as an opening
                     * tag below, 3 as a self-closing one).
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $self_closing_tags)){
                        spew("$me: Self-closing tag. Changing tagtype.\n");
                        $tagtype = 3;
                    }
                    /**
                     * See if we should skip this tag and any content
                     * inside it.
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $rm_tags_with_content)){
                        spew("$me: removing this tag with content\n");
                        $skip_content = $tagname;
                    } else {
                        /**
                         * Drop the tag when it is on the remove list, or
                         * when it is missing from the allow list.
                         */
                        if (($rm_tags == false
                             && in_array($tagname, $tag_list)) ||
                            ($rm_tags == true
                             && !in_array($tagname, $tag_list))){
                            spew("$me: Removing this tag.\n");
                            $tagname = false;
                        } else {
                            if ($tagtype == 1){
                                spew("$me: adding '$tagname' to open_tags\n");
                                if (isset($open_tags[$tagname])){
                                    $open_tags[$tagname]++;
                                } else {
                                    $open_tags[$tagname] = 1;
                                }
                            }
                            /**
                             * This is where we run other checks.
                             */
                            if (is_array($attary) && sizeof($attary) > 0){
                                $attary = fixatts($tagname,
                                                  $attary,
                                                  $rm_attnames,
                                                  $bad_attvals,
                                                  $add_attr_to_tag);
                            }
                        }
                    }
                } else {
                    spew("$me: Skipping this tag\n");
                }
            }
            if ($tagname != false && $skip_content == false){
                spew("$me: Appending tag to trusted.\n");
                $trusted .= tagprint($tagname, $attary, $tagtype);
            }
        } else {
            spew("$me: Removing invalid tag\n");
        }
        /**
         * Continue scanning right after this tag.
         */
        $curpos = $gt + 1;
    }
    /**
     * Keep whatever follows the last tag, unless a tag that is removed
     * together with its content was never closed: in that case the
     * leftover is still content of the removed tag and must not leak
     * into the output.
     */
    if ($skip_content == false){
        spew("$me: Appending any leftover content\n");
        $trusted .= substr($body, $curpos, strlen($body) - $curpos);
    } else {
        spew("$me: Skipping leftover content\n");
    }
    if ($force_tag_closing == true){
        /**
         * Emit one closing tag for every opening tag still unmatched.
         */
        foreach ($open_tags as $tagname=>$opentimes){
            while ($opentimes > 0){
                spew("$me: '$tagname' left open. Closing by force.\n");
                $trusted .= '</' . $tagname . '>';
                $opentimes--;
            }
        }
        $trusted .= "\n";
    }
    //$trusted .= "<!-- end sanitized html -->\n";
    return $trusted;
}
1022?>
Note: See TracBrowser for help on using the repository browser.